Add khanindex nevrax indexation

author: neodarz <neodarz@neodarz.net> 2019-01-19 00:02:57 +0100
committer: neodarz <neodarz@neodarz.net> 2019-01-19 00:02:57 +0100
commit: a3f01580faf6caee4abcc8e682567b87380857b9 (patch)
tree: dbf5e0a9866b3aac7e7bb64f2eab8c005e1c28cd /crawler/nevrax/spiders/scrape.py
parent: 073e919ef198a04da1e5ed28a7dfbc5d9681fc14 (diff)
download: khanindexer-a3f01580faf6caee4abcc8e682567b87380857b9.tar.xz
khanindexer-a3f01580faf6caee4abcc8e682567b87380857b9.zip
1 files changed, 45 insertions, 0 deletions
diff --git a/crawler/nevrax/spiders/scrape.py b/crawler/nevrax/spiders/scrape.py
new file mode 100644
index 0000000..8a7b8ec
--- /dev/null
+++ b/crawler/nevrax/spiders/scrape.py
@@ -0,0 +1,45 @@
+# -*- coding: utf-8 -*-
+import scrapy
+from scrapy.spiders import CrawlSpider, Rule
+from scrapy.linkextractors import LinkExtractor
+from scrapy import Selector
+
+import config
+
+class NevraxSpider(CrawlSpider):
+    name = "nevrax"
+    custom_settings = {
+        'ITEM_PIPELINES': {
+            'crawler.nevrax.pipelines.NevraxPipeline': 0
+        }
+    }
+    allow_domains = [config.NEVRAX_URL]
+    start_urls = [
+        'http://'+config.NEVRAX_URL+'/',
+    ]
+
+    rules = [
+            Rule(
+                LinkExtractor(
+                    canonicalize=True,
+                    unique=True,
+                    allow_domains=config.NEVRAX_URL,
+                    #deny=".*\.neodarz\.net.*"
+                ),
+                follow=True,
+                callback="parse_items"
+            )
+    ]
+
+
+    def start_requests(self):
+        for url in self.start_urls:
+            yield scrapy.Request(url, callback=self.parse, dont_filter=True)
+
+    def parse_items(self, response):
+        sel = Selector(response)
+        yield {
+                'url': response.url,
+                'title': response.css('title::text').extract_first(),
+                'content': ''.join(sel.select("//body//text()").extract()).strip()
+        }
author	neodarz <neodarz@neodarz.net>	2019-01-19 00:02:57 +0100
committer	neodarz <neodarz@neodarz.net>	2019-01-19 00:02:57 +0100
commit	a3f01580faf6caee4abcc8e682567b87380857b9 (patch)
tree	dbf5e0a9866b3aac7e7bb64f2eab8c005e1c28cd /crawler/nevrax/spiders/scrape.py
parent	073e919ef198a04da1e5ed28a7dfbc5d9681fc14 (diff)
download	khanindexer-a3f01580faf6caee4abcc8e682567b87380857b9.tar.xz khanindexer-a3f01580faf6caee4abcc8e682567b87380857b9.zip