about | summary | refs | log | tree | commit | diff
path: root/crawler
diff options
context:
space:
mode:
author neodarz <neodarz@neodarz.net> 2019-01-19 00:02:57 +0100
committer neodarz <neodarz@neodarz.net> 2019-01-19 00:02:57 +0100
commit a3f01580faf6caee4abcc8e682567b87380857b9 (patch)
tree dbf5e0a9866b3aac7e7bb64f2eab8c005e1c28cd /crawler
parent 073e919ef198a04da1e5ed28a7dfbc5d9681fc14 (diff)
download khanindexer-a3f01580faf6caee4abcc8e682567b87380857b9.tar.xz
khanindexer-a3f01580faf6caee4abcc8e682567b87380857b9.zip
Add khanindex nevrax indexation
Diffstat (limited to 'crawler')
-rw-r--r-- crawler/neodarznet/settings.py 12
-rw-r--r-- crawler/neodarznet/spiders/scrape.py 9
-rw-r--r-- crawler/nevrax/__init__.py 0
-rw-r--r-- crawler/nevrax/pipelines.py 19
-rw-r--r-- crawler/nevrax/spiders/__init__.py 0
-rw-r--r-- crawler/nevrax/spiders/scrape.py 45
-rw-r--r-- crawler/settings.py 12
7 files changed, 83 insertions, 14 deletions
diff --git a/crawler/neodarznet/settings.py b/crawler/neodarznet/settings.py
deleted file mode 100644
index 2e5f184..0000000
--- a/crawler/neodarznet/settings.py
+++ /dev/null
@@ -1,12 +0,0 @@
-# -*- coding: utf-8 -*-
-
-BOT_NAME = 'neodarznet'
-
-SPIDER_MODULES = ['crawler.neodarznet.spiders']
-NEWSPIDER_MODULE = 'crawler.neodarznet.spiders'
-
-ROBOTSTXT_OBEY = True
-
-DEPTH_LIMIT = 0
-
-ITEM_PIPELINES = {'crawler.neodarznet.pipelines.NeodarznetPipeline': 0}
diff --git a/crawler/neodarznet/spiders/scrape.py b/crawler/neodarznet/spiders/scrape.py
index a32a3e4..e16ede2 100644
--- a/crawler/neodarznet/spiders/scrape.py
+++ b/crawler/neodarznet/spiders/scrape.py
@@ -4,8 +4,13 @@ from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy import Selector
-class ScrapSpider(CrawlSpider):
- name = "scrape"
+class NeodarznetSpider(CrawlSpider):
+ name = "neodarznet"
+ custom_settings = {
+ 'ITEM_PIPELINES': {
+ 'crawler.neodarznet.pipelines.NeodarznetPipeline': 0
+ }
+ }
allow_domains = ['neodarz.net']
start_urls = [
'https://neodarz.net/',
diff --git a/crawler/nevrax/__init__.py b/crawler/nevrax/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/crawler/nevrax/__init__.py
diff --git a/crawler/nevrax/pipelines.py b/crawler/nevrax/pipelines.py
new file mode 100644
index 0000000..775d5df
--- /dev/null
+++ b/crawler/nevrax/pipelines.py
@@ -0,0 +1,19 @@
+# -*- coding: utf-8 -*-
+import logging
+
+from database.models import Nevrax
+
+
+class NevraxPipeline(object):
+
+ def process_item(self, item, spider):
+ try:
+ page = Nevrax.get(Nevrax.url == item['url'])
+ q = Nevrax.update(**item).where(Nevrax.url == item['url'])
+ q.execute()
+ logging.info("Update item {}".format(page))
+ except Nevrax.DoesNotExist:
+ page = Nevrax.create(**item)
+ logging.info("Create item {}".format(page))
+ logging.info('Item {} stored in db'.format(page))
+ return item
diff --git a/crawler/nevrax/spiders/__init__.py b/crawler/nevrax/spiders/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/crawler/nevrax/spiders/__init__.py
diff --git a/crawler/nevrax/spiders/scrape.py b/crawler/nevrax/spiders/scrape.py
new file mode 100644
index 0000000..8a7b8ec
--- /dev/null
+++ b/crawler/nevrax/spiders/scrape.py
@@ -0,0 +1,45 @@
+# -*- coding: utf-8 -*-
+import scrapy
+from scrapy.spiders import CrawlSpider, Rule
+from scrapy.linkextractors import LinkExtractor
+from scrapy import Selector
+
+import config
+
+class NevraxSpider(CrawlSpider):
+ name = "nevrax"
+ custom_settings = {
+ 'ITEM_PIPELINES': {
+ 'crawler.nevrax.pipelines.NevraxPipeline': 0
+ }
+ }
+ allow_domains = [config.NEVRAX_URL]
+ start_urls = [
+ 'http://'+config.NEVRAX_URL+'/',
+ ]
+
+ rules = [
+ Rule(
+ LinkExtractor(
+ canonicalize=True,
+ unique=True,
+ allow_domains=config.NEVRAX_URL,
+ #deny=".*\.neodarz\.net.*"
+ ),
+ follow=True,
+ callback="parse_items"
+ )
+ ]
+
+
+ def start_requests(self):
+ for url in self.start_urls:
+ yield scrapy.Request(url, callback=self.parse, dont_filter=True)
+
+ def parse_items(self, response):
+ sel = Selector(response)
+ yield {
+ 'url': response.url,
+ 'title': response.css('title::text').extract_first(),
+ 'content': ''.join(sel.select("//body//text()").extract()).strip()
+ }
diff --git a/crawler/settings.py b/crawler/settings.py
new file mode 100644
index 0000000..2ccdc11
--- /dev/null
+++ b/crawler/settings.py
@@ -0,0 +1,12 @@
+# -*- coding: utf-8 -*-
+
+BOT_NAME = 'khanindex'
+
+SPIDER_MODULES = ['crawler.nevrax.spiders', 'crawler.neodarznet.spiders']
+#NEWSPIDER_MODULE = 'crawler.nevrax.spiders'
+
+ROBOTSTXT_OBEY = True
+
+DEPTH_LIMIT = 0
+
+#ITEM_PIPELINES = {'crawler.nevrax.pipelines.NevraxPipeline': 0, 'crawler.neodarznet.pipelines.NeodarznetPipeline': 0}