From a3f01580faf6caee4abcc8e682567b87380857b9 Mon Sep 17 00:00:00 2001
From: neodarz
Date: Sat, 19 Jan 2019 00:02:57 +0100
Subject: Add khanindex nevrax indexation

---
 crawler/neodarznet/settings.py       | 12 --------
 crawler/neodarznet/spiders/scrape.py |  9 ++++--
 crawler/nevrax/__init__.py           |  0
 crawler/nevrax/pipelines.py          | 27 +++++++++++++++++
 crawler/nevrax/spiders/__init__.py   |  0
 crawler/nevrax/spiders/scrape.py     | 57 ++++++++++++++++++++++++++++++++++++
 crawler/settings.py                  | 14 +++++++++
 7 files changed, 105 insertions(+), 14 deletions(-)
 delete mode 100644 crawler/neodarznet/settings.py
 create mode 100644 crawler/nevrax/__init__.py
 create mode 100644 crawler/nevrax/pipelines.py
 create mode 100644 crawler/nevrax/spiders/__init__.py
 create mode 100644 crawler/nevrax/spiders/scrape.py
 create mode 100644 crawler/settings.py

(limited to 'crawler')

diff --git a/crawler/neodarznet/settings.py b/crawler/neodarznet/settings.py
deleted file mode 100644
index 2e5f184..0000000
--- a/crawler/neodarznet/settings.py
+++ /dev/null
@@ -1,12 +0,0 @@
-# -*- coding: utf-8 -*-
-
-BOT_NAME = 'neodarznet'
-
-SPIDER_MODULES = ['crawler.neodarznet.spiders']
-NEWSPIDER_MODULE = 'crawler.neodarznet.spiders'
-
-ROBOTSTXT_OBEY = True
-
-DEPTH_LIMIT = 0
-
-ITEM_PIPELINES = {'crawler.neodarznet.pipelines.NeodarznetPipeline': 0}
diff --git a/crawler/neodarznet/spiders/scrape.py b/crawler/neodarznet/spiders/scrape.py
index a32a3e4..e16ede2 100644
--- a/crawler/neodarznet/spiders/scrape.py
+++ b/crawler/neodarznet/spiders/scrape.py
@@ -4,8 +4,13 @@ from scrapy.spiders import CrawlSpider, Rule
 from scrapy.linkextractors import LinkExtractor
 from scrapy import Selector
 
-class ScrapSpider(CrawlSpider):
-    name = "scrape"
+class NeodarznetSpider(CrawlSpider):
+    name = "neodarznet"
+    custom_settings = {
+        'ITEM_PIPELINES': {
+            'crawler.neodarznet.pipelines.NeodarznetPipeline': 0
+        }
+    }
     allow_domains = ['neodarz.net']
     start_urls = [
         'https://neodarz.net/',
diff --git a/crawler/nevrax/__init__.py b/crawler/nevrax/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/crawler/nevrax/pipelines.py b/crawler/nevrax/pipelines.py
new file mode 100644
index 0000000..775d5df
--- /dev/null
+++ b/crawler/nevrax/pipelines.py
@@ -0,0 +1,27 @@
+# -*- coding: utf-8 -*-
+import logging
+
+from database.models import Nevrax
+
+
+class NevraxPipeline(object):
+    """Item pipeline that stores crawled pages through the Nevrax model."""
+
+    def process_item(self, item, spider):
+        """Create or update the Nevrax row whose url matches the item.
+
+        The item is returned unchanged so later pipelines still receive it.
+        """
+        url = item['url']
+        try:
+            page = Nevrax.get(Nevrax.url == url)
+        except Nevrax.DoesNotExist:
+            page = Nevrax.create(**item)
+            logging.info("Create item %s", page)
+        else:
+            # Only the lookup sits in the try block, so an error raised by
+            # the update itself is never mistaken for a missing row.
+            Nevrax.update(**item).where(Nevrax.url == url).execute()
+            logging.info("Update item %s", page)
+        logging.info('Item %s stored in db', page)
+        return item
diff --git a/crawler/nevrax/spiders/__init__.py b/crawler/nevrax/spiders/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/crawler/nevrax/spiders/scrape.py b/crawler/nevrax/spiders/scrape.py
new file mode 100644
index 0000000..8a7b8ec
--- /dev/null
+++ b/crawler/nevrax/spiders/scrape.py
@@ -0,0 +1,57 @@
+# -*- coding: utf-8 -*-
+import scrapy
+from scrapy.spiders import CrawlSpider, Rule
+from scrapy.linkextractors import LinkExtractor
+from scrapy import Selector
+
+import config
+
+
+class NevraxSpider(CrawlSpider):
+    """Crawl the site at config.NEVRAX_URL and yield one item per page."""
+
+    name = "nevrax"
+    # Pipelines are declared per spider, so this spider's items only go
+    # through NevraxPipeline.
+    custom_settings = {
+        'ITEM_PIPELINES': {
+            'crawler.nevrax.pipelines.NevraxPipeline': 0
+        }
+    }
+    # NOTE(review): Scrapy's offsite filtering reads `allowed_domains`, not
+    # `allow_domains`; the crawl is confined to the site by the LinkExtractor
+    # in `rules`. Kept under its original name for compatibility.
+    allow_domains = [config.NEVRAX_URL]
+    start_urls = [
+        'http://' + config.NEVRAX_URL + '/',
+    ]
+
+    # Follow every link that stays on the configured domain and hand each
+    # fetched page to parse_items.
+    rules = [
+        Rule(
+            LinkExtractor(
+                canonicalize=True,
+                unique=True,
+                allow_domains=config.NEVRAX_URL,
+            ),
+            follow=True,
+            callback="parse_items"
+        )
+    ]
+
+    def start_requests(self):
+        """Yield one request per start URL, handled by CrawlSpider.parse."""
+        for url in self.start_urls:
+            # dont_filter=True lets the start URLs bypass the duplicate filter.
+            yield scrapy.Request(url, callback=self.parse, dont_filter=True)
+
+    def parse_items(self, response):
+        """Yield the url, title and stripped body text of a crawled page."""
+        sel = Selector(response)
+        yield {
+            'url': response.url,
+            'title': response.css('title::text').extract_first(),
+            # .xpath() replaces the deprecated Selector.select() alias.
+            'content': ''.join(sel.xpath("//body//text()").extract()).strip()
+        }
diff --git a/crawler/settings.py b/crawler/settings.py
new file mode 100644
index 0000000..2ccdc11
--- /dev/null
+++ b/crawler/settings.py
@@ -0,0 +1,14 @@
+# -*- coding: utf-8 -*-
+
+BOT_NAME = 'khanindex'
+
+# Every package listed here is searched for spiders.
+SPIDER_MODULES = ['crawler.nevrax.spiders', 'crawler.neodarznet.spiders']
+
+ROBOTSTXT_OBEY = True
+
+# 0 means no depth limit is imposed.
+DEPTH_LIMIT = 0
+
+# ITEM_PIPELINES is deliberately not set here: each spider declares its own
+# pipeline through custom_settings.
-- 
cgit v1.2.1