author     neodarz <neodarz@neodarz.net>  2019-02-06 19:15:36 +0100
committer  neodarz <neodarz@neodarz.net>  2019-02-06 19:15:36 +0100
commit     9a8badd5dffe47813489ab0b355f5db5faa66646 (patch)
tree       bf53db13612cd7c32f2eadfe905cad83ce50a0d7 /crawler/neodarznet
parent     f84e8fb75b8096dff5a39936ac26c933fdba3059 (diff)
Add ability to update URLs that are more than one week old and whose content has been modified
Diffstat (limited to 'crawler/neodarznet')
-rw-r--r--  crawler/neodarznet/spiders/scrape.py |  2 +-
-rw-r--r--  crawler/neodarznet/spiders/update.py | 49 ++++++++++
2 files changed, 50 insertions(+), 1 deletion(-)
diff --git a/crawler/neodarznet/spiders/scrape.py b/crawler/neodarznet/spiders/scrape.py
index bd97067..2d3c32b 100644
--- a/crawler/neodarznet/spiders/scrape.py
+++ b/crawler/neodarznet/spiders/scrape.py
@@ -7,7 +7,7 @@ from scrapy import Selector
 import datetime
 
 class NeodarznetSpider(CrawlSpider):
-    name = "neodarznet"
+    name = "neodarznet_crawler"
     custom_settings = {
         'ITEM_PIPELINES': {
             'crawler.neodarznet.pipelines.NeodarznetPipeline': 0
diff --git a/crawler/neodarznet/spiders/update.py b/crawler/neodarznet/spiders/update.py
new file mode 100644
index 0000000..38f1863
--- /dev/null
+++ b/crawler/neodarznet/spiders/update.py
@@ -0,0 +1,49 @@
+# -*- coding: utf-8 -*-
+import scrapy
+from scrapy.spiders import CrawlSpider, Rule
+from scrapy.linkextractors import LinkExtractor
+from scrapy import Selector
+
+import datetime
+
+from database.models import Neodarznet
+
+from dateutil.relativedelta import *
+
+import logging
+
+class NeodarznetSpider(CrawlSpider):
+    name = "neodarznet_updater"
+    custom_settings = {
+        'ITEM_PIPELINES': {
+            'crawler.neodarznet.pipelines.NeodarznetPipeline': 0
+        }
+    }
+    allow_domains = ['neodarz.net']
+
+    datas = Neodarznet.select(Neodarznet.url).dicts()
+    datas_array = []
+    for value in datas:
+        datas_array.append(value['url'])
+    start_urls = datas_array
+
+    def start_requests(self):
+        for url in self.start_urls:
+            logging.info(url)
+            try:
+                page = Neodarznet.get(Neodarznet.url == url)
+                if page.date_updated < datetime.datetime.now()+relativedelta(weeks=-1):
+                    yield scrapy.Request(url, callback=self.parse_url, dont_filter=True)
+            except Neodarznet.DoesNotExist:
+                yield scrapy.Request(url, callback=self.parse_url, dont_filter=True)
+                continue
+
+    def parse_url(self, response):
+        sel = Selector(response)
+        yield {
+            'url': response.url,
+            'title': response.css('title::text').extract_first(),
+            'content': ''.join(sel.select("//div[@class='bodya']//text()").extract()).strip(),
+            'content_length': len(response.body),
+            'date_updated': datetime.datetime.now()
+        }
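
A minimal standalone sketch of the freshness test that start_requests applies per URL, assuming python-dateutil is installed (the is_stale helper and the sample dates are illustrative, not part of the commit):

    import datetime
    from dateutil.relativedelta import relativedelta

    def is_stale(date_updated, now=None):
        """Return True when a record was last refreshed more than one week ago."""
        now = now or datetime.datetime.now()
        # now + relativedelta(weeks=-1) is "one week before now", the same
        # comparison NeodarznetSpider.start_requests performs against date_updated.
        return date_updated < now + relativedelta(weeks=-1)

    now = datetime.datetime.now()
    assert is_stale(now - datetime.timedelta(days=10), now)     # ten days old: re-crawl
    assert not is_stale(now - datetime.timedelta(days=1), now)  # one day old: skip

URLs absent from the database (the Neodarznet.DoesNotExist branch) are requested unconditionally, so new entries are never skipped. Assuming a standard Scrapy project layout, each spider would be run by name, e.g. scrapy crawl neodarznet_updater. One caveat: Scrapy spiders spell the domain whitelist allowed_domains, so the allow_domains attribute as written is most likely ignored by the framework.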