# -*- coding: utf-8 -*-
import datetime

import scrapy
from scrapy.spiders import CrawlSpider
from dateutil.relativedelta import relativedelta

import config
from database.models import Nevrax


class NevraxSpider(CrawlSpider):
    name = "nevrax_updater"
    custom_settings = {
        'ITEM_PIPELINES': {
            'crawler.nevrax.pipelines.NevraxPipeline': 0
        }
    }
    allowed_domains = [config.NEVRAX_URL]

    # Seed the spider with every URL already stored in the database.
    start_urls = [row['url'] for row in Nevrax.select(Nevrax.url).dicts()]

    def start_requests(self):
        for url in self.start_urls:
            self.logger.info(url)
            try:
                page = Nevrax.get(Nevrax.url == url)
                # Re-crawl only pages last updated more than a week ago.
                if page.date_updated < datetime.datetime.now() - relativedelta(weeks=1):
                    yield scrapy.Request(url, callback=self.parse_url, dont_filter=True)
            except Nevrax.DoesNotExist:
                # Unknown page: crawl it unconditionally.
                yield scrapy.Request(url, callback=self.parse_url, dont_filter=True)

    def parse_url(self, response):
        yield {
            'url': response.url,
            'title': response.css('title::text').get(),
            'content': ''.join(
                response.xpath("//div[@class='bodya']//text()").getall()
            ).strip(),
            'content_length': len(response.body),
            'date_updated': datetime.datetime.now(),
        }
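
# Usage sketch (an assumption, not verified against the original project layout):
# if this file lives in a standard Scrapy project, e.g. under crawler/nevrax/spiders/,
# the spider can be run by name from the project root:
#
#   $ scrapy crawl nevrax_updater
#
# The spider also assumes a peewee-style Nevrax model exposing at least the two
# fields it reads. A minimal hypothetical sketch of database.models.Nevrax (the
# real model may define more fields and a database binding):
#
#   from peewee import Model, CharField, DateTimeField
#
#   class Nevrax(Model):
#       url = CharField(unique=True)
#       date_updated = DateTimeField()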