# -*- coding: utf-8 -*-
import datetime

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

import config


class NevraxSpider(CrawlSpider):
    name = "nevrax_crawler"

    custom_settings = {
        'ITEM_PIPELINES': {
            'crawler.nevrax.pipelines.NevraxPipeline': 0
        }
    }

    # The CrawlSpider attribute is `allowed_domains`; the original
    # `allow_domains` is silently ignored, so the offsite filter
    # would never apply.
    allowed_domains = [config.NEVRAX_URL]
    start_urls = [
        'http://' + config.NEVRAX_URL + '/',
    ]

    rules = [
        Rule(
            LinkExtractor(
                canonicalize=True,
                unique=True,
                allow_domains=config.NEVRAX_URL,
                #deny=".*\.neodarz\.net.*"
            ),
            follow=True,
            callback="parse_items"
        )
    ]

    def start_requests(self):
        # Seed the crawl; dont_filter=True lets the start URL through
        # even if the dupefilter has already seen it. The callback is
        # CrawlSpider.parse, which applies the rules above.
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse, dont_filter=True)

    def parse_items(self, response):
        # One item per crawled page: URL, title, visible body text,
        # raw response size in bytes, and a scrape timestamp.
        # Selector.select() no longer exists; response.xpath() replaces
        # the original `Selector(response).select(...)` call.
        yield {
            'url': response.url,
            'title': response.css('title::text').extract_first(),
            'content': ''.join(response.xpath("//body//text()").extract()).strip(),
            'content_length': len(response.body),
            'date_updated': datetime.datetime.now()
        }
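

# Hedged usage sketch: running the spider standalone with Scrapy's
# CrawlerProcess. The surrounding project layout is assumed, not shown
# here: `config.NEVRAX_URL` and the `crawler.nevrax.pipelines.NevraxPipeline`
# module referenced in custom_settings must both be importable, and the
# politeness settings below are illustrative defaults, not values from the
# original project. Inside a full Scrapy project, `scrapy crawl
# nevrax_crawler` achieves the same thing.
if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(settings={
        # Assumed settings for a polite full-site crawl; tune as needed.
        'ROBOTSTXT_OBEY': True,
        'DOWNLOAD_DELAY': 0.5,
    })
    process.crawl(NevraxSpider)
    process.start()  # blocks until the crawl finishes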