# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy import Selector


class NeodarznetSpider(CrawlSpider):
    """Crawl neodarz.net and yield the title and body text of every page."""
    name = "neodarznet"

    custom_settings = {
        'ITEM_PIPELINES': {
            'crawler.neodarznet.pipelines.NeodarznetPipeline': 0
        }
    }

    # Scrapy's off-site middleware reads `allowed_domains`; the original
    # `allow_domains` attribute is silently ignored.
    allowed_domains = ['neodarz.net']

    start_urls = [
        'https://neodarz.net/',
    ]

    rules = [
        Rule(
            LinkExtractor(
                canonicalize=True,
                unique=True,
                allow_domains="neodarz.net",
                # Raw string so the backslashes reach the regex engine:
                # follow the apex domain only, skip subdomains.
                deny=r".*\.neodarz\.net.*"
            ),
            follow=True,
            callback="parse_items"
        )
    ]

    def start_requests(self):
        # Mirrors Scrapy's default start_requests; kept explicit so the
        # initial requests visibly bypass the duplicate filter.
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse, dont_filter=True)

    def parse_items(self, response):
        sel = Selector(response)
        yield {
            'url': response.url,
            'title': response.css('title::text').extract_first(),
            # Selector has no .select() method; .xpath() is the correct call.
            'content': ''.join(
                sel.xpath("//div[@class='bodya']//text()").extract()
            ).strip(),
            'content_length': len(response.body),
        }
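

# --- Hedged example -------------------------------------------------------
# `custom_settings` above points at crawler.neodarznet.pipelines.
# NeodarznetPipeline, which lives in a separate module and is not shown in
# this file. The class below is a minimal sketch of what that pipeline
# could look like: the process_item hook and the DropItem exception are
# standard Scrapy API, but the empty-content check is an assumption, not
# the project's actual logic.
from scrapy.exceptions import DropItem


class NeodarznetPipeline(object):
    """Illustrative sketch only; the real pipeline belongs in
    crawler/neodarznet/pipelines.py."""

    def process_item(self, item, spider):
        # Assumed behaviour: drop pages whose body text came back empty.
        if not item.get('content'):
            raise DropItem("empty content: %s" % item.get('url'))
        return item

# With the pipeline in place, the spider can be run from the project root
# with Scrapy's standard CLI, e.g.:
#   scrapy crawl neodarznet -o pages.json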