# -*- coding: utf-8 -*-
"""Crawl spider that collects the URL, title, and body text of every page on neodarz.net."""
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class ScrapSpider(CrawlSpider):
    """Follow every in-domain link starting from the homepage; yield one item per page."""

    name = "scrape"
    # BUG FIX: Scrapy reads `allowed_domains` (used by the OffsiteMiddleware);
    # the original misspelled attribute `allow_domains` was silently ignored,
    # so off-site requests were never filtered.
    allowed_domains = ['neodarz.net']
    start_urls = [
        'https://neodarz.net/',
    ]
    rules = [
        Rule(
            LinkExtractor(
                canonicalize=True,
                unique=True,
                allow_domains="neodarz.net",
                # Raw string: `\.` inside a plain string literal is an invalid
                # escape sequence (DeprecationWarning since Python 3.6, and the
                # regex only worked by accident). Pattern text is unchanged.
                deny=r".*\.neodarz\.net.*"
            ),
            follow=True,
            callback="parse_items"
        )
    ]

    def start_requests(self):
        """Seed the crawl with the start URLs.

        ``dont_filter=True`` guarantees the seed request is never dropped by
        the dupe filter. The callback must stay ``self.parse`` — that is
        CrawlSpider's internal entry point which applies the ``rules`` above;
        pointing it at ``parse_items`` would disable link following.
        """
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse, dont_filter=True)

    def parse_items(self, response):
        """Yield one dict per crawled page: its URL, <title> text, and body text.

        FIX: uses ``response.xpath`` directly — ``Selector(response).select(...)``
        relied on ``Selector.select``, which was deprecated long ago and removed
        in Scrapy 1.0, so the original crashed on any modern Scrapy.
        """
        # NOTE(review): the class name 'bodya' looks like a typo for 'body' —
        # left untouched because it may match the site's real markup; verify.
        content = ''.join(
            response.xpath("//div[@class='bodya']//text()").extract()
        ).strip()
        yield {
            'url': response.url,
            'title': response.css('title::text').extract_first(),
            'content': content,
        }