aboutsummaryrefslogtreecommitdiff
path: root/crawler/neodarznet/spiders/scrape.py
diff options
context:
space:
mode:
author: neodarz <neodarz@neodarz.net> 2019-01-10 23:54:18 +0100
committer: neodarz <neodarz@neodarz.net> 2019-01-10 23:54:18 +0100
commit: 82746a2b76e5948333133bfe07c1042af2cd33b7 (patch)
tree: 6a6302270472d079924d648644a2786ad115d200 /crawler/neodarznet/spiders/scrape.py
download: khanindexer-82746a2b76e5948333133bfe07c1042af2cd33b7.tar.xz
khanindexer-82746a2b76e5948333133bfe07c1042af2cd33b7.zip
Initial commit
Diffstat (limited to 'crawler/neodarznet/spiders/scrape.py')
-rw-r--r-- crawler/neodarznet/spiders/scrape.py 38
1 files changed, 38 insertions, 0 deletions
diff --git a/crawler/neodarznet/spiders/scrape.py b/crawler/neodarznet/spiders/scrape.py
new file mode 100644
index 0000000..a32a3e4
--- /dev/null
+++ b/crawler/neodarznet/spiders/scrape.py
@@ -0,0 +1,38 @@
+# -*- coding: utf-8 -*-
+import scrapy
+from scrapy.spiders import CrawlSpider, Rule
+from scrapy.linkextractors import LinkExtractor
+from scrapy import Selector
+
+class ScrapSpider(CrawlSpider):
+    """Crawl neodarz.net, yielding URL, title and body text of each followed page."""
+    name = "scrape"
+    allowed_domains = ['neodarz.net']
+    start_urls = ['https://neodarz.net/']
+
+    rules = [
+        Rule(
+            LinkExtractor(
+                canonicalize=True,
+                unique=True,
+                allow_domains="neodarz.net",
+                # Reject links whose URL contains ".neodarz.net" (e.g. subdomains).
+                deny=r".*\.neodarz\.net.*",
+            ),
+            follow=True, callback="parse_items",
+        )
+    ]
+
+    def start_requests(self):
+        """Yield an unfiltered request for each start URL, handled by ``self.parse``."""
+        for url in self.start_urls:
+            yield scrapy.Request(url, callback=self.parse, dont_filter=True)
+
+    def parse_items(self, response):
+        """Yield a dict of the page URL, its <title> text and the text of div.bodya."""
+        body_text = ''.join(response.xpath("//div[@class='bodya']//text()").extract())
+        yield {
+            'url': response.url,
+            'title': response.css('title::text').extract_first(),
+            'content': body_text.strip(),
+        }