diff options
author | neodarz <neodarz@neodarz.net> | 2019-01-10 23:54:18 +0100 |
---|---|---|
committer | neodarz <neodarz@neodarz.net> | 2019-01-10 23:54:18 +0100 |
commit | 82746a2b76e5948333133bfe07c1042af2cd33b7 (patch) | |
tree | 6a6302270472d079924d648644a2786ad115d200 /crawler | |
download | khanindexer-82746a2b76e5948333133bfe07c1042af2cd33b7.tar.xz khanindexer-82746a2b76e5948333133bfe07c1042af2cd33b7.zip |
Initial commit
Diffstat (limited to '')
-rw-r--r-- | crawler/__init__.py | 0 | ||||
-rw-r--r-- | crawler/neodarznet/__init__.py | 0 | ||||
-rw-r--r-- | crawler/neodarznet/items.py | 7 | ||||
-rw-r--r-- | crawler/neodarznet/pipelines.py | 6 | ||||
-rw-r--r-- | crawler/neodarznet/settings.py | 10 | ||||
-rw-r--r-- | crawler/neodarznet/spiders/__init__.py | 0 | ||||
-rw-r--r-- | crawler/neodarznet/spiders/scrape.py | 38 |
7 files changed, 61 insertions, 0 deletions
diff --git a/crawler/__init__.py b/crawler/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/crawler/__init__.py
diff --git a/crawler/neodarznet/__init__.py b/crawler/neodarznet/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/crawler/neodarznet/__init__.py
diff --git a/crawler/neodarznet/items.py b/crawler/neodarznet/items.py
new file mode 100644
index 0000000..850cec8
--- /dev/null
+++ b/crawler/neodarznet/items.py
@@ -0,0 +1,7 @@
+# -*- coding: utf-8 -*-
+
+import scrapy
+
+
+class NeodarznetItem(scrapy.Item):
+    pass
diff --git a/crawler/neodarznet/pipelines.py b/crawler/neodarznet/pipelines.py
new file mode 100644
index 0000000..71e7865
--- /dev/null
+++ b/crawler/neodarznet/pipelines.py
@@ -0,0 +1,6 @@
+# -*- coding: utf-8 -*-
+
+
+class NeodarznetPipeline(object):
+    def process_time(self, item, spider):
+        return item
diff --git a/crawler/neodarznet/settings.py b/crawler/neodarznet/settings.py
new file mode 100644
index 0000000..8d65b09
--- /dev/null
+++ b/crawler/neodarznet/settings.py
@@ -0,0 +1,10 @@
+# -*- coding: utf-8 -*-
+
+BOT_NAME = 'neodarznet'
+
+SPIDER_MODULES = ['crawler.neodarznet.spiders']
+NEWSPIDER_MODULE = 'crawler.neodarznet.spiders'
+
+ROBOTSTXT_OBEY = True
+
+DEPTH_LIMIT = 0
diff --git a/crawler/neodarznet/spiders/__init__.py b/crawler/neodarznet/spiders/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/crawler/neodarznet/spiders/__init__.py
diff --git a/crawler/neodarznet/spiders/scrape.py b/crawler/neodarznet/spiders/scrape.py
new file mode 100644
index 0000000..a32a3e4
--- /dev/null
+++ b/crawler/neodarznet/spiders/scrape.py
@@ -0,0 +1,38 @@
+# -*- coding: utf-8 -*-
+import scrapy
+from scrapy.spiders import CrawlSpider, Rule
+from scrapy.linkextractors import LinkExtractor
+from scrapy import Selector
+
+class ScrapSpider(CrawlSpider):
+    name = "scrape"
+    allow_domains = ['neodarz.net']
+    start_urls = [
+        'https://neodarz.net/',
+    ]
+
+    rules = [
+        Rule(
+            LinkExtractor(
+                canonicalize=True,
+                unique=True,
+                allow_domains="neodarz.net",
+                deny=".*\.neodarz\.net.*"
+            ),
+            follow=True,
+            callback="parse_items"
+        )
+    ]
+
+
+    def start_requests(self):
+        for url in self.start_urls:
+            yield scrapy.Request(url, callback=self.parse, dont_filter=True)
+
+    def parse_items(self, response):
+        sel = Selector(response)
+        yield {
+            'url': response.url,
+            'title': response.css('title::text').extract_first(),
+            'content': ''.join(sel.select("//div[@class='bodya']//text()").extract()).strip()
+        }