Initial commit

author: neodarz <neodarz@neodarz.net> 2019-01-10 23:54:18 +0100
committer: neodarz <neodarz@neodarz.net> 2019-01-10 23:54:18 +0100
commit: 82746a2b76e5948333133bfe07c1042af2cd33b7 (patch)
tree: 6a6302270472d079924d648644a2786ad115d200 /crawler
download: khanindexer-82746a2b76e5948333133bfe07c1042af2cd33b7.tar.xz
khanindexer-82746a2b76e5948333133bfe07c1042af2cd33b7.zip
7 files changed, 61 insertions, 0 deletions
diff --git a/crawler/__init__.py b/crawler/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/crawler/__init__.py
diff --git a/crawler/neodarznet/__init__.py b/crawler/neodarznet/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/crawler/neodarznet/__init__.py
diff --git a/crawler/neodarznet/items.py b/crawler/neodarznet/items.py
new file mode 100644
index 0000000..850cec8
--- /dev/null
+++ b/crawler/neodarznet/items.py
@@ -0,0 +1,7 @@
+# -*- coding: utf-8 -*-
+
+import scrapy
+
+
+class NeodarznetItem(scrapy.Item):
+    """Scrapy item for this project; it declares no fields."""
diff --git a/crawler/neodarznet/pipelines.py b/crawler/neodarznet/pipelines.py
new file mode 100644
index 0000000..71e7865
--- /dev/null
+++ b/crawler/neodarznet/pipelines.py
@@ -0,0 +1,6 @@
+# -*- coding: utf-8 -*-
+
+
+class NeodarznetPipeline(object):  # pass-through item pipeline
+    def process_item(self, item, spider):  # hook name Scrapy calls per item
+        return item  # hand the item on unchanged
diff --git a/crawler/neodarznet/settings.py b/crawler/neodarznet/settings.py
new file mode 100644
index 0000000..8d65b09
--- /dev/null
+++ b/crawler/neodarznet/settings.py
@@ -0,0 +1,10 @@
+# -*- coding: utf-8 -*-
+
+BOT_NAME = "neodarznet"  # name of this Scrapy project/bot
+
+NEWSPIDER_MODULE = "crawler.neodarznet.spiders"  # package for new spiders
+SPIDER_MODULES = [NEWSPIDER_MODULE]  # packages searched for spider classes
+
+ROBOTSTXT_OBEY = True  # respect robots.txt rules
+
+DEPTH_LIMIT = 0  # 0 = no crawl-depth limit (per Scrapy settings docs)
diff --git a/crawler/neodarznet/spiders/__init__.py b/crawler/neodarznet/spiders/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/crawler/neodarznet/spiders/__init__.py
diff --git a/crawler/neodarznet/spiders/scrape.py b/crawler/neodarznet/spiders/scrape.py
new file mode 100644
index 0000000..a32a3e4
--- /dev/null
+++ b/crawler/neodarznet/spiders/scrape.py
@@ -0,0 +1,38 @@
+# -*- coding: utf-8 -*-
+import scrapy
+from scrapy.spiders import CrawlSpider, Rule
+from scrapy.linkextractors import LinkExtractor
+from scrapy import Selector
+
+class ScrapSpider(CrawlSpider):
+    """Crawl neodarz.net; yield url/title/text for pages reached via the rule."""
+    name = "scrape"
+    allowed_domains = ['neodarz.net']  # attribute name Scrapy's offsite filter reads
+    start_urls = ['https://neodarz.net/']
+
+    rules = [
+            Rule(
+                LinkExtractor(
+                    canonicalize=True,
+                    unique=True,
+                    allow_domains="neodarz.net",
+                    deny=r".*\.neodarz\.net.*"  # drop URLs containing ".neodarz.net"
+                ),
+                follow=True,
+                callback="parse_items"
+            )
+    ]
+
+    def start_requests(self):
+        """Request every start URL, bypassing the duplicate filter."""
+        for url in self.start_urls:
+            # No explicit callback: the spider's default one applies `rules`.
+            yield scrapy.Request(url, dont_filter=True)
+
+    def parse_items(self, response):
+        """Yield a dict with the page url, <title> text and div.bodya text."""
+        body_text = response.xpath("//div[@class='bodya']//text()").extract()
+        yield {
+                'url': response.url,
+                'title': response.css('title::text').extract_first(),
+                'content': ''.join(body_text).strip()
+        }
author	neodarz <neodarz@neodarz.net>	2019-01-10 23:54:18 +0100
committer	neodarz <neodarz@neodarz.net>	2019-01-10 23:54:18 +0100
commit	82746a2b76e5948333133bfe07c1042af2cd33b7 (patch)
tree	6a6302270472d079924d648644a2786ad115d200 /crawler
download	khanindexer-82746a2b76e5948333133bfe07c1042af2cd33b7.tar.xz khanindexer-82746a2b76e5948333133bfe07c1042af2cd33b7.zip