aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorneodarz <neodarz@neodarz.net>2019-01-10 23:54:18 +0100
committerneodarz <neodarz@neodarz.net>2019-01-10 23:54:18 +0100
commit82746a2b76e5948333133bfe07c1042af2cd33b7 (patch)
tree6a6302270472d079924d648644a2786ad115d200
downloadkhanindexer-82746a2b76e5948333133bfe07c1042af2cd33b7.tar.xz
khanindexer-82746a2b76e5948333133bfe07c1042af2cd33b7.zip
Initial commit
-rw-r--r--.gitignore89
-rw-r--r--README.md10
-rw-r--r--crawler/__init__.py0
-rw-r--r--crawler/neodarznet/__init__.py0
-rw-r--r--crawler/neodarznet/items.py7
-rw-r--r--crawler/neodarznet/pipelines.py6
-rw-r--r--crawler/neodarznet/settings.py10
-rw-r--r--crawler/neodarznet/spiders/__init__.py0
-rw-r--r--crawler/neodarznet/spiders/scrape.py38
-rw-r--r--scrapy.cfg5
10 files changed, 165 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..72364f9
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,89 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*,cover
+.hypothesis/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# IPython Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# dotenv
+.env
+
+# virtualenv
+venv/
+ENV/
+
+# Spyder project settings
+.spyderproject
+
+# Rope project settings
+.ropeproject
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..726b105
--- /dev/null
+++ b/README.md
@@ -0,0 +1,10 @@
+Simple search engine - but you cannot search anything yet
+
+# Crawling
+
+For now there is one example spider, which crawls the neodarz website.
+To test it, just run:
+
+```
+scrapy crawl scrape -o out.json
+```
diff --git a/crawler/__init__.py b/crawler/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/crawler/__init__.py
diff --git a/crawler/neodarznet/__init__.py b/crawler/neodarznet/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/crawler/neodarznet/__init__.py
diff --git a/crawler/neodarznet/items.py b/crawler/neodarznet/items.py
new file mode 100644
index 0000000..850cec8
--- /dev/null
+++ b/crawler/neodarznet/items.py
@@ -0,0 +1,7 @@
+# -*- coding: utf-8 -*-
+
+import scrapy
+
+
+class NeodarznetItem(scrapy.Item):
+ """Placeholder Scrapy item for this project; it declares no fields yet."""
+ pass
diff --git a/crawler/neodarznet/pipelines.py b/crawler/neodarznet/pipelines.py
new file mode 100644
index 0000000..71e7865
--- /dev/null
+++ b/crawler/neodarznet/pipelines.py
@@ -0,0 +1,6 @@
+# -*- coding: utf-8 -*-
+
+
+class NeodarznetPipeline(object):
+ def process_time(self, item, spider):
+ return item
diff --git a/crawler/neodarznet/settings.py b/crawler/neodarznet/settings.py
new file mode 100644
index 0000000..8d65b09
--- /dev/null
+++ b/crawler/neodarznet/settings.py
@@ -0,0 +1,10 @@
+# -*- coding: utf-8 -*-
+
+# Name of the bot implemented by this Scrapy project.
+BOT_NAME = 'neodarznet'
+
+# Modules Scrapy searches for spiders, and the module in which
+# `scrapy genspider` creates new ones.
+SPIDER_MODULES = ['crawler.neodarznet.spiders']
+NEWSPIDER_MODULE = 'crawler.neodarznet.spiders'
+
+# Respect each site's robots.txt rules while crawling.
+ROBOTSTXT_OBEY = True
+
+# Per the Scrapy settings reference, 0 means no crawl-depth limit.
+DEPTH_LIMIT = 0
diff --git a/crawler/neodarznet/spiders/__init__.py b/crawler/neodarznet/spiders/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/crawler/neodarznet/spiders/__init__.py
diff --git a/crawler/neodarznet/spiders/scrape.py b/crawler/neodarznet/spiders/scrape.py
new file mode 100644
index 0000000..a32a3e4
--- /dev/null
+++ b/crawler/neodarznet/spiders/scrape.py
@@ -0,0 +1,38 @@
+# -*- coding: utf-8 -*-
+import scrapy
+from scrapy.spiders import CrawlSpider, Rule
+from scrapy.linkextractors import LinkExtractor
+from scrapy import Selector
+
+class ScrapSpider(CrawlSpider):
+ name = "scrape"
+ allow_domains = ['neodarz.net']
+ start_urls = [
+ 'https://neodarz.net/',
+ ]
+
+ rules = [
+ Rule(
+ LinkExtractor(
+ canonicalize=True,
+ unique=True,
+ allow_domains="neodarz.net",
+ deny=".*\.neodarz\.net.*"
+ ),
+ follow=True,
+ callback="parse_items"
+ )
+ ]
+
+
+ def start_requests(self):
+ for url in self.start_urls:
+ yield scrapy.Request(url, callback=self.parse, dont_filter=True)
+
+ def parse_items(self, response):
+ sel = Selector(response)
+ yield {
+ 'url': response.url,
+ 'title': response.css('title::text').extract_first(),
+ 'content': ''.join(sel.select("//div[@class='bodya']//text()").extract()).strip()
+ }
diff --git a/scrapy.cfg b/scrapy.cfg
new file mode 100644
index 0000000..22162ef
--- /dev/null
+++ b/scrapy.cfg
@@ -0,0 +1,5 @@
+[settings]
+default = crawler.neodarznet.settings
+
+[deploy]
+project = crawler.neodarznet