From 82746a2b76e5948333133bfe07c1042af2cd33b7 Mon Sep 17 00:00:00 2001 From: neodarz Date: Thu, 10 Jan 2019 23:54:18 +0100 Subject: Initial commit --- .gitignore | 89 ++++++++++++++++++++++++++++++++++ README.md | 10 ++++ crawler/__init__.py | 0 crawler/neodarznet/__init__.py | 0 crawler/neodarznet/items.py | 7 +++ crawler/neodarznet/pipelines.py | 6 +++ crawler/neodarznet/settings.py | 10 ++++ crawler/neodarznet/spiders/__init__.py | 0 crawler/neodarznet/spiders/scrape.py | 38 +++++++++++++++ scrapy.cfg | 5 ++ 10 files changed, 165 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 crawler/__init__.py create mode 100644 crawler/neodarznet/__init__.py create mode 100644 crawler/neodarznet/items.py create mode 100644 crawler/neodarznet/pipelines.py create mode 100644 crawler/neodarznet/settings.py create mode 100644 crawler/neodarznet/spiders/__init__.py create mode 100644 crawler/neodarznet/spiders/scrape.py create mode 100644 scrapy.cfg diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..72364f9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,89 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*,cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# IPython Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# dotenv +.env + +# virtualenv +venv/ +ENV/ + +# Spyder project settings +.spyderproject + +# Rope project settings +.ropeproject diff --git a/README.md b/README.md new file mode 100644 index 0000000..726b105 --- /dev/null +++ b/README.md @@ -0,0 +1,10 @@ +Simple search engine - but you can search nothing for the moment + +# Crawling + +For now there is an example spider with neodarz website. +For testing it just run: + +``` +scrapy crawl scrape -o out.json +``` diff --git a/crawler/__init__.py b/crawler/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/crawler/neodarznet/__init__.py b/crawler/neodarznet/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/crawler/neodarznet/items.py b/crawler/neodarznet/items.py new file mode 100644 index 0000000..850cec8 --- /dev/null +++ b/crawler/neodarznet/items.py @@ -0,0 +1,7 @@ +# -*- coding: utf-8 -*- + +import scrapy + + +class NeodarznetItem(scrapy.Item): + pass diff --git a/crawler/neodarznet/pipelines.py b/crawler/neodarznet/pipelines.py new file mode 100644 index 0000000..71e7865 --- /dev/null +++ b/crawler/neodarznet/pipelines.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- + + +class NeodarznetPipeline(object): + def process_item(self, item, spider): + return item diff --git a/crawler/neodarznet/settings.py b/crawler/neodarznet/settings.py new file mode 100644 index 0000000..8d65b09 --- /dev/null +++ b/crawler/neodarznet/settings.py @@
-0,0 +1,10 @@ +# -*- coding: utf-8 -*- + +BOT_NAME = 'neodarznet' + +SPIDER_MODULES = ['crawler.neodarznet.spiders'] +NEWSPIDER_MODULE = 'crawler.neodarznet.spiders' + +ROBOTSTXT_OBEY = True + +DEPTH_LIMIT = 0 diff --git a/crawler/neodarznet/spiders/__init__.py b/crawler/neodarznet/spiders/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/crawler/neodarznet/spiders/scrape.py b/crawler/neodarznet/spiders/scrape.py new file mode 100644 index 0000000..a32a3e4 --- /dev/null +++ b/crawler/neodarznet/spiders/scrape.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +import scrapy +from scrapy.spiders import CrawlSpider, Rule +from scrapy.linkextractors import LinkExtractor +from scrapy import Selector + +class ScrapSpider(CrawlSpider): + name = "scrape" + allowed_domains = ['neodarz.net'] + start_urls = [ + 'https://neodarz.net/', + ] + + rules = [ + Rule( + LinkExtractor( + canonicalize=True, + unique=True, + allow_domains="neodarz.net", + deny=".*\.neodarz\.net.*" + ), + follow=True, + callback="parse_items" + ) + ] + + + def start_requests(self): + for url in self.start_urls: + yield scrapy.Request(url, callback=self.parse, dont_filter=True) + + def parse_items(self, response): + sel = Selector(response) + yield { + 'url': response.url, + 'title': response.css('title::text').extract_first(), + 'content': ''.join(sel.xpath("//div[@class='bodya']//text()").extract()).strip() + } diff --git a/scrapy.cfg b/scrapy.cfg new file mode 100644 index 0000000..22162ef --- /dev/null +++ b/scrapy.cfg @@ -0,0 +1,5 @@ +[settings] +default = crawler.neodarznet.settings + +[deploy] +project = crawler.neodarznet -- cgit v1.2.1