diff options
Diffstat (limited to 'crawler/nevrax')
-rw-r--r-- | crawler/nevrax/__init__.py | 0 | ||||
-rw-r--r-- | crawler/nevrax/pipelines.py | 30 | ||||
-rw-r--r-- | crawler/nevrax/spiders/__init__.py | 0 | ||||
-rw-r--r-- | crawler/nevrax/spiders/scrape.py | 56 |
4 files changed, 86 insertions, 0 deletions
diff --git a/crawler/nevrax/__init__.py b/crawler/nevrax/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/crawler/nevrax/__init__.py
diff --git a/crawler/nevrax/pipelines.py b/crawler/nevrax/pipelines.py
new file mode 100644
--- /dev/null
+++ b/crawler/nevrax/pipelines.py
@@ -0,0 +1,30 @@
+# -*- coding: utf-8 -*-
+"""Scrapy item pipeline that stores crawled Nevrax pages in the database."""
+import logging
+
+from database.models import Nevrax
+
+
+class NevraxPipeline(object):
+    """Insert or refresh one ``Nevrax`` row per scraped page, keyed by URL."""
+
+    def process_item(self, item, spider):
+        """Persist ``item`` and return it for the next pipeline stage.
+
+        :param item: mapping of ``Nevrax`` field names to values; its
+            ``'url'`` entry identifies the row.
+        :param spider: spider that produced the item (unused here).
+        :returns: ``item``, unchanged.
+        """
+        try:
+            page = Nevrax.get(Nevrax.url == item['url'])
+        except Nevrax.DoesNotExist:
+            # First time this URL is seen: insert a new row.
+            page = Nevrax.create(**item)
+            logging.info("Create item %s", page)
+        else:
+            # Known URL: overwrite the stored fields with the fresh scrape.
+            Nevrax.update(**item).where(Nevrax.url == item['url']).execute()
+            logging.info("Update item %s", page)
+        logging.info("Item %s stored in db", page)
+        return item
diff --git a/crawler/nevrax/spiders/__init__.py b/crawler/nevrax/spiders/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/crawler/nevrax/spiders/__init__.py
diff --git a/crawler/nevrax/spiders/scrape.py b/crawler/nevrax/spiders/scrape.py
new file mode 100644
--- /dev/null
+++ b/crawler/nevrax/spiders/scrape.py
@@ -0,0 +1,56 @@
+# -*- coding: utf-8 -*-
+"""Crawl spider that walks the Nevrax site and yields one item per page."""
+import scrapy
+from scrapy.spiders import CrawlSpider, Rule
+from scrapy.linkextractors import LinkExtractor
+from scrapy import Selector
+
+import config
+
+
+class NevraxSpider(CrawlSpider):
+    """Follow in-domain links from the site root and scrape each page."""
+
+    name = "nevrax"
+    custom_settings = {
+        'ITEM_PIPELINES': {
+            'crawler.nevrax.pipelines.NevraxPipeline': 0
+        }
+    }
+    # Scrapy's offsite filtering reads the ``allowed_domains`` attribute.
+    allowed_domains = [config.NEVRAX_URL]
+    start_urls = [
+        'http://'+config.NEVRAX_URL+'/',
+    ]
+
+    rules = [
+        Rule(
+            LinkExtractor(
+                canonicalize=True,
+                unique=True,
+                allow_domains=config.NEVRAX_URL,
+                #deny=".*\.neodarz\.net.*"
+            ),
+            follow=True,
+            callback="parse_items"
+        )
+    ]
+
+    def start_requests(self):
+        """Yield an unfiltered request for each start URL.
+
+        NOTE(review): the callback is ``self.parse``; this assumes the
+        installed Scrapy's ``CrawlSpider`` applies ``rules`` there - confirm.
+        """
+        for url in self.start_urls:
+            yield scrapy.Request(url, callback=self.parse, dont_filter=True)
+
+    def parse_items(self, response):
+        """Yield the URL, page title and body text of ``response``."""
+        sel = Selector(response)
+        yield {
+            'url': response.url,
+            'title': response.css('title::text').extract_first(),
+            # ``xpath()`` replaces the deprecated ``Selector.select()``.
+            'content': ''.join(sel.xpath("//body//text()").extract()).strip()
+        }