From a3f01580faf6caee4abcc8e682567b87380857b9 Mon Sep 17 00:00:00 2001 From: neodarz Date: Sat, 19 Jan 2019 00:02:57 +0100 Subject: Add khanindex nevrax indexation --- app.py | 10 ++++---- config.py | 2 ++ crawler/neodarznet/settings.py | 12 ---------- crawler/neodarznet/spiders/scrape.py | 9 ++++++-- crawler/nevrax/__init__.py | 0 crawler/nevrax/pipelines.py | 19 +++++++++++++++ crawler/nevrax/spiders/__init__.py | 0 crawler/nevrax/spiders/scrape.py | 45 ++++++++++++++++++++++++++++++++++++ crawler/settings.py | 12 ++++++++++ database/models.py | 6 +++++ scrapy.cfg | 4 ++-- sphinx_search.conf | 21 +++++++++++++++++ 12 files changed, 120 insertions(+), 20 deletions(-) delete mode 100644 crawler/neodarznet/settings.py create mode 100644 crawler/nevrax/__init__.py create mode 100644 crawler/nevrax/pipelines.py create mode 100644 crawler/nevrax/spiders/__init__.py create mode 100644 crawler/nevrax/spiders/scrape.py create mode 100644 crawler/settings.py diff --git a/app.py b/app.py index 31f338d..66ca6c4 100644 --- a/app.py +++ b/app.py @@ -1,14 +1,13 @@ import scrapy import sys from scrapy.crawler import CrawlerProcess +from scrapy import spiderloader from scrapy.utils.project import get_project_settings from flask import Flask, request, jsonify import json from sphinx import sphinx -from crawler.neodarznet.spiders.scrape import ScrapSpider - from database.models import Page, db import config @@ -34,8 +33,11 @@ def search(): def crawl(): try: db.create_tables(Page.__subclasses__()) - process = CrawlerProcess(get_project_settings()) - process.crawl(ScrapSpider) + settings = get_project_settings() + process = CrawlerProcess(settings) + spiders = spiderloader.SpiderLoader.from_settings(settings) + for spider in spiders.list(): + process.crawl(spider) process.start() except Exception as e: print(e) diff --git a/config.py b/config.py index f00b52f..3e1ddbc 100644 --- a/config.py +++ b/config.py @@ -10,3 +10,5 @@ DB_PASS = "root" SPHINX_HOST = '127.0.0.1' SPHINX_PORT = 9312 + +NEVRAX_URL = "127.0.0.1:8000" diff --git a/crawler/neodarznet/settings.py b/crawler/neodarznet/settings.py deleted file mode 100644 index 2e5f184..0000000 --- a/crawler/neodarznet/settings.py +++ /dev/null @@ -1,12 +0,0 @@ -# -*- coding: utf-8 -*- - -BOT_NAME = 'neodarznet' - -SPIDER_MODULES = ['crawler.neodarznet.spiders'] -NEWSPIDER_MODULE = 'crawler.neodarznet.spiders' - -ROBOTSTXT_OBEY = True - -DEPTH_LIMIT = 0 - -ITEM_PIPELINES = {'crawler.neodarznet.pipelines.NeodarznetPipeline': 0} diff --git a/crawler/neodarznet/spiders/scrape.py b/crawler/neodarznet/spiders/scrape.py index a32a3e4..e16ede2 100644 --- a/crawler/neodarznet/spiders/scrape.py +++ b/crawler/neodarznet/spiders/scrape.py @@ -4,8 +4,13 @@ from scrapy.spiders import CrawlSpider, Rule from scrapy.linkextractors import LinkExtractor from scrapy import Selector -class ScrapSpider(CrawlSpider): - name = "scrape" +class NeodarznetSpider(CrawlSpider): + name = "neodarznet" + custom_settings = { + 'ITEM_PIPELINES': { + 'crawler.neodarznet.pipelines.NeodarznetPipeline': 0 + } + } allow_domains = ['neodarz.net'] start_urls = [ 'https://neodarz.net/', diff --git a/crawler/nevrax/__init__.py b/crawler/nevrax/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/crawler/nevrax/pipelines.py b/crawler/nevrax/pipelines.py new file mode 100644 index 0000000..775d5df --- /dev/null +++ b/crawler/nevrax/pipelines.py @@ -0,0 +1,19 @@ +# -*- coding: utf-8 -*- +import logging + +from database.models import Nevrax + + +class NevraxPipeline(object): + + def process_item(self, item, spider): + try: + page = Nevrax.get(Nevrax.url == item['url']) + q = Nevrax.update(**item).where(Nevrax.url == item['url']) + q.execute() + logging.info("Update item {}".format(page)) + except Nevrax.DoesNotExist: + page = Nevrax.create(**item) + logging.info("Create item {}".format(page)) + logging.info('Item {} stored in db'.format(page)) + return item diff --git a/crawler/nevrax/spiders/__init__.py b/crawler/nevrax/spiders/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/crawler/nevrax/spiders/scrape.py b/crawler/nevrax/spiders/scrape.py new file mode 100644 index 0000000..8a7b8ec --- /dev/null +++ b/crawler/nevrax/spiders/scrape.py @@ -0,0 +1,45 @@ +# -*- coding: utf-8 -*- +import scrapy +from scrapy.spiders import CrawlSpider, Rule +from scrapy.linkextractors import LinkExtractor +from scrapy import Selector + +import config + +class NevraxSpider(CrawlSpider): + name = "nevrax" + custom_settings = { + 'ITEM_PIPELINES': { + 'crawler.nevrax.pipelines.NevraxPipeline': 0 + } + } + allow_domains = [config.NEVRAX_URL] + start_urls = [ + 'http://'+config.NEVRAX_URL+'/', + ] + + rules = [ + Rule( + LinkExtractor( + canonicalize=True, + unique=True, + allow_domains=config.NEVRAX_URL, + #deny=".*\.neodarz\.net.*" + ), + follow=True, + callback="parse_items" + ) + ] + + + def start_requests(self): + for url in self.start_urls: + yield scrapy.Request(url, callback=self.parse, dont_filter=True) + + def parse_items(self, response): + sel = Selector(response) + yield { + 'url': response.url, + 'title': response.css('title::text').extract_first(), + 'content': ''.join(sel.select("//body//text()").extract()).strip() + } diff --git a/crawler/settings.py b/crawler/settings.py new file mode 100644 index 0000000..2ccdc11 --- /dev/null +++ b/crawler/settings.py @@ -0,0 +1,12 @@ +# -*- coding: utf-8 -*- + +BOT_NAME = 'khanindex' + +SPIDER_MODULES = ['crawler.nevrax.spiders', 'crawler.neodarznet.spiders'] +#NEWSPIDER_MODULE = 'crawler.nevrax.spiders' + +ROBOTSTXT_OBEY = True + +DEPTH_LIMIT = 0 + +#ITEM_PIPELINES = {'crawler.nevrax.pipelines.NevraxPipeline': 0, 'crawler.neodarznet.pipelines.NeodarznetPipeline': 0} diff --git a/database/models.py b/database/models.py index cc9b58e..213fb48 100644 --- a/database/models.py +++ b/database/models.py @@ -20,3 +20,9 @@ class Neodarznet(Page): Page du site neodarz.net """ pass + +class Nevrax(Page): + """ + Page of website nevrax + """ + pass diff --git a/scrapy.cfg b/scrapy.cfg index 22162ef..98f6f15 100644 --- a/scrapy.cfg +++ b/scrapy.cfg @@ -1,5 +1,5 @@ [settings] -default = crawler.neodarznet.settings +default = crawler.settings [deploy] -project = crawler.neodarznet +project = crawler diff --git a/sphinx_search.conf b/sphinx_search.conf index 63d8d0f..7740000 100644 --- a/sphinx_search.conf +++ b/sphinx_search.conf @@ -14,11 +14,32 @@ source neodarznet { } +source nevrax { + type = pgsql + + sql_host = 127.0.0.1 + sql_user = root + sql_pass = root + sql_db = khanindexer + + sql_query = SELECT id, url, title, content FROM nevrax + + sql_field_string = url + sql_field_string = title + sql_field_string = content + +} + index neodarznet { source = neodarznet path = /tmp/data/neodarznet } +index nevrax { + source = nevrax + path = /tmp/data/nevrax +} + indexer { mem_limit = 32M } -- cgit v1.2.1