Add khanindex nevrax indexation

author: neodarz <neodarz@neodarz.net> 2019-01-19 00:02:57 +0100
committer: neodarz <neodarz@neodarz.net> 2019-01-19 00:02:57 +0100
commit: a3f01580faf6caee4abcc8e682567b87380857b9 (patch)
tree: dbf5e0a9866b3aac7e7bb64f2eab8c005e1c28cd
parent: 073e919ef198a04da1e5ed28a7dfbc5d9681fc14 (diff)
download: khanindexer-a3f01580faf6caee4abcc8e682567b87380857b9.tar.xz khanindexer-a3f01580faf6caee4abcc8e682567b87380857b9.zip
12 files changed, 120 insertions, 20 deletions
diff --git a/app.py b/app.py
index 31f338d..66ca6c4 100644
--- a/app.py
+++ b/app.py
@@ -1,14 +1,13 @@
 import scrapy
 import sys
 from scrapy.crawler import CrawlerProcess
+from scrapy import spiderloader
 from scrapy.utils.project import get_project_settings
 from flask import Flask, request, jsonify
 import json
 
 from sphinx import sphinx
 
-from crawler.neodarznet.spiders.scrape import ScrapSpider
-
 from database.models import Page, db
 import config
 
@@ -34,8 +33,11 @@ def search():
 def crawl():
     try:
         db.create_tables(Page.__subclasses__())
-        process = CrawlerProcess(get_project_settings())
-        process.crawl(ScrapSpider)
+        settings = get_project_settings()
+        process = CrawlerProcess(settings)
+        spiders = spiderloader.SpiderLoader.from_settings(settings)
+        for spider in spiders.list():
+            process.crawl(spider)
         process.start()
     except Exception as e:
         print(e)
diff --git a/config.py b/config.py
index f00b52f..3e1ddbc 100644
--- a/config.py
+++ b/config.py
@@ -10,3 +10,5 @@ DB_PASS = "root"
 
 SPHINX_HOST = '127.0.0.1'
 SPHINX_PORT = 9312
+
+NEVRAX_URL = "127.0.0.1:8000"
diff --git a/crawler/neodarznet/settings.py b/crawler/neodarznet/settings.py
deleted file mode 100644
index 2e5f184..0000000
--- a/crawler/neodarznet/settings.py
+++ /dev/null
@@ -1,12 +0,0 @@
-# -*- coding: utf-8 -*-
-
-BOT_NAME = 'neodarznet'
-
-SPIDER_MODULES = ['crawler.neodarznet.spiders']
-NEWSPIDER_MODULE = 'crawler.neodarznet.spiders'
-
-ROBOTSTXT_OBEY = True
-
-DEPTH_LIMIT = 0
-
-ITEM_PIPELINES = {'crawler.neodarznet.pipelines.NeodarznetPipeline': 0}
diff --git a/crawler/neodarznet/spiders/scrape.py b/crawler/neodarznet/spiders/scrape.py
index a32a3e4..e16ede2 100644
--- a/crawler/neodarznet/spiders/scrape.py
+++ b/crawler/neodarznet/spiders/scrape.py
@@ -4,8 +4,13 @@ from scrapy.spiders import CrawlSpider, Rule
 from scrapy.linkextractors import LinkExtractor
 from scrapy import Selector
 
-class ScrapSpider(CrawlSpider):
-    name = "scrape"
+class NeodarznetSpider(CrawlSpider):
+    name = "neodarznet"
+    custom_settings = {
+        'ITEM_PIPELINES': {
+            'crawler.neodarznet.pipelines.NeodarznetPipeline': 0
+        }
+    }
     allow_domains = ['neodarz.net']
     start_urls = [
         'https://neodarz.net/',
diff --git a/crawler/nevrax/__init__.py b/crawler/nevrax/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/crawler/nevrax/__init__.py
diff --git a/crawler/nevrax/pipelines.py b/crawler/nevrax/pipelines.py
new file mode 100644
index 0000000..775d5df
--- /dev/null
+++ b/crawler/nevrax/pipelines.py
@@ -0,0 +1,19 @@
+# -*- coding: utf-8 -*-
+import logging
+
+from database.models import Nevrax
+
+
+class NevraxPipeline(object):
+
+    def process_item(self, item, spider):
+        try:
+            page = Nevrax.get(Nevrax.url == item['url'])
+            q = Nevrax.update(**item).where(Nevrax.url == item['url'])
+            q.execute()
+            logging.info("Update item {}".format(page))
+        except Nevrax.DoesNotExist:
+            page = Nevrax.create(**item)
+            logging.info("Create item {}".format(page))
+        logging.info('Item {} stored in db'.format(page))
+        return item
diff --git a/crawler/nevrax/spiders/__init__.py b/crawler/nevrax/spiders/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/crawler/nevrax/spiders/__init__.py
diff --git a/crawler/nevrax/spiders/scrape.py b/crawler/nevrax/spiders/scrape.py
new file mode 100644
index 0000000..8a7b8ec
--- /dev/null
+++ b/crawler/nevrax/spiders/scrape.py
@@ -0,0 +1,45 @@
+# -*- coding: utf-8 -*-
+import scrapy
+from scrapy.spiders import CrawlSpider, Rule
+from scrapy.linkextractors import LinkExtractor
+from scrapy import Selector
+
+import config
+
+class NevraxSpider(CrawlSpider):
+    name = "nevrax"
+    custom_settings = {
+        'ITEM_PIPELINES': {
+            'crawler.nevrax.pipelines.NevraxPipeline': 0
+        }
+    }
+    allow_domains = [config.NEVRAX_URL]
+    start_urls = [
+        'http://'+config.NEVRAX_URL+'/',
+    ]
+
+    rules = [
+            Rule(
+                LinkExtractor(
+                    canonicalize=True,
+                    unique=True,
+                    allow_domains=config.NEVRAX_URL,
+                    #deny=".*\.neodarz\.net.*"
+                ),
+                follow=True,
+                callback="parse_items"
+            )
+    ]
+
+
+    def start_requests(self):
+        for url in self.start_urls:
+            yield scrapy.Request(url, callback=self.parse, dont_filter=True)
+
+    def parse_items(self, response):
+        sel = Selector(response)
+        yield {
+                'url': response.url,
+                'title': response.css('title::text').extract_first(),
+                'content': ''.join(sel.select("//body//text()").extract()).strip()
+        }
diff --git a/crawler/settings.py b/crawler/settings.py
new file mode 100644
index 0000000..2ccdc11
--- /dev/null
+++ b/crawler/settings.py
@@ -0,0 +1,12 @@
+# -*- coding: utf-8 -*-
+
+BOT_NAME = 'khanindex'
+
+SPIDER_MODULES = ['crawler.nevrax.spiders', 'crawler.neodarznet.spiders']
+#NEWSPIDER_MODULE = 'crawler.nevrax.spiders'
+
+ROBOTSTXT_OBEY = True
+
+DEPTH_LIMIT = 0
+
+#ITEM_PIPELINES = {'crawler.nevrax.pipelines.NevraxPipeline': 0, 'crawler.neodarznet.pipelines.NeodarznetPipeline': 0}
diff --git a/database/models.py b/database/models.py
index cc9b58e..213fb48 100644
--- a/database/models.py
+++ b/database/models.py
@@ -20,3 +20,9 @@ class Neodarznet(Page):
     Page du site neodarz.net
     """
     pass
+
+class Nevrax(Page):
+    """
+    Page of website nevrax
+    """
+    pass
diff --git a/scrapy.cfg b/scrapy.cfg
index 22162ef..98f6f15 100644
--- a/scrapy.cfg
+++ b/scrapy.cfg
@@ -1,5 +1,5 @@
 [settings]
-default = crawler.neodarznet.settings
+default = crawler.settings
 
 [deploy]
-project = crawler.neodarznet
+project = crawler
diff --git a/sphinx_search.conf b/sphinx_search.conf
index 63d8d0f..7740000 100644
--- a/sphinx_search.conf
+++ b/sphinx_search.conf
@@ -14,11 +14,32 @@ source neodarznet {
 
 }
 
+source nevrax {
+    type = pgsql
+
+    sql_host = 127.0.0.1
+    sql_user = root
+    sql_pass = root
+    sql_db = khanindexer
+
+    sql_query = SELECT id, url, title, content FROM nevrax
+
+    sql_field_string = url
+    sql_field_string = title
+    sql_field_string = content
+
+}
+
 index neodarznet {
     source = neodarznet
     path = /tmp/data/neodarznet
 }
 
+index nevrax {
+    source = nevrax
+    path = /tmp/data/nevrax
+}
+
 indexer {
     mem_limit = 32M
 }
author	neodarz <neodarz@neodarz.net>	2019-01-19 00:02:57 +0100
committer	neodarz <neodarz@neodarz.net>	2019-01-19 00:02:57 +0100
commit	a3f01580faf6caee4abcc8e682567b87380857b9 (patch)
tree	dbf5e0a9866b3aac7e7bb64f2eab8c005e1c28cd
parent	073e919ef198a04da1e5ed28a7dfbc5d9681fc14 (diff)
download	khanindexer-a3f01580faf6caee4abcc8e682567b87380857b9.tar.xz khanindexer-a3f01580faf6caee4abcc8e682567b87380857b9.zip