From 073e919ef198a04da1e5ed28a7dfbc5d9681fc14 Mon Sep 17 00:00:00 2001 From: neodarz Date: Thu, 17 Jan 2019 22:39:56 +0100 Subject: Be more specific on index source database --- README.md | 6 +++--- app.py | 2 +- crawler/neodarznet/pipelines.py | 10 +++++----- database/models.py | 6 ++++++ sphinx_search.conf | 2 +- 5 files changed, 16 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index bd4967d..b27c907 100644 --- a/README.md +++ b/README.md @@ -62,9 +62,9 @@ his parent folder). Example with the configuration for the indexer `datas`: ``` -index datas { - source = datas - path = /tmp/data/datas +index neodarznet { + source = neodarznet + path = /tmp/data/neodarznet } ``` Here the folder is `/tmp/data/` diff --git a/app.py b/app.py index 933a140..31f338d 100644 --- a/app.py +++ b/app.py @@ -33,7 +33,7 @@ def search(): def crawl(): try: - db.create_tables([Page]) + db.create_tables(Page.__subclasses__()) process = CrawlerProcess(get_project_settings()) process.crawl(ScrapSpider) process.start() diff --git a/crawler/neodarznet/pipelines.py b/crawler/neodarznet/pipelines.py index 6703796..fbfebbb 100644 --- a/crawler/neodarznet/pipelines.py +++ b/crawler/neodarznet/pipelines.py @@ -1,19 +1,19 @@ # -*- coding: utf-8 -*- import logging -from database.models import Page +from database.models import Neodarznet class NeodarznetPipeline(object): def process_item(self, item, spider): try: - page = Page.get(Page.url == item['url']) - q = Page.update(**item).where(Page.url == item['url']) + page = Neodarznet.get(Neodarznet.url == item['url']) + q = Neodarznet.update(**item).where(Neodarznet.url == item['url']) q.execute() logging.info("Update item {}".format(page)) - except Page.DoesNotExist: - page = Page.create(**item) + except Neodarznet.DoesNotExist: + page = Neodarznet.create(**item) logging.info("Create item {}".format(page)) logging.info('Item {} stored in db'.format(page)) return item diff --git a/database/models.py b/database/models.py index 2f9528a..cc9b58e 100644 --- a/database/models.py +++ b/database/models.py @@ -14,3 +14,9 @@ class Page(Model): class Meta: database = db + +class Neodarznet(Page): + """ + Page du site neodarz.net + """ + pass diff --git a/sphinx_search.conf b/sphinx_search.conf index 6e8f214..63d8d0f 100644 --- a/sphinx_search.conf +++ b/sphinx_search.conf @@ -6,7 +6,7 @@ source neodarznet { sql_pass = root sql_db = khanindexer - sql_query = SELECT id, url, title, content FROM page + sql_query = SELECT id, url, title, content FROM neodarznet sql_field_string = url sql_field_string = title -- cgit v1.2.1