diff options
-rw-r--r-- | README.md | 6 | ||||
-rw-r--r-- | app.py | 2 | ||||
-rw-r--r-- | crawler/neodarznet/pipelines.py | 10 | ||||
-rw-r--r-- | database/models.py | 6 | ||||
-rw-r--r-- | sphinx_search.conf | 2 |
5 files changed, 16 insertions, 10 deletions
@@ -62,9 +62,9 @@ his parent folder). Example with the configuration for the indexer `datas`: ``` -index datas { - source = datas - path = /tmp/data/datas +index neodarznet { + source = neodarznet + path = /tmp/data/neodarznet } ``` Here the folder is `/tmp/data/` @@ -33,7 +33,7 @@ def search(): def crawl(): try: - db.create_tables([Page]) + db.create_tables(Page.__subclasses__()) process = CrawlerProcess(get_project_settings()) process.crawl(ScrapSpider) process.start() diff --git a/crawler/neodarznet/pipelines.py b/crawler/neodarznet/pipelines.py index 6703796..fbfebbb 100644 --- a/crawler/neodarznet/pipelines.py +++ b/crawler/neodarznet/pipelines.py @@ -1,19 +1,19 @@ # -*- coding: utf-8 -*- import logging -from database.models import Page +from database.models import Neodarznet class NeodarznetPipeline(object): def process_item(self, item, spider): try: - page = Page.get(Page.url == item['url']) - q = Page.update(**item).where(Page.url == item['url']) + page = Neodarznet.get(Neodarznet.url == item['url']) + q = Neodarznet.update(**item).where(Neodarznet.url == item['url']) q.execute() logging.info("Update item {}".format(page)) - except Page.DoesNotExist: - page = Page.create(**item) + except Neodarznet.DoesNotExist: + page = Neodarznet.create(**item) logging.info("Create item {}".format(page)) logging.info('Item {} stored in db'.format(page)) return item diff --git a/database/models.py b/database/models.py index 2f9528a..cc9b58e 100644 --- a/database/models.py +++ b/database/models.py @@ -14,3 +14,9 @@ class Page(Model): class Meta: database = db + +class Neodarznet(Page): + """ + Page du site neodarz.net + """ + pass diff --git a/sphinx_search.conf b/sphinx_search.conf index 6e8f214..63d8d0f 100644 --- a/sphinx_search.conf +++ b/sphinx_search.conf @@ -6,7 +6,7 @@ source neodarznet { sql_pass = root sql_db = khanindexer - sql_query = SELECT id, url, title, content FROM page + sql_query = SELECT id, url, title, content FROM neodarznet sql_field_string = url sql_field_string = title |