about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- README.md 6
-rw-r--r-- app.py 2
-rw-r--r-- crawler/neodarznet/pipelines.py 10
-rw-r--r-- database/models.py 6
-rw-r--r-- sphinx_search.conf 2
5 files changed, 16 insertions, 10 deletions
diff --git a/README.md b/README.md
index bd4967d..b27c907 100644
--- a/README.md
+++ b/README.md
@@ -62,9 +62,9 @@ his parent folder).
Example with the configuration for the indexer `datas`:
```
-index datas {
- source = datas
- path = /tmp/data/datas
+index neodarznet {
+ source = neodarznet
+ path = /tmp/data/neodarznet
}
```
Here the folder is `/tmp/data/`
diff --git a/app.py b/app.py
index 933a140..31f338d 100644
--- a/app.py
+++ b/app.py
@@ -33,7 +33,7 @@ def search():
def crawl():
try:
- db.create_tables([Page])
+ db.create_tables(Page.__subclasses__())
process = CrawlerProcess(get_project_settings())
process.crawl(ScrapSpider)
process.start()
diff --git a/crawler/neodarznet/pipelines.py b/crawler/neodarznet/pipelines.py
index 6703796..fbfebbb 100644
--- a/crawler/neodarznet/pipelines.py
+++ b/crawler/neodarznet/pipelines.py
@@ -1,19 +1,19 @@
# -*- coding: utf-8 -*-
import logging
-from database.models import Page
+from database.models import Neodarznet
class NeodarznetPipeline(object):
def process_item(self, item, spider):
try:
- page = Page.get(Page.url == item['url'])
- q = Page.update(**item).where(Page.url == item['url'])
+ page = Neodarznet.get(Neodarznet.url == item['url'])
+ q = Neodarznet.update(**item).where(Neodarznet.url == item['url'])
q.execute()
logging.info("Update item {}".format(page))
- except Page.DoesNotExist:
- page = Page.create(**item)
+ except Neodarznet.DoesNotExist:
+ page = Neodarznet.create(**item)
logging.info("Create item {}".format(page))
logging.info('Item {} stored in db'.format(page))
return item
diff --git a/database/models.py b/database/models.py
index 2f9528a..cc9b58e 100644
--- a/database/models.py
+++ b/database/models.py
@@ -14,3 +14,9 @@ class Page(Model):
class Meta:
database = db
+
+class Neodarznet(Page):
+ """
+ Page du site neodarz.net
+ """
+ pass
diff --git a/sphinx_search.conf b/sphinx_search.conf
index 6e8f214..63d8d0f 100644
--- a/sphinx_search.conf
+++ b/sphinx_search.conf
@@ -6,7 +6,7 @@ source neodarznet {
sql_pass = root
sql_db = khanindexer
- sql_query = SELECT id, url, title, content FROM page
+ sql_query = SELECT id, url, title, content FROM neodarznet
sql_field_string = url
sql_field_string = title