about | summary | refs | log | tree | commit | diff
path: root/crawler
diff options
context:
space:
mode:
author neodarz <neodarz@neodarz.net> 2019-01-13 11:22:16 +0100
committer neodarz <neodarz@neodarz.net> 2019-01-13 11:22:16 +0100
commit f0b4712c897ee35f2d79cf0408f480c2c0bb41da (patch)
tree d180494b88350861cfef3ecfddb655aca4363086 /crawler
parent c01eb0e3220924c24aabc3ccd282f89f9ed9cb3e (diff)
download khanindexer-f0b4712c897ee35f2d79cf0408f480c2c0bb41da.tar.xz
khanindexer-f0b4712c897ee35f2d79cf0408f480c2c0bb41da.zip
Save all crawled datas in database
Diffstat (limited to 'crawler')
-rw-r--r-- crawler/neodarznet/pipelines.py 15
-rw-r--r-- crawler/neodarznet/settings.py 2
2 files changed, 16 insertions, 1 deletions
diff --git a/crawler/neodarznet/pipelines.py b/crawler/neodarznet/pipelines.py
index 71e7865..6703796 100644
--- a/crawler/neodarznet/pipelines.py
+++ b/crawler/neodarznet/pipelines.py
@@ -1,6 +1,19 @@
# -*- coding: utf-8 -*-
+import logging
+
+from database.models import Page
class NeodarznetPipeline(object):
- def process_time(self, item, spider):
+
+ def process_item(self, item, spider):
+ try:
+ page = Page.get(Page.url == item['url'])
+ q = Page.update(**item).where(Page.url == item['url'])
+ q.execute()
+ logging.info("Update item {}".format(page))
+ except Page.DoesNotExist:
+ page = Page.create(**item)
+ logging.info("Create item {}".format(page))
+ logging.info('Item {} stored in db'.format(page))
return item
diff --git a/crawler/neodarznet/settings.py b/crawler/neodarznet/settings.py
index 8d65b09..2e5f184 100644
--- a/crawler/neodarznet/settings.py
+++ b/crawler/neodarznet/settings.py
@@ -8,3 +8,5 @@ NEWSPIDER_MODULE = 'crawler.neodarznet.spiders'
ROBOTSTXT_OBEY = True
DEPTH_LIMIT = 0
+
+ITEM_PIPELINES = {'crawler.neodarznet.pipelines.NeodarznetPipeline': 0}