diff options
author | neodarz <neodarz@neodarz.net> | 2019-01-13 11:22:16 +0100 |
---|---|---|
committer | neodarz <neodarz@neodarz.net> | 2019-01-13 11:22:16 +0100 |
commit | f0b4712c897ee35f2d79cf0408f480c2c0bb41da (patch) | |
tree | d180494b88350861cfef3ecfddb655aca4363086 /crawler | |
parent | c01eb0e3220924c24aabc3ccd282f89f9ed9cb3e (diff) | |
download | khanindexer-f0b4712c897ee35f2d79cf0408f480c2c0bb41da.tar.xz khanindexer-f0b4712c897ee35f2d79cf0408f480c2c0bb41da.zip |
Save all crawled data in the database
Diffstat (limited to 'crawler')
-rw-r--r-- | crawler/neodarznet/pipelines.py | 15 | ||||
-rw-r--r-- | crawler/neodarznet/settings.py | 2 |
2 files changed, 16 insertions, 1 deletions
diff --git a/crawler/neodarznet/pipelines.py b/crawler/neodarznet/pipelines.py index 71e7865..6703796 100644 --- a/crawler/neodarznet/pipelines.py +++ b/crawler/neodarznet/pipelines.py @@ -1,6 +1,19 @@ # -*- coding: utf-8 -*- +import logging + +from database.models import Page class NeodarznetPipeline(object): - def process_time(self, item, spider): + + def process_item(self, item, spider): + try: + page = Page.get(Page.url == item['url']) + q = Page.update(**item).where(Page.url == item['url']) + q.execute() + logging.info("Update item {}".format(page)) + except Page.DoesNotExist: + page = Page.create(**item) + logging.info("Create item {}".format(page)) + logging.info('Item {} stored in db'.format(page)) return item diff --git a/crawler/neodarznet/settings.py b/crawler/neodarznet/settings.py index 8d65b09..2e5f184 100644 --- a/crawler/neodarznet/settings.py +++ b/crawler/neodarznet/settings.py @@ -8,3 +8,5 @@ NEWSPIDER_MODULE = 'crawler.neodarznet.spiders' ROBOTSTXT_OBEY = True DEPTH_LIMIT = 0 + +ITEM_PIPELINES = {'crawler.neodarznet.pipelines.NeodarznetPipeline': 0} |