From f0b4712c897ee35f2d79cf0408f480c2c0bb41da Mon Sep 17 00:00:00 2001 From: neodarz Date: Sun, 13 Jan 2019 11:22:16 +0100 Subject: Save all crawled data in database --- README.md | 4 +++- app.py | 11 +++++++++++ config.py | 2 +- crawler/neodarznet/pipelines.py | 15 ++++++++++++++- crawler/neodarznet/settings.py | 2 ++ database/models.py | 4 ++-- 6 files changed, 33 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 726b105..0710132 100644 --- a/README.md +++ b/README.md @@ -6,5 +6,7 @@ For now there is an example spider with neodarz website. For testing it just run: ``` -scrapy crawl scrape -o out.json +python run.py ``` + +The database is in the sqlite file `khanindexer.db` at the root of the project. diff --git a/app.py b/app.py index 2a80507..281f932 100644 --- a/app.py +++ b/app.py @@ -1,8 +1,19 @@ +import scrapy +from scrapy.crawler import CrawlerProcess +from scrapy.utils.project import get_project_settings + +from crawler.neodarznet.spiders.scrape import ScrapSpider + from database.models import Page, db import config +process = CrawlerProcess(get_project_settings()) + def main(): db.create_tables([Page]) + process.crawl(ScrapSpider) + process.start() + if __name__ == '__main__': main() diff --git a/config.py b/config.py index 3717715..58de78f 100644 --- a/config.py +++ b/config.py @@ -2,4 +2,4 @@ from os import path APP_DIR = path.dirname(__file__) -DATABASE = '%s' % path.join(APP_DIR, 'spider.db') +DATABASE = '%s' % path.join(APP_DIR, 'khanindexer.db') diff --git a/crawler/neodarznet/pipelines.py b/crawler/neodarznet/pipelines.py index 71e7865..6703796 100644 --- a/crawler/neodarznet/pipelines.py +++ b/crawler/neodarznet/pipelines.py @@ -1,6 +1,19 @@ # -*- coding: utf-8 -*- +import logging + +from database.models import Page class NeodarznetPipeline(object): - def process_time(self, item, spider): + + def process_item(self, item, spider): + try: + page = Page.get(Page.url == item['url']) + q = Page.update(**item).where(Page.url == 
item['url']) + q.execute() + logging.info("Update item {}".format(page)) + except Page.DoesNotExist: + page = Page.create(**item) + logging.info("Create item {}".format(page)) + logging.info('Item {} stored in db'.format(page)) return item diff --git a/crawler/neodarznet/settings.py b/crawler/neodarznet/settings.py index 8d65b09..2e5f184 100644 --- a/crawler/neodarznet/settings.py +++ b/crawler/neodarznet/settings.py @@ -8,3 +8,5 @@ NEWSPIDER_MODULE = 'crawler.neodarznet.spiders' ROBOTSTXT_OBEY = True DEPTH_LIMIT = 0 + +ITEM_PIPELINES = {'crawler.neodarznet.pipelines.NeodarznetPipeline': 0} diff --git a/database/models.py b/database/models.py index 4b27806..c731a08 100644 --- a/database/models.py +++ b/database/models.py @@ -10,8 +10,8 @@ class Page(Model): Page of a website """ url = CharField() - title = CharField() - content = CharField() + title = CharField(null=True) + content = CharField(null=True) class Meta: database = db -- cgit v1.2.1