From f0b4712c897ee35f2d79cf0408f480c2c0bb41da Mon Sep 17 00:00:00 2001
From: neodarz <neodarz@neodarz.net>
Date: Sun, 13 Jan 2019 11:22:16 +0100
Subject: Save all crawled data in database

---
 README.md                       |  4 +++-
 app.py                          | 11 +++++++++++
 config.py                       |  2 +-
 crawler/neodarznet/pipelines.py | 15 ++++++++++++++-
 crawler/neodarznet/settings.py  |  2 ++
 database/models.py              |  4 ++--
 6 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 726b105..0710132 100644
--- a/README.md
+++ b/README.md
@@ -6,5 +6,7 @@ For now there is an example spider with neodarz website.
 For testing it just run:
 
 ```
-scrapy crawl scrape -o out.json
+python app.py
 ```
+
+The database is in the sqlite file `khanindexer.db` at the root of the project.
diff --git a/app.py b/app.py
index 2a80507..281f932 100644
--- a/app.py
+++ b/app.py
@@ -1,8 +1,19 @@
+import scrapy
+from scrapy.crawler import CrawlerProcess
+from scrapy.utils.project import get_project_settings
+
+from crawler.neodarznet.spiders.scrape import ScrapSpider
+
 from database.models import Page, db
 import config
 
+process = CrawlerProcess(get_project_settings())
+
 def main():
     db.create_tables([Page])
 
+    process.crawl(ScrapSpider)  # queue the spider on the module-level process
+    process.start()  # run the crawl (per Scrapy docs, blocks until finished)
+
 if __name__ == '__main__':
     main()
diff --git a/config.py b/config.py
index 3717715..58de78f 100644
--- a/config.py
+++ b/config.py
@@ -2,4 +2,4 @@ from os import path
 
 APP_DIR = path.dirname(__file__)
 
-DATABASE = '%s' % path.join(APP_DIR, 'spider.db')
+DATABASE = '%s' % path.join(APP_DIR, 'khanindexer.db')
diff --git a/crawler/neodarznet/pipelines.py b/crawler/neodarznet/pipelines.py
index 71e7865..6703796 100644
--- a/crawler/neodarznet/pipelines.py
+++ b/crawler/neodarznet/pipelines.py
@@ -1,6 +1,19 @@
 # -*- coding: utf-8 -*-
+import logging
+
+from database.models import Page
 
 
 class NeodarznetPipeline(object):
-    def process_time(self, item, spider):
+    """Scrapy item pipeline that saves each crawled item as a Page row."""
+    def process_item(self, item, spider):
+        try:  # update path: a Page with this URL is already stored
+            page = Page.get(Page.url == item['url'])
+            q = Page.update(**item).where(Page.url == item['url'])
+            q.execute()
+            logging.info("Update item {}".format(page))
+        except Page.DoesNotExist:  # insert path: URL not stored yet
+            page = Page.create(**item)
+            logging.info("Create item {}".format(page))
+        logging.info('Item {} stored in db'.format(page))
         return item
diff --git a/crawler/neodarznet/settings.py b/crawler/neodarznet/settings.py
index 8d65b09..2e5f184 100644
--- a/crawler/neodarznet/settings.py
+++ b/crawler/neodarznet/settings.py
@@ -8,3 +8,5 @@ NEWSPIDER_MODULE = 'crawler.neodarznet.spiders'
 ROBOTSTXT_OBEY = True
 
 DEPTH_LIMIT = 0
+
+ITEM_PIPELINES = {'crawler.neodarznet.pipelines.NeodarznetPipeline': 0}
diff --git a/database/models.py b/database/models.py
index 4b27806..c731a08 100644
--- a/database/models.py
+++ b/database/models.py
@@ -10,8 +10,8 @@ class Page(Model):
     Page of a website
     """
     url = CharField()
-    title = CharField()
-    content = CharField()
+    title = CharField(null=True)
+    content = CharField(null=True)
 
     class Meta:
         database = db
-- 
cgit v1.2.1