aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorneodarz <neodarz@neodarz.net>2019-01-13 11:22:16 +0100
committerneodarz <neodarz@neodarz.net>2019-01-13 11:22:16 +0100
commitf0b4712c897ee35f2d79cf0408f480c2c0bb41da (patch)
treed180494b88350861cfef3ecfddb655aca4363086
parentc01eb0e3220924c24aabc3ccd282f89f9ed9cb3e (diff)
downloadkhanindexer-f0b4712c897ee35f2d79cf0408f480c2c0bb41da.tar.xz
khanindexer-f0b4712c897ee35f2d79cf0408f480c2c0bb41da.zip
Save all crawled data in the database
-rw-r--r--README.md4
-rw-r--r--app.py11
-rw-r--r--config.py2
-rw-r--r--crawler/neodarznet/pipelines.py15
-rw-r--r--crawler/neodarznet/settings.py2
-rw-r--r--database/models.py4
6 files changed, 33 insertions, 5 deletions
diff --git a/README.md b/README.md
index 726b105..0710132 100644
--- a/README.md
+++ b/README.md
@@ -6,5 +6,7 @@ For now there is an example spider with neodarz website.
For testing it just run:
```
-scrapy crawl scrape -o out.json
+python run.py
```
+
+The database is in the sqlite file `khanindexer.db` at the root of the project.
diff --git a/app.py b/app.py
index 2a80507..281f932 100644
--- a/app.py
+++ b/app.py
@@ -1,8 +1,19 @@
+import scrapy
+from scrapy.crawler import CrawlerProcess
+from scrapy.utils.project import get_project_settings
+
+from crawler.neodarznet.spiders.scrape import ScrapSpider
+
from database.models import Page, db
import config
+process = CrawlerProcess(get_project_settings())
+
def main():
db.create_tables([Page])
+ process.crawl(ScrapSpider)
+ process.start()
+
if __name__ == '__main__':
main()
diff --git a/config.py b/config.py
index 3717715..58de78f 100644
--- a/config.py
+++ b/config.py
@@ -2,4 +2,4 @@ from os import path
APP_DIR = path.dirname(__file__)
-DATABASE = '%s' % path.join(APP_DIR, 'spider.db')
+DATABASE = '%s' % path.join(APP_DIR, 'khanindexer.db')
diff --git a/crawler/neodarznet/pipelines.py b/crawler/neodarznet/pipelines.py
index 71e7865..6703796 100644
--- a/crawler/neodarznet/pipelines.py
+++ b/crawler/neodarznet/pipelines.py
@@ -1,6 +1,19 @@
# -*- coding: utf-8 -*-
+import logging
+
+from database.models import Page
class NeodarznetPipeline(object):
- def process_time(self, item, spider):
+
+ def process_item(self, item, spider):
+ try:
+ page = Page.get(Page.url == item['url'])
+ q = Page.update(**item).where(Page.url == item['url'])
+ q.execute()
+ logging.info("Update item {}".format(page))
+ except Page.DoesNotExist:
+ page = Page.create(**item)
+ logging.info("Create item {}".format(page))
+ logging.info('Item {} stored in db'.format(page))
return item
diff --git a/crawler/neodarznet/settings.py b/crawler/neodarznet/settings.py
index 8d65b09..2e5f184 100644
--- a/crawler/neodarznet/settings.py
+++ b/crawler/neodarznet/settings.py
@@ -8,3 +8,5 @@ NEWSPIDER_MODULE = 'crawler.neodarznet.spiders'
ROBOTSTXT_OBEY = True
DEPTH_LIMIT = 0
+
+ITEM_PIPELINES = {'crawler.neodarznet.pipelines.NeodarznetPipeline': 0}
diff --git a/database/models.py b/database/models.py
index 4b27806..c731a08 100644
--- a/database/models.py
+++ b/database/models.py
@@ -10,8 +10,8 @@ class Page(Model):
Page of a website
"""
url = CharField()
- title = CharField()
- content = CharField()
+ title = CharField(null=True)
+ content = CharField(null=True)
class Meta:
database = db