From ad6212da067fdc05a8564e79943692fd9d466110 Mon Sep 17 00:00:00 2001 From: neodarz Date: Sun, 3 Feb 2019 20:02:18 +0100 Subject: Add ability to update a page in db if updated --- crawler/neodarznet/pipelines.py | 5 +++-- crawler/neodarznet/spiders/scrape.py | 3 ++- crawler/nevrax/pipelines.py | 5 +++-- crawler/nevrax/spiders/scrape.py | 3 ++- database/models.py | 3 ++- 5 files changed, 12 insertions(+), 7 deletions(-) diff --git a/crawler/neodarznet/pipelines.py b/crawler/neodarznet/pipelines.py index fbfebbb..1f3a9fc 100644 --- a/crawler/neodarznet/pipelines.py +++ b/crawler/neodarznet/pipelines.py @@ -9,8 +9,9 @@ class NeodarznetPipeline(object): def process_item(self, item, spider): try: page = Neodarznet.get(Neodarznet.url == item['url']) - q = Neodarznet.update(**item).where(Neodarznet.url == item['url']) - q.execute() + if page.content_length != item['content_length']: + q = Neodarznet.update(**item).where(Neodarznet.url == item['url']) + q.execute() logging.info("Update item {}".format(page)) except Neodarznet.DoesNotExist: page = Neodarznet.create(**item) diff --git a/crawler/neodarznet/spiders/scrape.py b/crawler/neodarznet/spiders/scrape.py index e16ede2..0f54c4a 100644 --- a/crawler/neodarznet/spiders/scrape.py +++ b/crawler/neodarznet/spiders/scrape.py @@ -39,5 +39,6 @@ class NeodarznetSpider(CrawlSpider): yield { 'url': response.url, 'title': response.css('title::text').extract_first(), - 'content': ''.join(sel.select("//div[@class='bodya']//text()").extract()).strip() + 'content': ''.join(sel.select("//div[@class='bodya']//text()").extract()).strip(), + 'content_length': len(response.body) } diff --git a/crawler/nevrax/pipelines.py b/crawler/nevrax/pipelines.py index 775d5df..dbbb782 100644 --- a/crawler/nevrax/pipelines.py +++ b/crawler/nevrax/pipelines.py @@ -9,8 +9,9 @@ class NevraxPipeline(object): def process_item(self, item, spider): try: page = Nevrax.get(Nevrax.url == item['url']) - q = Nevrax.update(**item).where(Nevrax.url == item['url']) - q.execute() + if page.content_length != item['content_length']: + q = Nevrax.update(**item).where(Nevrax.url == item['url']) + q.execute() logging.info("Update item {}".format(page)) except Nevrax.DoesNotExist: page = Nevrax.create(**item) diff --git a/crawler/nevrax/spiders/scrape.py b/crawler/nevrax/spiders/scrape.py index 8a7b8ec..785ec3f 100644 --- a/crawler/nevrax/spiders/scrape.py +++ b/crawler/nevrax/spiders/scrape.py @@ -41,5 +41,6 @@ class NevraxSpider(CrawlSpider): yield { 'url': response.url, 'title': response.css('title::text').extract_first(), - 'content': ''.join(sel.select("//body//text()").extract()).strip() + 'content': ''.join(sel.select("//body//text()").extract()).strip(), + 'content_length': len(response.body) } diff --git a/database/models.py b/database/models.py index 3727c93..a4c3f65 100644 --- a/database/models.py +++ b/database/models.py @@ -1,4 +1,4 @@ -from peewee import Model, CharField, TextField, PostgresqlDatabase +from peewee import Model, CharField, TextField, IntegerField, PostgresqlDatabase import config @@ -11,6 +11,7 @@ class Page(Model): url = CharField() title = CharField(null=True) content = TextField(null=True) + content_length = IntegerField() class Meta: database = db -- cgit v1.2.1