author     neodarz <neodarz@neodarz.net>  2019-02-03 20:02:18 +0100
committer  neodarz <neodarz@neodarz.net>  2019-02-03 20:02:18 +0100
commit     ad6212da067fdc05a8564e79943692fd9d466110 (patch)
tree       7d83c4d90e7b0bb0260cf6dce39ea132761ebea8
parent     c66ea291fe937a9b1321f5b3d417669b2aafa43b (diff)
download   khanindexer-ad6212da067fdc05a8564e79943692fd9d466110.tar.xz
           khanindexer-ad6212da067fdc05a8564e79943692fd9d466110.zip
Add ability to update a page in the db if its content has changed
-rw-r--r--  crawler/neodarznet/pipelines.py       5
-rw-r--r--  crawler/neodarznet/spiders/scrape.py  3
-rw-r--r--  crawler/nevrax/pipelines.py           5
-rw-r--r--  crawler/nevrax/spiders/scrape.py      3
-rw-r--r--  database/models.py                    3
5 files changed, 12 insertions, 7 deletions
diff --git a/crawler/neodarznet/pipelines.py b/crawler/neodarznet/pipelines.py
index fbfebbb..1f3a9fc 100644
--- a/crawler/neodarznet/pipelines.py
+++ b/crawler/neodarznet/pipelines.py
@@ -9,8 +9,9 @@ class NeodarznetPipeline(object):
     def process_item(self, item, spider):
         try:
             page = Neodarznet.get(Neodarznet.url == item['url'])
-            q = Neodarznet.update(**item).where(Neodarznet.url == item['url'])
-            q.execute()
+            if page.content_length != item['content_length']:
+                q = Neodarznet.update(**item).where(Neodarznet.url == item['url'])
+                q.execute()
             logging.info("Update item {}".format(page))
         except Neodarznet.DoesNotExist:
             page = Neodarznet.create(**item)
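
The pipeline now issues the UPDATE only when the stored content_length differs from the freshly crawled value; unchanged pages are left alone. A minimal standalone sketch of the same peewee pattern (model and field names follow the code above; the in-memory SQLite database, URL and sizes are only for illustration, the project itself uses PostgreSQL):

from peewee import Model, CharField, IntegerField, SqliteDatabase

db = SqliteDatabase(':memory:')

class Page(Model):
    url = CharField()
    content_length = IntegerField()

    class Meta:
        database = db

db.create_tables([Page])
Page.create(url='https://neodarz.net/', content_length=1000)

# Simulate a re-crawl of the same URL with a different body size
item = {'url': 'https://neodarz.net/', 'content_length': 1200}
page = Page.get(Page.url == item['url'])
if page.content_length != item['content_length']:
    # Only touch the row when the page size actually changed
    Page.update(**item).where(Page.url == item['url']).execute()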
diff --git a/crawler/neodarznet/spiders/scrape.py b/crawler/neodarznet/spiders/scrape.py
index e16ede2..0f54c4a 100644
--- a/crawler/neodarznet/spiders/scrape.py
+++ b/crawler/neodarznet/spiders/scrape.py
@@ -39,5 +39,6 @@ class NeodarznetSpider(CrawlSpider):
         yield {
             'url': response.url,
             'title': response.css('title::text').extract_first(),
-            'content': ''.join(sel.select("//div[@class='bodya']//text()").extract()).strip()
+            'content': ''.join(sel.select("//div[@class='bodya']//text()").extract()).strip(),
+            'content_length': len(response.body)
         }
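
len(response.body) measures the raw HTTP payload in bytes rather than the length of the extracted text, so any change to the markup, not only to the visible content, makes the stored length differ and triggers an update. A quick illustration of the bytes-versus-characters distinction (hypothetical page content, assuming a UTF-8 response):

# In Scrapy, response.body is bytes and response.text is the decoded str;
# their lengths diverge as soon as the page contains multi-byte characters.
body = 'neodarz – accueil'.encode('utf-8')  # hypothetical body
print(len(body))                  # 19 bytes
print(len(body.decode('utf-8')))  # 17 characters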
diff --git a/crawler/nevrax/pipelines.py b/crawler/nevrax/pipelines.py
index 775d5df..dbbb782 100644
--- a/crawler/nevrax/pipelines.py
+++ b/crawler/nevrax/pipelines.py
@@ -9,8 +9,9 @@ class NevraxPipeline(object):
     def process_item(self, item, spider):
         try:
             page = Nevrax.get(Nevrax.url == item['url'])
-            q = Nevrax.update(**item).where(Nevrax.url == item['url'])
-            q.execute()
+            if page.content_length != item['content_length']:
+                q = Nevrax.update(**item).where(Nevrax.url == item['url'])
+                q.execute()
             logging.info("Update item {}".format(page))
         except Nevrax.DoesNotExist:
             page = Nevrax.create(**item)
diff --git a/crawler/nevrax/spiders/scrape.py b/crawler/nevrax/spiders/scrape.py
index 8a7b8ec..785ec3f 100644
--- a/crawler/nevrax/spiders/scrape.py
+++ b/crawler/nevrax/spiders/scrape.py
@@ -41,5 +41,6 @@ class NevraxSpider(CrawlSpider):
         yield {
             'url': response.url,
             'title': response.css('title::text').extract_first(),
-            'content': ''.join(sel.select("//body//text()").extract()).strip()
+            'content': ''.join(sel.select("//body//text()").extract()).strip(),
+            'content_length': len(response.body)
         }
diff --git a/database/models.py b/database/models.py
index 3727c93..a4c3f65 100644
--- a/database/models.py
+++ b/database/models.py
@@ -1,4 +1,4 @@
-from peewee import Model, CharField, TextField, PostgresqlDatabase
+from peewee import Model, CharField, TextField, IntegerField, PostgresqlDatabase
 import config
@@ -11,6 +11,7 @@ class Page(Model):
     url = CharField()
     title = CharField(null=True)
     content = TextField(null=True)
+    content_length = IntegerField()
     class Meta:
         database = db
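
Unlike title and content, content_length is declared without null=True, so rows crawled before this commit carry no value for the new column and the existing tables need the column added before the updated pipelines can compare against it. A possible migration sketch using peewee's playhouse helpers (the import path, table name 'neodarznet' and default value are assumptions, not part of this commit; repeat for each crawler's table):

from peewee import IntegerField
from playhouse.migrate import PostgresqlMigrator, migrate

from database.models import db  # the PostgresqlDatabase handle defined above

migrator = PostgresqlMigrator(db)

# Add the column with a default of 0 so existing rows stay valid;
# the real length is stored the next time each page is crawled.
migrate(
    migrator.add_column('neodarznet', 'content_length', IntegerField(default=0)),
)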