about | summary | refs | log | tree | commit | diff
path: root/crawler/nevrax/pipelines.py
diff options
context:
space:
mode:
author neodarz <neodarz@neodarz.net> 2019-02-03 20:02:18 +0100
committer neodarz <neodarz@neodarz.net> 2019-02-03 20:02:18 +0100
commit ad6212da067fdc05a8564e79943692fd9d466110 (patch)
tree 7d83c4d90e7b0bb0260cf6dce39ea132761ebea8 /crawler/nevrax/pipelines.py
parent c66ea291fe937a9b1321f5b3d417669b2aafa43b (diff)
download khanindexer-ad6212da067fdc05a8564e79943692fd9d466110.tar.xz
khanindexer-ad6212da067fdc05a8564e79943692fd9d466110.zip
Add ability to update a page in db if updated
Diffstat (limited to 'crawler/nevrax/pipelines.py')
-rw-r--r-- crawler/nevrax/pipelines.py 5
1 files changed, 3 insertions, 2 deletions
diff --git a/crawler/nevrax/pipelines.py b/crawler/nevrax/pipelines.py
index 775d5df..dbbb782 100644
--- a/crawler/nevrax/pipelines.py
+++ b/crawler/nevrax/pipelines.py
@@ -9,8 +9,9 @@ class NevraxPipeline(object):
def process_item(self, item, spider):
try:
page = Nevrax.get(Nevrax.url == item['url'])
- q = Nevrax.update(**item).where(Nevrax.url == item['url'])
- q.execute()
+ if page.content_length != item['content_length']:
+ q = Nevrax.update(**item).where(Nevrax.url == item['url'])
+ q.execute()
logging.info("Update item {}".format(page))
except Nevrax.DoesNotExist:
page = Nevrax.create(**item)