diff options
author | neodarz <neodarz@neodarz.net> | 2019-02-03 20:02:18 +0100 |
---|---|---|
committer | neodarz <neodarz@neodarz.net> | 2019-02-03 20:02:18 +0100 |
commit | ad6212da067fdc05a8564e79943692fd9d466110 (patch) | |
tree | 7d83c4d90e7b0bb0260cf6dce39ea132761ebea8 /crawler/neodarznet | |
parent | c66ea291fe937a9b1321f5b3d417669b2aafa43b (diff) | |
download | khanindexer-ad6212da067fdc05a8564e79943692fd9d466110.tar.xz khanindexer-ad6212da067fdc05a8564e79943692fd9d466110.zip |
Add ability to update a page in db if updated
Diffstat (limited to 'crawler/neodarznet')
-rw-r--r-- | crawler/neodarznet/pipelines.py | 5 | ||||
-rw-r--r-- | crawler/neodarznet/spiders/scrape.py | 3 |
2 files changed, 5 insertions, 3 deletions
```diff
diff --git a/crawler/neodarznet/pipelines.py b/crawler/neodarznet/pipelines.py
index fbfebbb..1f3a9fc 100644
--- a/crawler/neodarznet/pipelines.py
+++ b/crawler/neodarznet/pipelines.py
@@ -9,8 +9,9 @@ class NeodarznetPipeline(object):
     def process_item(self, item, spider):
         try:
             page = Neodarznet.get(Neodarznet.url == item['url'])
-            q = Neodarznet.update(**item).where(Neodarznet.url == item['url'])
-            q.execute()
+            if page.content_length != item['content_length']:
+                q = Neodarznet.update(**item).where(Neodarznet.url == item['url'])
+                q.execute()
             logging.info("Update item {}".format(page))
         except Neodarznet.DoesNotExist:
             page = Neodarznet.create(**item)
diff --git a/crawler/neodarznet/spiders/scrape.py b/crawler/neodarznet/spiders/scrape.py
index e16ede2..0f54c4a 100644
--- a/crawler/neodarznet/spiders/scrape.py
+++ b/crawler/neodarznet/spiders/scrape.py
@@ -39,5 +39,6 @@ class NeodarznetSpider(CrawlSpider):
         yield {
             'url': response.url,
             'title': response.css('title::text').extract_first(),
-            'content': ''.join(sel.select("//div[@class='bodya']//text()").extract()).strip()
+            'content': ''.join(sel.select("//div[@class='bodya']//text()").extract()).strip(),
+            'content_length': len(response.body)
         }
```