about summary refs log tree commit diff
path: root/crawler
diff options
context:
space:
mode:
Diffstat (limited to 'crawler')
-rw-r--r-- crawler/neodarznet/pipelines.py 5
-rw-r--r-- crawler/neodarznet/spiders/scrape.py 3
-rw-r--r-- crawler/nevrax/pipelines.py 5
-rw-r--r-- crawler/nevrax/spiders/scrape.py 3
4 files changed, 10 insertions, 6 deletions
diff --git a/crawler/neodarznet/pipelines.py b/crawler/neodarznet/pipelines.py
index fbfebbb..1f3a9fc 100644
--- a/crawler/neodarznet/pipelines.py
+++ b/crawler/neodarznet/pipelines.py
@@ -9,8 +9,9 @@ class NeodarznetPipeline(object):
def process_item(self, item, spider):
try:
page = Neodarznet.get(Neodarznet.url == item['url'])
- q = Neodarznet.update(**item).where(Neodarznet.url == item['url'])
- q.execute()
+ if page.content_length != item['content_length']:
+ q = Neodarznet.update(**item).where(Neodarznet.url == item['url'])
+ q.execute()
logging.info("Update item {}".format(page))
except Neodarznet.DoesNotExist:
page = Neodarznet.create(**item)
diff --git a/crawler/neodarznet/spiders/scrape.py b/crawler/neodarznet/spiders/scrape.py
index e16ede2..0f54c4a 100644
--- a/crawler/neodarznet/spiders/scrape.py
+++ b/crawler/neodarznet/spiders/scrape.py
@@ -39,5 +39,6 @@ class NeodarznetSpider(CrawlSpider):
yield {
'url': response.url,
'title': response.css('title::text').extract_first(),
- 'content': ''.join(sel.select("//div[@class='bodya']//text()").extract()).strip()
+ 'content': ''.join(sel.select("//div[@class='bodya']//text()").extract()).strip(),
+ 'content_length': len(response.body)
}
diff --git a/crawler/nevrax/pipelines.py b/crawler/nevrax/pipelines.py
index 775d5df..dbbb782 100644
--- a/crawler/nevrax/pipelines.py
+++ b/crawler/nevrax/pipelines.py
@@ -9,8 +9,9 @@ class NevraxPipeline(object):
def process_item(self, item, spider):
try:
page = Nevrax.get(Nevrax.url == item['url'])
- q = Nevrax.update(**item).where(Nevrax.url == item['url'])
- q.execute()
+ if page.content_length != item['content_length']:
+ q = Nevrax.update(**item).where(Nevrax.url == item['url'])
+ q.execute()
logging.info("Update item {}".format(page))
except Nevrax.DoesNotExist:
page = Nevrax.create(**item)
diff --git a/crawler/nevrax/spiders/scrape.py b/crawler/nevrax/spiders/scrape.py
index 8a7b8ec..785ec3f 100644
--- a/crawler/nevrax/spiders/scrape.py
+++ b/crawler/nevrax/spiders/scrape.py
@@ -41,5 +41,6 @@ class NevraxSpider(CrawlSpider):
yield {
'url': response.url,
'title': response.css('title::text').extract_first(),
- 'content': ''.join(sel.select("//body//text()").extract()).strip()
+ 'content': ''.join(sel.select("//body//text()").extract()).strip(),
+ 'content_length': len(response.body)
}