aboutsummaryrefslogtreecommitdiff
path: root/crawler/neodarznet/spiders
diff options
context:
space:
mode:
author: neodarz <neodarz@neodarz.net> 2019-02-03 20:02:18 +0100
committer: neodarz <neodarz@neodarz.net> 2019-02-03 20:02:18 +0100
commit: ad6212da067fdc05a8564e79943692fd9d466110 (patch)
tree: 7d83c4d90e7b0bb0260cf6dce39ea132761ebea8 /crawler/neodarznet/spiders
parent: c66ea291fe937a9b1321f5b3d417669b2aafa43b (diff)
download: khanindexer-ad6212da067fdc05a8564e79943692fd9d466110.tar.xz
khanindexer-ad6212da067fdc05a8564e79943692fd9d466110.zip
Add ability to update a page in db if updated
Diffstat (limited to '')
-rw-r--r-- crawler/neodarznet/spiders/scrape.py | 3
1 files changed, 2 insertions, 1 deletions
diff --git a/crawler/neodarznet/spiders/scrape.py b/crawler/neodarznet/spiders/scrape.py
index e16ede2..0f54c4a 100644
--- a/crawler/neodarznet/spiders/scrape.py
+++ b/crawler/neodarznet/spiders/scrape.py
@@ -39,5 +39,6 @@ class NeodarznetSpider(CrawlSpider):
yield {
'url': response.url,
'title': response.css('title::text').extract_first(),
- 'content': ''.join(sel.select("//div[@class='bodya']//text()").extract()).strip()
+ 'content': ''.join(sel.select("//div[@class='bodya']//text()").extract()).strip(),
+ 'content_length': len(response.body)
}