Add date when pages are crawled

author: neodarz <neodarz@neodarz.net> 2019-02-06 00:02:31 +0100
committer: neodarz <neodarz@neodarz.net> 2019-02-06 00:02:31 +0100
commit: 74dbf4defed8ae348a327e4674d917b3dd869713 (patch)
tree: 36cf3c0f45b443616eccf4473e6c02e15e6a2347 /crawler/neodarznet/spiders
parent: ad6212da067fdc05a8564e79943692fd9d466110 (diff)
download: khanindexer-74dbf4defed8ae348a327e4674d917b3dd869713.tar.xz
khanindexer-74dbf4defed8ae348a327e4674d917b3dd869713.zip
1 files changed, 4 insertions, 1 deletions
diff --git a/crawler/neodarznet/spiders/scrape.py b/crawler/neodarznet/spiders/scrape.py
index 0f54c4a..bd97067 100644
--- a/crawler/neodarznet/spiders/scrape.py
+++ b/crawler/neodarznet/spiders/scrape.py
@@ -4,6 +4,8 @@ from scrapy.spiders import CrawlSpider, Rule
 from scrapy.linkextractors import LinkExtractor
 from scrapy import Selector
 
+import datetime
+
 class NeodarznetSpider(CrawlSpider):
     name = "neodarznet"
     custom_settings = {
@@ -40,5 +42,6 @@ class NeodarznetSpider(CrawlSpider):
                 'url': response.url,
                 'title': response.css('title::text').extract_first(),
                 'content': ''.join(sel.select("//div[@class='bodya']//text()").extract()).strip(),
-                'content_length': len(response.body)
+                'content_length': len(response.body),
+                'date_updated': datetime.datetime.now()
         }
author	neodarz <neodarz@neodarz.net>	2019-02-06 00:02:31 +0100
committer	neodarz <neodarz@neodarz.net>	2019-02-06 00:02:31 +0100
commit	74dbf4defed8ae348a327e4674d917b3dd869713 (patch)
tree	36cf3c0f45b443616eccf4473e6c02e15e6a2347 /crawler/neodarznet/spiders
parent	ad6212da067fdc05a8564e79943692fd9d466110 (diff)
download	khanindexer-74dbf4defed8ae348a327e4674d917b3dd869713.tar.xz khanindexer-74dbf4defed8ae348a327e4674d917b3dd869713.zip