about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: neodarz <neodarz@neodarz.net> 2019-02-06 00:02:31 +0100
committer: neodarz <neodarz@neodarz.net> 2019-02-06 00:02:31 +0100
commit: 74dbf4defed8ae348a327e4674d917b3dd869713 (patch)
tree: 36cf3c0f45b443616eccf4473e6c02e15e6a2347
parent: ad6212da067fdc05a8564e79943692fd9d466110 (diff)
download: khanindexer-74dbf4defed8ae348a327e4674d917b3dd869713.tar.xz
download: khanindexer-74dbf4defed8ae348a327e4674d917b3dd869713.zip
Add date when pages are crawled
-rw-r--r-- crawler/neodarznet/spiders/scrape.py 5
-rw-r--r-- crawler/nevrax/spiders/scrape.py 4
-rw-r--r-- database/models.py 5
3 files changed, 11 insertions, 3 deletions
diff --git a/crawler/neodarznet/spiders/scrape.py b/crawler/neodarznet/spiders/scrape.py
index 0f54c4a..bd97067 100644
--- a/crawler/neodarznet/spiders/scrape.py
+++ b/crawler/neodarznet/spiders/scrape.py
@@ -4,6 +4,8 @@ from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy import Selector
+import datetime
+
class NeodarznetSpider(CrawlSpider):
name = "neodarznet"
custom_settings = {
@@ -40,5 +42,6 @@ class NeodarznetSpider(CrawlSpider):
'url': response.url,
'title': response.css('title::text').extract_first(),
'content': ''.join(sel.select("//div[@class='bodya']//text()").extract()).strip(),
- 'content_length': len(response.body)
+ 'content_length': len(response.body),
+ 'date_updated': datetime.datetime.now()
}
diff --git a/crawler/nevrax/spiders/scrape.py b/crawler/nevrax/spiders/scrape.py
index 785ec3f..d27aecf 100644
--- a/crawler/nevrax/spiders/scrape.py
+++ b/crawler/nevrax/spiders/scrape.py
@@ -5,6 +5,7 @@ from scrapy.linkextractors import LinkExtractor
from scrapy import Selector
import config
+import datetime
class NevraxSpider(CrawlSpider):
name = "nevrax"
@@ -42,5 +43,6 @@ class NevraxSpider(CrawlSpider):
'url': response.url,
'title': response.css('title::text').extract_first(),
'content': ''.join(sel.select("//body//text()").extract()).strip(),
- 'content_length': len(response.body)
+ 'content_length': len(response.body),
+ 'date_updated': datetime.datetime.now()
}
diff --git a/database/models.py b/database/models.py
index a4c3f65..af7b6f5 100644
--- a/database/models.py
+++ b/database/models.py
@@ -1,7 +1,9 @@
-from peewee import Model, CharField, TextField, IntegerField, PostgresqlDatabase
+from peewee import Model, CharField, TextField, IntegerField, DateTimeField, PostgresqlDatabase
import config
+import datetime
+
db = PostgresqlDatabase(config.DB, host=config.DB_HOST, port=config.DB_PORT, user=config.DB_USER, password=config.DB_PASS, autocommit=True, autorollback=True)
class Page(Model):
@@ -12,6 +14,7 @@ class Page(Model):
title = CharField(null=True)
content = TextField(null=True)
content_length = IntegerField()
+ date_updated = DateTimeField(default=datetime.datetime.now)
class Meta:
database = db