aboutsummaryrefslogtreecommitdiff
path: root/crawler
diff options
context:
space:
mode:
authorneodarz <neodarz@neodarz.net>2019-02-06 19:15:36 +0100
committerneodarz <neodarz@neodarz.net>2019-02-06 19:15:36 +0100
commit9a8badd5dffe47813489ab0b355f5db5faa66646 (patch)
treebf53db13612cd7c32f2eadfe905cad83ce50a0d7 /crawler
parentf84e8fb75b8096dff5a39936ac26c933fdba3059 (diff)
downloadkhanindexer-9a8badd5dffe47813489ab0b355f5db5faa66646.tar.xz
khanindexer-9a8badd5dffe47813489ab0b355f5db5faa66646.zip
Add ability to update URLs that are more than one week old and whose content was modified
Diffstat (limited to 'crawler')
-rw-r--r--crawler/neodarznet/spiders/scrape.py2
-rw-r--r--crawler/neodarznet/spiders/update.py49
-rw-r--r--crawler/nevrax/spiders/scrape.py2
-rw-r--r--crawler/nevrax/spiders/update.py50
4 files changed, 101 insertions, 2 deletions
diff --git a/crawler/neodarznet/spiders/scrape.py b/crawler/neodarznet/spiders/scrape.py
index bd97067..2d3c32b 100644
--- a/crawler/neodarznet/spiders/scrape.py
+++ b/crawler/neodarznet/spiders/scrape.py
@@ -7,7 +7,7 @@ from scrapy import Selector
import datetime
class NeodarznetSpider(CrawlSpider):
- name = "neodarznet"
+ name = "neodarznet_crawler"
custom_settings = {
'ITEM_PIPELINES': {
'crawler.neodarznet.pipelines.NeodarznetPipeline': 0
diff --git a/crawler/neodarznet/spiders/update.py b/crawler/neodarznet/spiders/update.py
new file mode 100644
index 0000000..38f1863
--- /dev/null
+++ b/crawler/neodarznet/spiders/update.py
@@ -0,0 +1,49 @@
+# -*- coding: utf-8 -*-
+import scrapy
+from scrapy.spiders import CrawlSpider, Rule
+from scrapy.linkextractors import LinkExtractor
+from scrapy import Selector
+
+import datetime
+
+from database.models import Neodarznet
+
+from dateutil.relativedelta import *
+
+import logging
+
+class NeodarznetSpider(CrawlSpider):
+    """Re-fetch already indexed neodarz.net pages whose stored copy is stale.
+
+    Every URL stored in the ``Neodarznet`` table is a candidate. A page is
+    requested again when its ``date_updated`` is more than one week old (or
+    missing); the scraped item then goes through ``NeodarznetPipeline``.
+    """
+
+    name = "neodarznet_updater"
+    custom_settings = {
+        'ITEM_PIPELINES': {
+            'crawler.neodarznet.pipelines.NeodarznetPipeline': 0
+        }
+    }
+    # Scrapy reads ``allowed_domains``; the former ``allow_domains`` spelling
+    # is not an attribute Scrapy looks at, so it had no effect.
+    allowed_domains = ['neodarz.net']
+
+    # NOTE(review): this query runs while the class body is executed, i.e. as
+    # soon as the module is imported. Kept so ``start_urls`` stays populated
+    # on the class exactly as before.
+    start_urls = [
+        row['url'] for row in Neodarznet.select(Neodarznet.url).dicts()
+    ]
+
+    def start_requests(self):
+        """Yield a request for each known URL that is due for a refresh.
+
+        A URL is due when its row is older than one week, has no
+        ``date_updated`` value, or cannot be found in the table any more.
+        """
+        # Same instant as now + relativedelta(weeks=-1), computed once with
+        # the standard library.
+        stale_before = datetime.datetime.now() - datetime.timedelta(weeks=1)
+        for url in self.start_urls:
+            logging.info(url)
+            try:
+                page = Neodarznet.get(Neodarznet.url == url)
+            except Neodarznet.DoesNotExist:
+                # No row for this URL (e.g. removed since start_urls was
+                # built): fetch it anyway.
+                is_stale = True
+            else:
+                # A missing timestamp cannot be compared with a datetime;
+                # treat such rows as stale instead of raising TypeError.
+                is_stale = (page.date_updated is None
+                            or page.date_updated < stale_before)
+            if is_stale:
+                yield scrapy.Request(url, callback=self.parse_url,
+                                     dont_filter=True)
+
+    def parse_url(self, response):
+        """Yield the item describing one fetched page.
+
+        The item carries the page URL, its ``<title>`` text, the stripped
+        text found under ``div.bodya``, the raw body length in bytes and the
+        current timestamp as ``date_updated``.
+        """
+        # response.xpath() replaces the deprecated Selector(...).select().
+        text_nodes = response.xpath("//div[@class='bodya']//text()").extract()
+        yield {
+            'url': response.url,
+            'title': response.css('title::text').extract_first(),
+            'content': ''.join(text_nodes).strip(),
+            'content_length': len(response.body),
+            'date_updated': datetime.datetime.now()
+        }
diff --git a/crawler/nevrax/spiders/scrape.py b/crawler/nevrax/spiders/scrape.py
index d27aecf..c9a8a53 100644
--- a/crawler/nevrax/spiders/scrape.py
+++ b/crawler/nevrax/spiders/scrape.py
@@ -8,7 +8,7 @@ import config
import datetime
class NevraxSpider(CrawlSpider):
- name = "nevrax"
+ name = "nevrax_crawler"
custom_settings = {
'ITEM_PIPELINES': {
'crawler.nevrax.pipelines.NevraxPipeline': 0
diff --git a/crawler/nevrax/spiders/update.py b/crawler/nevrax/spiders/update.py
new file mode 100644
index 0000000..b3f7aa1
--- /dev/null
+++ b/crawler/nevrax/spiders/update.py
@@ -0,0 +1,50 @@
+# -*- coding: utf-8 -*-
+import scrapy
+from scrapy.spiders import CrawlSpider, Rule
+from scrapy.linkextractors import LinkExtractor
+from scrapy import Selector
+
+import config
+import datetime
+
+from database.models import Nevrax
+
+from dateutil.relativedelta import *
+
+import logging
+
+class NevraxSpider(CrawlSpider):
+    """Re-fetch already indexed Nevrax pages whose stored copy is stale.
+
+    Every URL stored in the ``Nevrax`` table is a candidate. A page is
+    requested again when its ``date_updated`` is more than one week old (or
+    missing); the scraped item then goes through ``NevraxPipeline``.
+    """
+
+    name = "nevrax_updater"
+    custom_settings = {
+        'ITEM_PIPELINES': {
+            'crawler.nevrax.pipelines.NevraxPipeline': 0
+        }
+    }
+    # Scrapy reads ``allowed_domains``; the former ``allow_domains`` spelling
+    # is not an attribute Scrapy looks at, so it had no effect.
+    # NOTE(review): Scrapy expects bare domain names here - confirm that
+    # config.NEVRAX_URL is a domain and not a full URL.
+    allowed_domains = [config.NEVRAX_URL]
+
+    # NOTE(review): this query runs while the class body is executed, i.e. as
+    # soon as the module is imported. Kept so ``start_urls`` stays populated
+    # on the class exactly as before.
+    start_urls = [
+        row['url'] for row in Nevrax.select(Nevrax.url).dicts()
+    ]
+
+    def start_requests(self):
+        """Yield a request for each known URL that is due for a refresh.
+
+        A URL is due when its row is older than one week, has no
+        ``date_updated`` value, or cannot be found in the table any more.
+        """
+        # Same instant as now + relativedelta(weeks=-1), computed once with
+        # the standard library.
+        stale_before = datetime.datetime.now() - datetime.timedelta(weeks=1)
+        for url in self.start_urls:
+            logging.info(url)
+            try:
+                page = Nevrax.get(Nevrax.url == url)
+            except Nevrax.DoesNotExist:
+                # No row for this URL (e.g. removed since start_urls was
+                # built): fetch it anyway.
+                is_stale = True
+            else:
+                # A missing timestamp cannot be compared with a datetime;
+                # treat such rows as stale instead of raising TypeError.
+                is_stale = (page.date_updated is None
+                            or page.date_updated < stale_before)
+            if is_stale:
+                yield scrapy.Request(url, callback=self.parse_url,
+                                     dont_filter=True)
+
+    def parse_url(self, response):
+        """Yield the item describing one fetched page.
+
+        The item carries the page URL, its ``<title>`` text, the stripped
+        text found under ``div.bodya``, the raw body length in bytes and the
+        current timestamp as ``date_updated``.
+        """
+        # response.xpath() replaces the deprecated Selector(...).select().
+        text_nodes = response.xpath("//div[@class='bodya']//text()").extract()
+        yield {
+            'url': response.url,
+            'title': response.css('title::text').extract_first(),
+            'content': ''.join(text_nodes).strip(),
+            'content_length': len(response.body),
+            'date_updated': datetime.datetime.now()
+        }