From 9a8badd5dffe47813489ab0b355f5db5faa66646 Mon Sep 17 00:00:00 2001
From: neodarz
Date: Wed, 6 Feb 2019 19:15:36 +0100
Subject: Add ability to update URLs that are one week old and whose content has changed

---
 README.md                            |  8 ++++++
 app.py                               | 32 ++++++++++++++++++++---
 crawler/neodarznet/spiders/scrape.py |  2 +-
 crawler/neodarznet/spiders/update.py | 49 +++++++++++++++++++++++++++++++++++
 crawler/nevrax/spiders/scrape.py     |  2 +-
 crawler/nevrax/spiders/update.py     | 50 ++++++++++++++++++++++++++++++++++++
 6 files changed, 138 insertions(+), 5 deletions(-)
 create mode 100644 crawler/neodarznet/spiders/update.py
 create mode 100644 crawler/nevrax/spiders/update.py

diff --git a/README.md b/README.md
index 3d63946..6799aaa 100644
--- a/README.md
+++ b/README.md
@@ -80,6 +80,14 @@ Before you can make search, you must lauch the search server
 searchd -c sphinx_search.conf
 ```
 
+## Updating
+
+To update the database and re-crawl only the URLs that were last fetched more
+than one week ago and whose content has changed, run:
+```
+python app.py update
+```
+
 ## Enjoy
 
 You can now launch the server!
diff --git a/app.py b/app.py
index 431c2b0..ad44a14 100644
--- a/app.py
+++ b/app.py
@@ -6,17 +6,43 @@ from scrapy.utils.project import get_project_settings
 
 from database.models import Page, db
 
-def main():
+def crawl():
     try:
         db.create_tables(Page.__subclasses__())
         settings = get_project_settings()
         process = CrawlerProcess(settings)
         spiders = spiderloader.SpiderLoader.from_settings(settings)
         for spider in spiders.list():
-            process.crawl(spider)
+            if "crawl" in spider:
+                process.crawl(spider)
         process.start()
     except Exception as e:
         print(e)
 
+def update():
+    try:
+        settings = get_project_settings()
+        process = CrawlerProcess(settings)
+        spiders = spiderloader.SpiderLoader.from_settings(settings)
+        for spider in spiders.list():
+            if "update" in spider:
+                process.crawl(spider)
+        process.start()
+    except Exception as e:
+        print(e)
+
+def show_help():
+    print("Launch all crawlers => "+str(sys.argv[0])+" crawl")
+    print("Update all pages already crawled => "+str(sys.argv[0])+" update")
+
 if __name__ == '__main__':
-    main()
+    #main()
+    if len(sys.argv) == 2:
+        if sys.argv[1] == "crawl":
+            crawl()
+        elif sys.argv[1] == "update":
+            update()
+        else:
+            show_help()
+    else:
+        show_help()
diff --git a/crawler/neodarznet/spiders/scrape.py b/crawler/neodarznet/spiders/scrape.py
index bd97067..2d3c32b 100644
--- a/crawler/neodarznet/spiders/scrape.py
+++ b/crawler/neodarznet/spiders/scrape.py
@@ -7,7 +7,7 @@ from scrapy import Selector
 import datetime
 
 class NeodarznetSpider(CrawlSpider):
-    name = "neodarznet"
+    name = "neodarznet_crawler"
     custom_settings = {
         'ITEM_PIPELINES': {
             'crawler.neodarznet.pipelines.NeodarznetPipeline': 0
diff --git a/crawler/neodarznet/spiders/update.py b/crawler/neodarznet/spiders/update.py
new file mode 100644
index 0000000..38f1863
--- /dev/null
+++ b/crawler/neodarznet/spiders/update.py
@@ -0,0 +1,49 @@
+# -*- coding: utf-8 -*-
+import scrapy
+from scrapy.spiders import CrawlSpider, Rule
+from scrapy.linkextractors import LinkExtractor
+from scrapy import Selector
+
+import datetime
+
+from database.models import Neodarznet
+
+from dateutil.relativedelta import relativedelta
+
+import logging
+
+class NeodarznetSpider(CrawlSpider):
+    name = "neodarznet_updater"
+    custom_settings = {
+        'ITEM_PIPELINES': {
+            'crawler.neodarznet.pipelines.NeodarznetPipeline': 0
+        }
+    }
+    allowed_domains = ['neodarz.net']
+
+    datas = Neodarznet.select(Neodarznet.url).dicts()
+    datas_array = []
+    for value in datas:
+        datas_array.append(value['url'])
+    start_urls = datas_array
+
+    def start_requests(self):
+        for url in self.start_urls:
+            logging.info(url)
+            try:
+                page = Neodarznet.get(Neodarznet.url == url)
+                if page.date_updated < datetime.datetime.now() - relativedelta(weeks=1):
+                    yield scrapy.Request(url, callback=self.parse_url, dont_filter=True)
+            except Neodarznet.DoesNotExist:
+                yield scrapy.Request(url, callback=self.parse_url, dont_filter=True)
+                continue
+
+    def parse_url(self, response):
+        sel = Selector(response)
+        yield {
+            'url': response.url,
+            'title': response.css('title::text').extract_first(),
+            'content': ''.join(sel.select("//div[@class='bodya']//text()").extract()).strip(),
+            'content_length': len(response.body),
+            'date_updated': datetime.datetime.now()
+        }
diff --git a/crawler/nevrax/spiders/scrape.py b/crawler/nevrax/spiders/scrape.py
index d27aecf..c9a8a53 100644
--- a/crawler/nevrax/spiders/scrape.py
+++ b/crawler/nevrax/spiders/scrape.py
@@ -8,7 +8,7 @@ import config
 import datetime
 
 class NevraxSpider(CrawlSpider):
-    name = "nevrax"
+    name = "nevrax_crawler"
     custom_settings = {
         'ITEM_PIPELINES': {
             'crawler.nevrax.pipelines.NevraxPipeline': 0
diff --git a/crawler/nevrax/spiders/update.py b/crawler/nevrax/spiders/update.py
new file mode 100644
index 0000000..b3f7aa1
--- /dev/null
+++ b/crawler/nevrax/spiders/update.py
@@ -0,0 +1,50 @@
+# -*- coding: utf-8 -*-
+import scrapy
+from scrapy.spiders import CrawlSpider, Rule
+from scrapy.linkextractors import LinkExtractor
+from scrapy import Selector
+
+import config
+import datetime
+
+from database.models import Nevrax
+
+from dateutil.relativedelta import relativedelta
+
+import logging
+
+class NevraxSpider(CrawlSpider):
+    name = "nevrax_updater"
+    custom_settings = {
+        'ITEM_PIPELINES': {
+            'crawler.nevrax.pipelines.NevraxPipeline': 0
+        }
+    }
+    allowed_domains = [config.NEVRAX_URL]
+
+    datas = Nevrax.select(Nevrax.url).dicts()
+    datas_array = []
+    for value in datas:
+        datas_array.append(value['url'])
+    start_urls = datas_array
+
+    def start_requests(self):
+        for url in self.start_urls:
+            logging.info(url)
+            try:
+                page = Nevrax.get(Nevrax.url == url)
+                if page.date_updated < datetime.datetime.now() - relativedelta(weeks=1):
+                    yield scrapy.Request(url, callback=self.parse_url, dont_filter=True)
+            except Nevrax.DoesNotExist:
+                yield scrapy.Request(url, callback=self.parse_url, dont_filter=True)
+                continue
+
+    def parse_url(self, response):
+        sel = Selector(response)
+        yield {
+            'url': response.url,
+            'title': response.css('title::text').extract_first(),
+            'content': ''.join(sel.select("//div[@class='bodya']//text()").extract()).strip(),
+            'content_length': len(response.body),
+            'date_updated': datetime.datetime.now()
+        }
-- 
cgit v1.2.1
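
For reference, a minimal standalone sketch of the one-week freshness rule that both new update spiders apply in `start_requests` before re-requesting a URL. The `needs_recrawl` helper is hypothetical (it is not part of the patch) and assumes a `date_updated` datetime like the one the pipelines store:
```
# Illustration only: the age check used in start_requests(), pulled out
# into a hypothetical helper so the rule can be read and tested in isolation.
import datetime
from dateutil.relativedelta import relativedelta

def needs_recrawl(date_updated, now=None):
    """Return True when the page was last refreshed more than one week ago."""
    now = now or datetime.datetime.now()
    return date_updated < now - relativedelta(weeks=1)

# A page refreshed ten days ago is due again; one refreshed yesterday is not.
print(needs_recrawl(datetime.datetime.now() - datetime.timedelta(days=10)))  # True
print(needs_recrawl(datetime.datetime.now() - datetime.timedelta(days=1)))   # False
```
URLs whose database row has disappeared by the time `start_requests` runs fall into the `DoesNotExist` branch in both spiders and are requested unconditionally.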