From 5e448f52252df2e92565ece488c9dffd49e082d2 Mon Sep 17 00:00:00 2001
From: neodarz
Date: Sun, 24 Feb 2019 17:10:50 +0100
Subject: Add first implementation of a cron-like function

---
 .gitignore |  3 +++
 README.md  | 20 ++++++++++++++++++++
 app.py     | 48 +++++++++++++++++++++++++++++++++++++++++++-----
 config.py  |  3 +++
 cron.conf  |  3 +++
 5 files changed, 72 insertions(+), 5 deletions(-)
 create mode 100644 cron.conf

diff --git a/.gitignore b/.gitignore
index 72364f9..5a56814 100644
--- a/.gitignore
+++ b/.gitignore
@@ -87,3 +87,6 @@ ENV/
 
 # Rope project settings
 .ropeproject
+
+# Project jobs list
+jobs.db
diff --git a/README.md b/README.md
index 34ce041..7c803d3 100644
--- a/README.md
+++ b/README.md
@@ -95,6 +95,26 @@ content modified, you can use:
 python app.py update
 ```
 
+# Cron
+
+If you want to use cron to start tasks, you can.
+But this app also provides a cron-like function which, for the moment, does
+nothing more than cron. To start it, just use the following command:
+
+```
+python app.py cron
+```
+
+All the configuration is in the file `cron.conf` and the syntax is the same
+as [UNIX cron](https://apscheduler.readthedocs.io/en/latest/modules/triggers/cron.html?highlight=cron),
+but there are some project-specific details; check out the first line of the
+file, which is a comment describing its structure.
+
+Note 1: There is an id column; make sure all ids are different, otherwise the
+last task erases the previous one with the same id.
+
+Note 2: This is only implemented for the crawl function at the moment.
+
 # Enjoy
 
 For start searching send a `POST` request with the manticoresearch json API,
diff --git a/app.py b/app.py
index 9c1c302..744b697 100644
--- a/app.py
+++ b/app.py
@@ -1,17 +1,27 @@
 import scrapy
 import sys
-from scrapy.crawler import CrawlerProcess
+import os
+import time
+from scrapy.crawler import CrawlerProcess, CrawlerRunner
 from scrapy import spiderloader
 from scrapy.utils.project import get_project_settings
+from crochet import setup
+from apscheduler.schedulers.background import BackgroundScheduler
+from apscheduler.triggers import cron
 
 from database.models import Page, db
 
+import config
 
-def crawl(the_spider=""):
+def crawl(the_spider="", cron=False):
     try:
         db.create_tables(Page.__subclasses__())
         settings = get_project_settings()
-        process = CrawlerProcess(settings)
         spiders = spiderloader.SpiderLoader.from_settings(settings)
+        if cron:
+            setup()
+            process = CrawlerRunner(settings)
+        else:
+            process = CrawlerProcess(settings)
 
         if the_spider == "":
             for spider in spiders.list():
@@ -27,9 +37,10 @@ def crawl(the_spider=""):
                 print(" "+spider)
             sys.exit(1)
 
-        process.start()
+        if not cron:
+            process.start()
     except Exception as e:
-        print(e)
+        print("Error:"+str(e))
 
 def update():
     try:
@@ -43,6 +54,31 @@ def update():
     except Exception as e:
         print(e)
 
+def cron_scheduling():
+    try:
+        db.create_tables(Page.__subclasses__())
+
+        scheduler = BackgroundScheduler()
+        scheduler.add_jobstore('sqlalchemy', url=config.CRON_DB)
+        variables = ['minute', 'hour', 'month', 'day_of_week', 'crawler', 'id']
+        with open(config.CRON_CONF) as f:
+            for line in f:
+                if not line.startswith('#'):
+                    datas = list(filter(lambda x: x != "", line.rstrip().split(" ")))
+                    cron_table = dict(zip(variables, datas))
+                    scheduler.add_job(crawl, 'cron', args=[cron_table['crawler'], 'cron'], minute=cron_table['minute'], hour=cron_table['hour'], month=cron_table['month'], day_of_week=cron_table['day_of_week'], id=cron_table['id'], replace_existing=True)
+        print('Press Ctrl+{0} to exit'.format('Break'
+                                            if os.name == 'nt' else 'C'))
+
+        scheduler.start()
+
+        while True:
+            time.sleep(1)
+
+    except Exception as e:
+        print(e)
+
+
 def show_help():
     print("Launch all crawler => "+str(sys.argv[0])+" crawl")
     print("Update all page already crawlerd => "+str(sys.argv[0])+" update")
@@ -54,6 +90,8 @@ if __name__ == '__main__':
             crawl()
         elif sys.argv[1] == "update":
             update()
+        elif sys.argv[1] == "cron":
+            cron_scheduling()
         else:
             show_help()
     elif len(sys.argv) == 3:
diff --git a/config.py b/config.py
index 3e1ddbc..751875d 100644
--- a/config.py
+++ b/config.py
@@ -12,3 +12,6 @@ SPHINX_HOST = '127.0.0.1'
 SPHINX_PORT = 9312
 
 NEVRAX_URL = "127.0.0.1:8000"
+
+CRON_DB = "sqlite:///jobs.db"
+CRON_CONF = "cron.conf"
diff --git a/cron.conf b/cron.conf
new file mode 100644
index 0000000..5681f82
--- /dev/null
+++ b/cron.conf
@@ -0,0 +1,3 @@
+#minute hour month day of week crawler id
+00 01 * 0 neodarznet_crawler 1
+02 01 * 0 nevrax_crawler 2
-- 
cgit v1.2.1