diff options
-rw-r--r-- | .gitignore | 3 | ||||
-rw-r--r-- | README.md | 20 | ||||
-rw-r--r-- | app.py | 48 | ||||
-rw-r--r-- | config.py | 3 | ||||
-rw-r--r-- | cron.conf | 3 |
5 files changed, 72 insertions, 5 deletions
@@ -87,3 +87,6 @@ ENV/ # Rope project settings .ropeproject + +# Project jobs list +jobs.db @@ -95,6 +95,26 @@ content modified, you can use: python app.py update ``` +# Cron + +If you want to use cron to start tasks, you can use it. +But there is a cron-like function in this app which does nothing more than cron +for the moment. To start it, just use the following command: + +``` +python app.py cron +``` + +All the configuration is in the file `cron.conf` and the syntax is the same +as [UNIX cron](https://apscheduler.readthedocs.io/en/latest/modules/triggers/cron.html?highlight=cron), +but there are some project-specific details; check out the first line of the +file, which is a comment describing its structure. + +Note 1: There is an id column; make sure all ids are different, otherwise the last +task erases the previous one with the same id. + +Note 2: This is only implemented for the crawl function at the moment. + # Enjoy To start searching, send a `POST` request with the manticoresearch json API, @@ -1,17 +1,27 @@ import scrapy import sys -from scrapy.crawler import CrawlerProcess +import os +import time +from scrapy.crawler import CrawlerProcess, CrawlerRunner from scrapy import spiderloader from scrapy.utils.project import get_project_settings +from crochet import setup +from apscheduler.schedulers.background import BackgroundScheduler +from apscheduler.triggers import cron from database.models import Page, db +import config -def crawl(the_spider=""): +def crawl(the_spider="", cron=False): try: db.create_tables(Page.__subclasses__()) settings = get_project_settings() - process = CrawlerProcess(settings) spiders = spiderloader.SpiderLoader.from_settings(settings) + if cron: + setup() + process = CrawlerRunner(settings) + else: + process = CrawlerProcess(settings) if the_spider == "": for spider in spiders.list(): @@ -27,9 +37,10 @@ def crawl(the_spider=""): print(" "+spider) sys.exit(1) - process.start() + if not cron: + process.start() except Exception as e: - 
print(e) + print("Error:"+str(e)) def update(): try: @@ -43,6 +54,31 @@ def update(): except Exception as e: print(e) +def cron_scheduling(): + try: + db.create_tables(Page.__subclasses__()) + + scheduler = BackgroundScheduler() + scheduler.add_jobstore('sqlalchemy', url=config.CRON_DB) + variables = ['minute', 'hour', 'month', 'day_of_week', 'crawler', 'id'] + with open(config.CRON_CONF) as f: + for line in f: + if not line.startswith('#'): + datas = list(filter(lambda x: x != "", line.rstrip().split(" "))) + cron_table = dict(zip(variables, datas)) + scheduler.add_job(crawl, 'cron', args=[cron_table['crawler'], 'cron'], minute=cron_table['minute'], hour=cron_table['hour'], month=cron_table['month'], day_of_week=cron_table['day_of_week'], id=cron_table['id'], replace_existing=True) + print('Press Ctrl+{0} to exit'.format('Break' if os.name == 'nt' else 'C')) + + scheduler.start() + + while True: + time.sleep(1) + + except Exception as e: + print(e) + + + def show_help(): print("Launch all crawler => "+str(sys.argv[0])+" crawl") print("Update all page already crawlerd => "+str(sys.argv[0])+" update") @@ -54,6 +90,8 @@ if __name__ == '__main__': crawl() elif sys.argv[1] == "update": update() + elif sys.argv[1] == "cron": + cron_scheduling() else: show_help() elif len(sys.argv) == 3: @@ -12,3 +12,6 @@ SPHINX_HOST = '127.0.0.1' SPHINX_PORT = 9312 NEVRAX_URL = "127.0.0.1:8000" + +CRON_DB = "sqlite:///jobs.db" +CRON_CONF = "cron.conf" diff --git a/cron.conf b/cron.conf new file mode 100644 index 0000000..5681f82 --- /dev/null +++ b/cron.conf @@ -0,0 +1,3 @@ +#minute hour month day of week crawler id +00 01 * 0 neodarznet_crawler 1 +02 01 * 0 nevrax_crawler 2 |