diff options
author | neodarz <neodarz@neodarz.net> | 2019-02-24 17:10:50 +0100 |
---|---|---|
committer | neodarz <neodarz@neodarz.net> | 2019-02-24 17:10:50 +0100 |
commit | 5e448f52252df2e92565ece488c9dffd49e082d2 (patch) | |
tree | a0d850a8ff255429b3b55039a37e51fa7a0e2c6b /app.py | |
parent | e12eb8bd99c1b6695873632e0540fa07379671d4 (diff) | |
download | khanindexer-5e448f52252df2e92565ece488c9dffd49e082d2.tar.xz khanindexer-5e448f52252df2e92565ece488c9dffd49e082d2.zip |
Diffstat (limited to '')
-rw-r--r-- | app.py | 48 |
1 files changed, 43 insertions, 5 deletions
```diff
@@ -1,17 +1,27 @@
 import scrapy
 import sys
-from scrapy.crawler import CrawlerProcess
+import os
+import time
+from scrapy.crawler import CrawlerProcess, CrawlerRunner
 from scrapy import spiderloader
 from scrapy.utils.project import get_project_settings
+from crochet import setup
+from apscheduler.schedulers.background import BackgroundScheduler
+from apscheduler.triggers import cron
 
 from database.models import Page, db
+import config
 
-def crawl(the_spider=""):
+def crawl(the_spider="", cron=False):
     try:
         db.create_tables(Page.__subclasses__())
         settings = get_project_settings()
-        process = CrawlerProcess(settings)
         spiders = spiderloader.SpiderLoader.from_settings(settings)
+        if cron:
+            setup()
+            process = CrawlerRunner(settings)
+        else:
+            process = CrawlerProcess(settings)
 
         if the_spider == "":
             for spider in spiders.list():
@@ -27,9 +37,10 @@ def crawl(the_spider=""):
                     print(" "+spider)
                 sys.exit(1)
 
-        process.start()
+        if not cron:
+            process.start()
     except Exception as e:
-        print(e)
+        print("Error:"+str(e))
 
 def update():
     try:
@@ -43,6 +54,31 @@ def update():
     except Exception as e:
         print(e)
 
+def cron_scheduling():
+    try:
+        db.create_tables(Page.__subclasses__())
+
+        scheduler = BackgroundScheduler()
+        scheduler.add_jobstore('sqlalchemy', url=config.CRON_DB)
+        variables = ['minute', 'hour', 'month', 'day_of_week', 'crawler', 'id']
+        with open(config.CRON_CONF) as f:
+            for line in f:
+                if not line.startswith('#'):
+                    datas = list(filter(lambda x: x != "", line.rstrip().split(" ")))
+                    cron_table = dict(zip(variables, datas))
+                    scheduler.add_job(crawl, 'cron', args=[cron_table['crawler'], 'cron'], minute=cron_table['minute'], hour=cron_table['hour'], month=cron_table['month'], day_of_week=cron_table['day_of_week'], id=cron_table['id'], replace_existing=True)
+        print('Press Ctrl+{0} to exit'.format('Break' if os.name == 'nt' else 'C'))
+
+        scheduler.start()
+
+        while True:
+            time.sleep(1)
+
+    except Exception as e:
+        print(e)
+
+
+
 def show_help():
     print("Launch all crawler => "+str(sys.argv[0])+" crawl")
     print("Update all page already crawlerd => "+str(sys.argv[0])+" update")
@@ -54,6 +90,8 @@ if __name__ == '__main__':
             crawl()
         elif sys.argv[1] == "update":
             update()
+        elif sys.argv[1] == "cron":
+            cron_scheduling()
         else:
             show_help()
     elif len(sys.argv) == 3:
```