"""Command-line launcher for the project's Scrapy spiders.

Sub-commands (first command-line argument):

    crawl [spider]  run every spider whose name contains "crawl", or only
                    the named spider
    update          run every spider whose name contains "update"
    cron            schedule crawls from the file named by config.CRON_CONF
"""
import os
import sys
import time

import scrapy
from scrapy.crawler import CrawlerProcess, CrawlerRunner
from scrapy import spiderloader
from scrapy.utils.project import get_project_settings
from crochet import setup
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers import cron

from database.models import Page, db
import config

# Column order of one whitespace-separated entry in the cron configuration
# file read by cron_scheduling().
_CRON_FIELDS = ('minute', 'hour', 'month', 'day_of_week', 'crawler', 'id')


def crawl(the_spider="", cron=False):
    """Run one spider, or every spider whose name contains "crawl".

    Args:
        the_spider: Name of the spider to run. The empty string (default)
            runs every spider whose name contains "crawl".
        cron: When truthy, crochet's ``setup()`` is called, the crawl is
            queued on a ``CrawlerRunner`` and this function returns without
            starting anything itself. When falsy, a ``CrawlerProcess`` is
            used and ``process.start()`` is called before returning.

    An unknown spider name prints the list of valid "crawl" spiders and
    exits with status 1. Any other exception is printed, not propagated.
    """
    try:
        db.create_tables(Page.__subclasses__())
        settings = get_project_settings()
        loader = spiderloader.SpiderLoader.from_settings(settings)
        # Ask the loader once; the list is reused for selection, validation
        # and the error message below.
        spider_names = loader.list()
        crawl_spiders = [name for name in spider_names if "crawl" in name]

        if cron:
            # NOTE(review): presumably crochet runs the Twisted reactor in a
            # background thread so the scheduler job is not blocked; confirm.
            setup()
            process = CrawlerRunner(settings)
        else:
            process = CrawlerProcess(settings)

        if the_spider == "":
            for name in crawl_spiders:
                process.crawl(name)
        elif the_spider in spider_names:
            process.crawl(the_spider)
        else:
            print("`"+the_spider+"` is not a valid spider.")
            print('Valid spider are: ')
            for name in crawl_spiders:
                print(" "+name)
            # SystemExit is not an Exception subclass, so the handler below
            # does not swallow it.
            sys.exit(1)

        if not cron:
            process.start()
    except Exception as e:
        print("Error:"+str(e))


def update():
    """Run every spider whose name contains "update" in one CrawlerProcess.

    Any exception is printed, not propagated.
    """
    try:
        settings = get_project_settings()
        process = CrawlerProcess(settings)
        loader = spiderloader.SpiderLoader.from_settings(settings)
        for name in loader.list():
            if "update" in name:
                process.crawl(name)
        process.start()
    except Exception as e:
        print(e)


def _parse_cron_line(line):
    """Turn one line of the cron configuration file into a field dict.

    Args:
        line: Raw line read from the configuration file.

    Returns:
        A dict keyed by the names in ``_CRON_FIELDS``, or ``None`` when the
        line is blank or a ``#`` comment. Fields beyond the expected ones
        are ignored.

    Raises:
        ValueError: The line has fewer fields than ``_CRON_FIELDS``.
    """
    stripped = line.strip()
    if not stripped or stripped.startswith('#'):
        return None
    # split() with no argument splits on any run of whitespace, so tabs and
    # repeated spaces both separate fields.
    values = stripped.split()
    if len(values) < len(_CRON_FIELDS):
        raise ValueError("expected {0} fields ({1}), got {2}".format(
            len(_CRON_FIELDS), ' '.join(_CRON_FIELDS), len(values)))
    return dict(zip(_CRON_FIELDS, values))


def cron_scheduling():
    """Schedule crawls from config.CRON_CONF and block until interrupted.

    Each non-comment line of the file supplies, in order: minute, hour,
    month, day_of_week, crawler (spider name) and id (job id). Jobs are
    kept in the 'sqlalchemy' job store at ``config.CRON_DB`` and replace
    any existing job with the same id. Malformed lines are reported and
    skipped instead of aborting the whole schedule.

    Ctrl+C (or SystemExit) shuts the scheduler down. Any other exception is
    printed, not propagated.
    """
    try:
        db.create_tables(Page.__subclasses__())
        scheduler = BackgroundScheduler()
        scheduler.add_jobstore('sqlalchemy', url=config.CRON_DB)

        with open(config.CRON_CONF) as f:
            for line_number, line in enumerate(f, 1):
                try:
                    entry = _parse_cron_line(line)
                except ValueError as e:
                    print("Skipping line {0} of {1}: {2}".format(
                        line_number, config.CRON_CONF, e))
                    continue
                if entry is None:
                    continue
                scheduler.add_job(
                    crawl,
                    'cron',
                    # The second argument is crawl()'s `cron` flag; any
                    # truthy value selects the non-blocking mode.
                    args=[entry['crawler'], 'cron'],
                    minute=entry['minute'],
                    hour=entry['hour'],
                    month=entry['month'],
                    day_of_week=entry['day_of_week'],
                    id=entry['id'],
                    replace_existing=True,
                )

        print('Press Ctrl+{0} to exit'.format('Break' if os.name == 'nt' else 'C'))
        scheduler.start()
        try:
            # The scheduler is a BackgroundScheduler; keep the main thread
            # alive so the process does not exit.
            while True:
                time.sleep(1)
        except (KeyboardInterrupt, SystemExit):
            # Neither is an Exception subclass, so the outer handler would
            # let them escape without stopping the scheduler.
            scheduler.shutdown()
    except Exception as e:
        print(e)


def show_help():
    """Print the usage lines for the crawl and update commands."""
    print("Launch all crawler => "+str(sys.argv[0])+" crawl")
    print("Update all page already crawlerd => "+str(sys.argv[0])+" update")


def main():
    """Dispatch on the command-line arguments; show help when unrecognised."""
    arguments = sys.argv[1:]
    if len(arguments) == 1:
        command = arguments[0]
        if command == "crawl":
            crawl()
        elif command == "update":
            update()
        elif command == "cron":
            cron_scheduling()
        else:
            show_help()
    elif len(arguments) == 2 and arguments[0] == "crawl":
        crawl(arguments[1])
    else:
        show_help()


if __name__ == '__main__':
    main()