import sys

from scrapy.crawler import CrawlerProcess
from scrapy import spiderloader
from scrapy.utils.project import get_project_settings

from database.models import Page, db


def crawl(the_spider=""):
    """Run every "crawl" spider, or only the single named spider."""
    try:
        # Create one table per Page subclass before crawling.
        db.create_tables(Page.__subclasses__())
        settings = get_project_settings()
        process = CrawlerProcess(settings)
        spiders = spiderloader.SpiderLoader.from_settings(settings)
        if the_spider == "":
            # No spider given: schedule every spider whose name contains "crawl".
            for spider in spiders.list():
                if "crawl" in spider:
                    process.crawl(spider)
        elif the_spider in spiders.list():
            process.crawl(the_spider)
        else:
            print("`" + the_spider + "` is not a valid spider.")
            print("Valid spiders are:")
            for spider in spiders.list():
                if "crawl" in spider:
                    print("  " + spider)
            sys.exit(1)
        process.start()
    except Exception as e:
        print(e)


def update():
    """Re-run every "update" spider to refresh pages already crawled."""
    try:
        settings = get_project_settings()
        process = CrawlerProcess(settings)
        spiders = spiderloader.SpiderLoader.from_settings(settings)
        for spider in spiders.list():
            if "update" in spider:
                process.crawl(spider)
        process.start()
    except Exception as e:
        print(e)


def show_help():
    print("Launch all crawlers          => " + sys.argv[0] + " crawl")
    print("Update pages already crawled => " + sys.argv[0] + " update")


if __name__ == '__main__':
    if len(sys.argv) == 2:
        if sys.argv[1] == "crawl":
            crawl()
        elif sys.argv[1] == "update":
            update()
        else:
            show_help()
    elif len(sys.argv) == 3:
        if sys.argv[1] == "crawl":
            crawl(sys.argv[2])
        else:
            show_help()
    else:
        show_help()
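# Usage sketch (assumptions: this script sits at the project root next to
# scrapy.cfg so get_project_settings() can locate the Scrapy project, and
# database/models.py defines a peewee-style `db` plus Page subclasses; the
# file name `run.py` below is illustrative, not fixed by this script):
#
#   python run.py crawl              # run every spider whose name contains "crawl"
#   python run.py crawl my_spider    # run only the spider named "my_spider"
#   python run.py update             # run every spider whose name contains "update"
#
# Note that CrawlerProcess.start() blocks until all scheduled spiders finish,
# so `crawl` and `update` cannot be invoked in the same process run.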