From e12eb8bd99c1b6695873632e0540fa07379671d4 Mon Sep 17 00:00:00 2001 From: neodarz Date: Fri, 22 Feb 2019 00:00:19 +0100 Subject: Add a way to specify a crawl to launch --- README.md | 7 +++++++ app.py | 25 +++++++++++++++++++++---- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index c5fcd6a..34ce041 100644 --- a/README.md +++ b/README.md @@ -49,6 +49,13 @@ For launch all the crawler use the following command: python app.py crawl ``` +You can also specify a spider to crawl, for example `nevrax_crawler` with the +command: + +``` +python app.py crawl nevrax_crawler +``` + # Indexing Before lauch indexing or searching command you must verifiy that the folder of diff --git a/app.py b/app.py index ad44a14..9c1c302 100644 --- a/app.py +++ b/app.py @@ -6,15 +6,27 @@ from scrapy.utils.project import get_project_settings from database.models import Page, db -def crawl(): +def crawl(the_spider=""): try: db.create_tables(Page.__subclasses__()) settings = get_project_settings() process = CrawlerProcess(settings) spiders = spiderloader.SpiderLoader.from_settings(settings) - for spider in spiders.list(): - if "crawl" in spider: - process.crawl(spider) + + if the_spider == "": + for spider in spiders.list(): + if "crawl" in spider: + process.crawl(spider) + elif the_spider in spiders.list(): + process.crawl(the_spider) + else: + print("`"+the_spider+"` is not a valid spider.") + print('Valid spider are: ') + for spider in spiders.list(): + if "crawl" in spider: + print(" "+spider) + sys.exit(1) + process.start() except Exception as e: print(e) @@ -44,5 +56,10 @@ if __name__ == '__main__': update() else: show_help() + elif len(sys.argv) == 3: + if sys.argv[1] == "crawl": + crawl(sys.argv[2]) + else: + show_help() else: show_help() -- cgit v1.2.1