about | summary | refs | log | tree | commit | diff
path: root/app.py
diff options
context:
space:
mode:
author: neodarz <neodarz@neodarz.net> 2019-02-22 00:00:19 +0100
committer: neodarz <neodarz@neodarz.net> 2019-02-22 00:00:19 +0100
commit: e12eb8bd99c1b6695873632e0540fa07379671d4 (patch)
tree: 967ea553732aab7d7709da4e5ff5c6d8a72aa182 /app.py
parent: 582cfba635143c3fd00a914acc4c8e65587e5a21 (diff)
download: khanindexer-e12eb8bd99c1b6695873632e0540fa07379671d4.tar.xz
          khanindexer-e12eb8bd99c1b6695873632e0540fa07379671d4.zip
Add a way to specify a crawl to launch
Diffstat (limited to 'app.py')
-rw-r--r-- app.py 25
1 files changed, 21 insertions, 4 deletions
diff --git a/app.py b/app.py
index ad44a14..9c1c302 100644
--- a/app.py
+++ b/app.py
@@ -6,15 +6,27 @@ from scrapy.utils.project import get_project_settings
from database.models import Page, db
-def crawl():
+def crawl(the_spider=""):
try:
db.create_tables(Page.__subclasses__())
settings = get_project_settings()
process = CrawlerProcess(settings)
spiders = spiderloader.SpiderLoader.from_settings(settings)
- for spider in spiders.list():
- if "crawl" in spider:
- process.crawl(spider)
+
+ if the_spider == "":
+ for spider in spiders.list():
+ if "crawl" in spider:
+ process.crawl(spider)
+ elif the_spider in spiders.list():
+ process.crawl(the_spider)
+ else:
+ print("`"+the_spider+"` is not a valid spider.")
+ print('Valid spider are: ')
+ for spider in spiders.list():
+ if "crawl" in spider:
+ print(" "+spider)
+ sys.exit(1)
+
process.start()
except Exception as e:
print(e)
@@ -44,5 +56,10 @@ if __name__ == '__main__':
update()
else:
show_help()
+ elif len(sys.argv) == 3:
+ if sys.argv[1] == "crawl":
+ crawl(sys.argv[2])
+ else:
+ show_help()
else:
show_help()