aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorneodarz <neodarz@neodarz.net>2019-02-22 00:00:19 +0100
committerneodarz <neodarz@neodarz.net>2019-02-22 00:00:19 +0100
commite12eb8bd99c1b6695873632e0540fa07379671d4 (patch)
tree967ea553732aab7d7709da4e5ff5c6d8a72aa182
parent582cfba635143c3fd00a914acc4c8e65587e5a21 (diff)
downloadkhanindexer-e12eb8bd99c1b6695873632e0540fa07379671d4.tar.xz
khanindexer-e12eb8bd99c1b6695873632e0540fa07379671d4.zip
Add a way to specify a crawl to launch
-rw-r--r--README.md7
-rw-r--r--app.py25
2 files changed, 28 insertions, 4 deletions
diff --git a/README.md b/README.md
index c5fcd6a..34ce041 100644
--- a/README.md
+++ b/README.md
@@ -49,6 +49,13 @@ For launch all the crawler use the following command:
python app.py crawl
```
+You can also specify a single spider to crawl, for example `nevrax_crawler`, with the
+command:
+
+```
+python app.py crawl nevrax_crawler
+```
+
# Indexing
Before launching an indexing or searching command you must verify that the folder of
diff --git a/app.py b/app.py
index ad44a14..9c1c302 100644
--- a/app.py
+++ b/app.py
@@ -6,15 +6,27 @@ from scrapy.utils.project import get_project_settings
from database.models import Page, db
-def crawl():
+def crawl(the_spider=""):
try:
db.create_tables(Page.__subclasses__())
settings = get_project_settings()
process = CrawlerProcess(settings)
spiders = spiderloader.SpiderLoader.from_settings(settings)
- for spider in spiders.list():
- if "crawl" in spider:
- process.crawl(spider)
+
+ if the_spider == "":
+ for spider in spiders.list():
+ if "crawl" in spider:
+ process.crawl(spider)
+ elif the_spider in spiders.list():
+ process.crawl(the_spider)
+ else:
+ print("`"+the_spider+"` is not a valid spider.")
+ print('Valid spider are: ')
+ for spider in spiders.list():
+ if "crawl" in spider:
+ print(" "+spider)
+ sys.exit(1)
+
process.start()
except Exception as e:
print(e)
@@ -44,5 +56,10 @@ if __name__ == '__main__':
update()
else:
show_help()
+ elif len(sys.argv) == 3:
+ if sys.argv[1] == "crawl":
+ crawl(sys.argv[2])
+ else:
+ show_help()
else:
show_help()