author    neodarz <neodarz@neodarz.net>    2019-02-06 19:15:36 +0100
committer neodarz <neodarz@neodarz.net>    2019-02-06 19:15:36 +0100
commit    9a8badd5dffe47813489ab0b355f5db5faa66646 (patch)
tree      bf53db13612cd7c32f2eadfe905cad83ce50a0d7 /app.py
parent    f84e8fb75b8096dff5a39936ac26c933fdba3059 (diff)
Add ability to update URLs that are one week old and whose content has been modified
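Note that the app.py hunk below only adds the crawl/update command dispatch; the "one week old and content modified" test itself presumably lives in the update spiders, which are outside this diff. Purely as a hedged sketch of that criterion, assuming database.models is a peewee setup (which the db.create_tables() call below suggests) and using hypothetical column names last_crawled and content_hash that are not taken from this repository:

import datetime
import hashlib

def pages_older_than_a_week(model):
    # model is one of the concrete Page subclasses passed to
    # db.create_tables(Page.__subclasses__()); last_crawled is an assumed column.
    one_week_ago = datetime.datetime.now() - datetime.timedelta(weeks=1)
    return model.select().where(model.last_crawled <= one_week_ago)

def content_modified(page, new_body):
    # Re-index only when the freshly fetched body (bytes) hashes differently
    # from the stored copy; content_hash is likewise an assumed column.
    return hashlib.sha256(new_body).hexdigest() != page.content_hash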
Diffstat (limited to 'app.py')
-rw-r--r--    app.py    32
1 file changed, 29 insertions(+), 3 deletions(-)
diff --git a/app.py b/app.py
index 431c2b0..ad44a14 100644
--- a/app.py
+++ b/app.py
@@ -6,17 +6,43 @@ from scrapy.utils.project import get_project_settings
 from database.models import Page, db
 
-def main():
+def crawl():
     try:
         db.create_tables(Page.__subclasses__())
 
         settings = get_project_settings()
         process = CrawlerProcess(settings)
         spiders = spiderloader.SpiderLoader.from_settings(settings)
         for spider in spiders.list():
-            process.crawl(spider)
+            if "crawl" in spider:
+                process.crawl(spider)
         process.start()
     except Exception as e:
         print(e)
 
+def update():
+    try:
+        settings = get_project_settings()
+        process = CrawlerProcess(settings)
+        spiders = spiderloader.SpiderLoader.from_settings(settings)
+        for spider in spiders.list():
+            if "update" in spider:
+                process.crawl(spider)
+        process.start()
+    except Exception as e:
+        print(e)
+
+def show_help():
+    print("Launch all crawlers => "+str(sys.argv[0])+" crawl")
+    print("Update all pages already crawled => "+str(sys.argv[0])+" update")
+
 if __name__ == '__main__':
-    main()
+    #main()
+    if len(sys.argv) == 2:
+        if sys.argv[1] == "crawl":
+            crawl()
+        elif sys.argv[1] == "update":
+            update()
+        else:
+            show_help()
+    else:
+        show_help()
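With this change, app.py expects exactly one command-line argument: "app.py crawl" starts every spider whose name contains "crawl", and "app.py update" starts every spider whose name contains "update"; anything else falls through to show_help(). One caveat worth noting: the hunk does not show an "import sys" line, so either sys is already imported in the first lines of app.py (outside this hunk) or the new sys.argv dispatch would fail with a NameError.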