author    neodarz <neodarz@neodarz.net>    2019-02-06 19:15:36 +0100
committer neodarz <neodarz@neodarz.net>    2019-02-06 19:15:36 +0100
commit    9a8badd5dffe47813489ab0b355f5db5faa66646 (patch)
tree      bf53db13612cd7c32f2eadfe905cad83ce50a0d7 /app.py
parent    f84e8fb75b8096dff5a39936ac26c933fdba3059 (diff)
Add ability to update URLs that are one week old and whose content has been modified
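Note that the app.py hunk below only adds the crawl/update command dispatch; the "one week old and content modified" test itself presumably lives in the update spiders, which are outside this diff. Purely as a hedged sketch of that criterion, assuming database.models is a peewee setup (which the db.create_tables() call below suggests) and using hypothetical column names last_crawled and content_hash that are not taken from this repository:

import datetime
import hashlib

def pages_older_than_a_week(model):
    # model is one of the concrete Page subclasses passed to
    # db.create_tables(Page.__subclasses__()); last_crawled is an assumed column.
    one_week_ago = datetime.datetime.now() - datetime.timedelta(weeks=1)
    return model.select().where(model.last_crawled <= one_week_ago)

def content_modified(page, new_body):
    # Re-index only when the freshly fetched body (bytes) hashes differently
    # from the stored copy; content_hash is likewise an assumed column.
    return hashlib.sha256(new_body).hexdigest() != page.content_hash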
Diffstat (limited to 'app.py')
-rw-r--r--    app.py    32
1 file changed, 29 insertions(+), 3 deletions(-)
diff --git a/app.py b/app.py
index 431c2b0..ad44a14 100644
--- a/app.py
+++ b/app.py
@@ -6,17 +6,43 @@ from scrapy.utils.project import get_project_settings
 from database.models import Page, db
 
-def main():
+def crawl():
     try:
         db.create_tables(Page.__subclasses__())
 
         settings = get_project_settings()
         process = CrawlerProcess(settings)
         spiders = spiderloader.SpiderLoader.from_settings(settings)
         for spider in spiders.list():
-            process.crawl(spider)
+            if "crawl" in spider:
+                process.crawl(spider)
         process.start()
     except Exception as e:
         print(e)
 
+def update():
+    try:
+        settings = get_project_settings()
+        process = CrawlerProcess(settings)
+        spiders = spiderloader.SpiderLoader.from_settings(settings)
+        for spider in spiders.list():
+            if "update" in spider:
+                process.crawl(spider)
+        process.start()
+    except Exception as e:
+        print(e)
+
+def show_help():
+    print("Launch all crawlers => "+str(sys.argv[0])+" crawl")
+    print("Update all pages already crawled => "+str(sys.argv[0])+" update")
+
 if __name__ == '__main__':
-    main()
+    #main()
+    if len(sys.argv) == 2:
+        if sys.argv[1] == "crawl":
+            crawl()
+        elif sys.argv[1] == "update":
+            update()
+        else:
+            show_help()
+    else:
+        show_help()
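With this change, app.py expects exactly one command-line argument: "app.py crawl" starts every spider whose name contains "crawl", and "app.py update" starts every spider whose name contains "update"; anything else falls through to show_help(). One caveat worth noting: the hunk does not show an "import sys" line, so either sys is already imported in the first lines of app.py (outside this hunk) or the new sys.argv dispatch would fail with a NameError.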