Add first implementation of a cron-like function (HEAD, master)

author: neodarz <neodarz@neodarz.net> 2019-02-24 17:10:50 +0100
committer: neodarz <neodarz@neodarz.net> 2019-02-24 17:10:50 +0100
commit: 5e448f52252df2e92565ece488c9dffd49e082d2 (patch)
tree: a0d850a8ff255429b3b55039a37e51fa7a0e2c6b /app.py
parent: e12eb8bd99c1b6695873632e0540fa07379671d4 (diff)
download: khanindexer-5e448f52252df2e92565ece488c9dffd49e082d2.tar.xz
khanindexer-5e448f52252df2e92565ece488c9dffd49e082d2.zip
1 files changed, 43 insertions, 5 deletions
diff --git a/app.py b/app.py
index 9c1c302..744b697 100644
--- a/app.py
+++ b/app.py
@@ -1,17 +1,27 @@
 import scrapy
 import sys
-from scrapy.crawler import CrawlerProcess
+import os
+import time
+from scrapy.crawler import CrawlerProcess, CrawlerRunner
 from scrapy import spiderloader
 from scrapy.utils.project import get_project_settings
+from crochet import setup
+from apscheduler.schedulers.background import BackgroundScheduler
+from apscheduler.triggers import cron
 
 from database.models import Page, db
+import config
 
-def crawl(the_spider=""):
+def crawl(the_spider="", cron=False):
     try:
         db.create_tables(Page.__subclasses__())
         settings = get_project_settings()
-        process = CrawlerProcess(settings)
         spiders = spiderloader.SpiderLoader.from_settings(settings)
+        if cron:
+            setup()
+            process = CrawlerRunner(settings)
+        else:
+            process = CrawlerProcess(settings)
 
         if the_spider == "":
             for spider in spiders.list():
@@ -27,9 +37,10 @@ def crawl(the_spider=""):
                     print("    "+spider)
             sys.exit(1)
 
-        process.start()
+        if not cron:
+            process.start()
     except Exception as e:
-        print(e)
+        print("Error:"+str(e))
 
 def update():
     try:
@@ -43,6 +54,31 @@ def update():
     except Exception as e:
         print(e)
 
+def cron_scheduling():
+    try:
+        db.create_tables(Page.__subclasses__())
+
+        scheduler = BackgroundScheduler()
+        scheduler.add_jobstore('sqlalchemy', url=config.CRON_DB)
+        variables = ['minute', 'hour', 'month', 'day_of_week', 'crawler', 'id']
+        with open(config.CRON_CONF) as f:
+            for line in f:
+                if not line.startswith('#'):
+                    datas = list(filter(lambda x: x != "", line.rstrip().split(" ")))
+                    cron_table = dict(zip(variables, datas))
+                    scheduler.add_job(crawl, 'cron', args=[cron_table['crawler'], 'cron'], minute=cron_table['minute'], hour=cron_table['hour'], month=cron_table['month'], day_of_week=cron_table['day_of_week'], id=cron_table['id'], replace_existing=True)
+        print('Press Ctrl+{0} to exit'.format('Break' if os.name == 'nt' else 'C'))
+
+        scheduler.start()
+
+        while True:
+            time.sleep(1)
+
+    except Exception as e:
+        print(e)
+
+
+
 def show_help():
     print("Launch all crawler               => "+str(sys.argv[0])+" crawl")
     print("Update all page already crawlerd => "+str(sys.argv[0])+" update")
@@ -54,6 +90,8 @@ if __name__ == '__main__':
             crawl()
         elif sys.argv[1] == "update":
             update()
+        elif sys.argv[1] == "cron":
+            cron_scheduling()
         else:
             show_help()
     elif len(sys.argv) == 3:
author	neodarz <neodarz@neodarz.net>	2019-02-24 17:10:50 +0100
committer	neodarz <neodarz@neodarz.net>	2019-02-24 17:10:50 +0100
commit	5e448f52252df2e92565ece488c9dffd49e082d2 (patch)
tree	a0d850a8ff255429b3b55039a37e51fa7a0e2c6b /app.py
parent	e12eb8bd99c1b6695873632e0540fa07379671d4 (diff)
download	khanindexer-5e448f52252df2e92565ece488c9dffd49e082d2.tar.xz khanindexer-5e448f52252df2e92565ece488c9dffd49e082d2.zip