Add first implementation of a cron like function (HEAD, master)

author: neodarz <neodarz@neodarz.net> 2019-02-24 17:10:50 +0100
committer: neodarz <neodarz@neodarz.net> 2019-02-24 17:10:50 +0100
commit: 5e448f52252df2e92565ece488c9dffd49e082d2 (patch)
tree: a0d850a8ff255429b3b55039a37e51fa7a0e2c6b
parent: e12eb8bd99c1b6695873632e0540fa07379671d4 (diff)
download: khanindexer-master.tar.xz
khanindexer-master.zip
5 files changed, 72 insertions, 5 deletions
diff --git a/.gitignore b/.gitignore
index 72364f9..5a56814 100644
--- a/.gitignore
+++ b/.gitignore
@@ -87,3 +87,6 @@ ENV/
 
 # Rope project settings
 .ropeproject
+
+# Project jobs list
+jobs.db
diff --git a/README.md b/README.md
index 34ce041..7c803d3 100644
--- a/README.md
+++ b/README.md
@@ -95,6 +95,26 @@ content modified, you can use:
 python app.py update
 ```
 
+# Cron
+
+If you want to use the system cron to start tasks, you can.
+But this app also ships a cron-like function which, for the moment, does
+nothing more than cron. To start it, just use the following command:
+
+```
+python app.py cron
+```
+
+All the configuration is in the file `cron.conf`, and the syntax is the same
+as [UNIX cron](https://apscheduler.readthedocs.io/en/latest/modules/triggers/cron.html?highlight=cron),
+but there are some project-specific columns; check out the first line of the
+file, which is a comment describing its structure.
+
+Note 1: There is an id column; make sure all ids are different, otherwise the
+last task replaces the previous one with the same id.
+
+Note 2: This is only implemented on the crawl function for the moment.
+
 # Enjoy
 
 For start searching send a `POST` request with the manticoresearch json API,
diff --git a/app.py b/app.py
index 9c1c302..744b697 100644
--- a/app.py
+++ b/app.py
@@ -1,17 +1,27 @@
 import scrapy
 import sys
-from scrapy.crawler import CrawlerProcess
+import os
+import time
+from scrapy.crawler import CrawlerProcess, CrawlerRunner
 from scrapy import spiderloader
 from scrapy.utils.project import get_project_settings
+from crochet import setup
+from apscheduler.schedulers.background import BackgroundScheduler
+from apscheduler.triggers import cron
 
 from database.models import Page, db
+import config
 
-def crawl(the_spider=""):
+def crawl(the_spider="", cron=False):
     try:
         db.create_tables(Page.__subclasses__())
         settings = get_project_settings()
-        process = CrawlerProcess(settings)
         spiders = spiderloader.SpiderLoader.from_settings(settings)
+        if cron:
+            setup()
+            process = CrawlerRunner(settings)
+        else:
+            process = CrawlerProcess(settings)
 
         if the_spider == "":
             for spider in spiders.list():
@@ -27,9 +37,10 @@ def crawl(the_spider=""):
                     print("    "+spider)
             sys.exit(1)
 
-        process.start()
+        if not cron:
+            process.start()
     except Exception as e:
-        print(e)
+        print("Error:"+str(e))
 
 def update():
     try:
@@ -43,6 +54,31 @@ def update():
     except Exception as e:
         print(e)
 
+def cron_scheduling():
+    try:
+        db.create_tables(Page.__subclasses__())
+
+        scheduler = BackgroundScheduler()
+        scheduler.add_jobstore('sqlalchemy', url=config.CRON_DB)
+        variables = ['minute', 'hour', 'month', 'day_of_week', 'crawler', 'id']
+        with open(config.CRON_CONF) as f:
+            for line in f:
+                if not line.startswith('#'):
+                    datas = list(filter(lambda x: x != "", line.rstrip().split(" ")))
+                    cron_table = dict(zip(variables, datas))
+                    scheduler.add_job(crawl, 'cron', args=[cron_table['crawler'], 'cron'], minute=cron_table['minute'], hour=cron_table['hour'], month=cron_table['month'], day_of_week=cron_table['day_of_week'], id=cron_table['id'], replace_existing=True)
+        print('Press Ctrl+{0} to exit'.format('Break' if os.name == 'nt' else 'C'))
+
+        scheduler.start()
+
+        while True:
+            time.sleep(1)
+
+    except Exception as e:
+        print(e)
+
+
+
 def show_help():
     print("Launch all crawler               => "+str(sys.argv[0])+" crawl")
     print("Update all page already crawlerd => "+str(sys.argv[0])+" update")
@@ -54,6 +90,8 @@ if __name__ == '__main__':
             crawl()
         elif sys.argv[1] == "update":
             update()
+        elif sys.argv[1] == "cron":
+            cron_scheduling()
         else:
             show_help()
     elif len(sys.argv) == 3:
diff --git a/config.py b/config.py
index 3e1ddbc..751875d 100644
--- a/config.py
+++ b/config.py
@@ -12,3 +12,6 @@ SPHINX_HOST = '127.0.0.1'
 SPHINX_PORT = 9312
 
 NEVRAX_URL = "127.0.0.1:8000"
+
+CRON_DB = "sqlite:///jobs.db"
+CRON_CONF = "cron.conf"
diff --git a/cron.conf b/cron.conf
new file mode 100644
index 0000000..5681f82
--- /dev/null
+++ b/cron.conf
@@ -0,0 +1,3 @@
+#minute     hour       month       day of week             crawler             id
+00           01           *               0               neodarznet_crawler      1
+02           01           *               0               nevrax_crawler      2
author	neodarz <neodarz@neodarz.net>	2019-02-24 17:10:50 +0100
committer	neodarz <neodarz@neodarz.net>	2019-02-24 17:10:50 +0100
commit	5e448f52252df2e92565ece488c9dffd49e082d2 (patch)
tree	a0d850a8ff255429b3b55039a37e51fa7a0e2c6b
parent	e12eb8bd99c1b6695873632e0540fa07379671d4 (diff)
download	khanindexer-master.tar.xz khanindexer-master.zip