aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorneodarz <neodarz@neodarz.net>2019-02-24 17:10:50 +0100
committerneodarz <neodarz@neodarz.net>2019-02-24 17:10:50 +0100
commit5e448f52252df2e92565ece488c9dffd49e082d2 (patch)
treea0d850a8ff255429b3b55039a37e51fa7a0e2c6b
parente12eb8bd99c1b6695873632e0540fa07379671d4 (diff)
downloadkhanindexer-master.tar.xz
khanindexer-master.zip
Add first implemention of a cron like functionHEADmaster
-rw-r--r--.gitignore3
-rw-r--r--README.md20
-rw-r--r--app.py48
-rw-r--r--config.py3
-rw-r--r--cron.conf3
5 files changed, 72 insertions, 5 deletions
diff --git a/.gitignore b/.gitignore
index 72364f9..5a56814 100644
--- a/.gitignore
+++ b/.gitignore
@@ -87,3 +87,6 @@ ENV/
# Rope project settings
.ropeproject
+
+# Project jobs list
+jobs.db
diff --git a/README.md b/README.md
index 34ce041..7c803d3 100644
--- a/README.md
+++ b/README.md
@@ -95,6 +95,26 @@ content modified, you can use:
python app.py update
```
+# Cron
+
+If you want to use the system cron to start tasks, you can.
+This app also ships a cron-like function which, for the moment, does nothing
+more than cron. To start it, just use the following command:
+
+```
+python app.py cron
+```
+
+All the configuration is in the file `cron.conf`, and the syntax is the same
+as [UNIX cron](https://apscheduler.readthedocs.io/en/latest/modules/triggers/cron.html?highlight=cron),
+but there are some project-specific columns; check out the first line of the
+file, which is a comment describing its structure.
+
+Note 1: There is an id column; make sure all ids are different, otherwise the last
+task replaces the previous one with the same id.
+
+Note 2: This is only implemented for the crawl function for the moment.
+
# Enjoy
For start searching send a `POST` request with the manticoresearch json API,
diff --git a/app.py b/app.py
index 9c1c302..744b697 100644
--- a/app.py
+++ b/app.py
@@ -1,17 +1,27 @@
import scrapy
import sys
-from scrapy.crawler import CrawlerProcess
+import os
+import time
+from scrapy.crawler import CrawlerProcess, CrawlerRunner
from scrapy import spiderloader
from scrapy.utils.project import get_project_settings
+from crochet import setup
+from apscheduler.schedulers.background import BackgroundScheduler
+from apscheduler.triggers import cron
from database.models import Page, db
+import config
-def crawl(the_spider=""):
+def crawl(the_spider="", cron=False):
try:
db.create_tables(Page.__subclasses__())
settings = get_project_settings()
- process = CrawlerProcess(settings)
spiders = spiderloader.SpiderLoader.from_settings(settings)
+ if cron:
+ setup()
+ process = CrawlerRunner(settings)
+ else:
+ process = CrawlerProcess(settings)
if the_spider == "":
for spider in spiders.list():
@@ -27,9 +37,10 @@ def crawl(the_spider=""):
print(" "+spider)
sys.exit(1)
- process.start()
+ if not cron:
+ process.start()
except Exception as e:
- print(e)
+ print("Error:"+str(e))
def update():
try:
@@ -43,6 +54,31 @@ def update():
except Exception as e:
print(e)
+def cron_scheduling():
+ try:
+ db.create_tables(Page.__subclasses__())
+
+ scheduler = BackgroundScheduler()
+ scheduler.add_jobstore('sqlalchemy', url=config.CRON_DB)
+ variables = ['minute', 'hour', 'month', 'day_of_week', 'crawler', 'id']
+ with open(config.CRON_CONF) as f:
+ for line in f:
+ if not line.startswith('#'):
+ datas = list(filter(lambda x: x != "", line.rstrip().split(" ")))
+ cron_table = dict(zip(variables, datas))
+ scheduler.add_job(crawl, 'cron', args=[cron_table['crawler'], 'cron'], minute=cron_table['minute'], hour=cron_table['hour'], month=cron_table['month'], day_of_week=cron_table['day_of_week'], id=cron_table['id'], replace_existing=True)
+ print('Press Ctrl+{0} to exit'.format('Break' if os.name == 'nt' else 'C'))
+
+ scheduler.start()
+
+ while True:
+ time.sleep(1)
+
+ except Exception as e:
+ print(e)
+
+
+
def show_help():
print("Launch all crawler => "+str(sys.argv[0])+" crawl")
print("Update all page already crawlerd => "+str(sys.argv[0])+" update")
@@ -54,6 +90,8 @@ if __name__ == '__main__':
crawl()
elif sys.argv[1] == "update":
update()
+ elif sys.argv[1] == "cron":
+ cron_scheduling()
else:
show_help()
elif len(sys.argv) == 3:
diff --git a/config.py b/config.py
index 3e1ddbc..751875d 100644
--- a/config.py
+++ b/config.py
@@ -12,3 +12,6 @@ SPHINX_HOST = '127.0.0.1'
SPHINX_PORT = 9312
NEVRAX_URL = "127.0.0.1:8000"
+
+CRON_DB = "sqlite:///jobs.db"
+CRON_CONF = "cron.conf"
diff --git a/cron.conf b/cron.conf
new file mode 100644
index 0000000..5681f82
--- /dev/null
+++ b/cron.conf
@@ -0,0 +1,3 @@
+#minute hour month day of week crawler id
+00 01 * 0 neodarznet_crawler 1
+02 01 * 0 nevrax_crawler 2