aboutsummaryrefslogtreecommitdiff
path: root/app.py
diff options
context:
space:
mode:
authorneodarz <neodarz@neodarz.net>2019-02-24 17:10:50 +0100
committerneodarz <neodarz@neodarz.net>2019-02-24 17:10:50 +0100
commit5e448f52252df2e92565ece488c9dffd49e082d2 (patch)
treea0d850a8ff255429b3b55039a37e51fa7a0e2c6b /app.py
parente12eb8bd99c1b6695873632e0540fa07379671d4 (diff)
downloadkhanindexer-5e448f52252df2e92565ece488c9dffd49e082d2.tar.xz
khanindexer-5e448f52252df2e92565ece488c9dffd49e082d2.zip
Add first implementation of a cron like functionHEADmaster
Diffstat (limited to '')
-rw-r--r--app.py48
1 files changed, 43 insertions, 5 deletions
diff --git a/app.py b/app.py
index 9c1c302..744b697 100644
--- a/app.py
+++ b/app.py
@@ -1,17 +1,27 @@
import scrapy
import sys
-from scrapy.crawler import CrawlerProcess
+import os
+import time
+from scrapy.crawler import CrawlerProcess, CrawlerRunner
from scrapy import spiderloader
from scrapy.utils.project import get_project_settings
+from crochet import setup
+from apscheduler.schedulers.background import BackgroundScheduler
+from apscheduler.triggers import cron
from database.models import Page, db
+import config
-def crawl(the_spider=""):
+def crawl(the_spider="", cron=False):
try:
db.create_tables(Page.__subclasses__())
settings = get_project_settings()
- process = CrawlerProcess(settings)
spiders = spiderloader.SpiderLoader.from_settings(settings)
+ if cron:
+ setup()
+ process = CrawlerRunner(settings)
+ else:
+ process = CrawlerProcess(settings)
if the_spider == "":
for spider in spiders.list():
@@ -27,9 +37,10 @@ def crawl(the_spider=""):
print(" "+spider)
sys.exit(1)
- process.start()
+ if not cron:
+ process.start()
except Exception as e:
- print(e)
+ print("Error:"+str(e))
def update():
try:
@@ -43,6 +54,31 @@ def update():
except Exception as e:
print(e)
+def cron_scheduling():
+ try:
+ db.create_tables(Page.__subclasses__())
+
+ scheduler = BackgroundScheduler()
+ scheduler.add_jobstore('sqlalchemy', url=config.CRON_DB)
+ variables = ['minute', 'hour', 'month', 'day_of_week', 'crawler', 'id']
+ with open(config.CRON_CONF) as f:
+ for line in f:
+ if not line.startswith('#'):
+ datas = list(filter(lambda x: x != "", line.rstrip().split(" ")))
+ cron_table = dict(zip(variables, datas))
+ scheduler.add_job(crawl, 'cron', args=[cron_table['crawler'], 'cron'], minute=cron_table['minute'], hour=cron_table['hour'], month=cron_table['month'], day_of_week=cron_table['day_of_week'], id=cron_table['id'], replace_existing=True)
+ print('Press Ctrl+{0} to exit'.format('Break' if os.name == 'nt' else 'C'))
+
+ scheduler.start()
+
+ while True:
+ time.sleep(1)
+
+ except Exception as e:
+ print(e)
+
+
+
def show_help():
print("Launch all crawler => "+str(sys.argv[0])+" crawl")
print("Update all page already crawlerd => "+str(sys.argv[0])+" update")
@@ -54,6 +90,8 @@ if __name__ == '__main__':
crawl()
elif sys.argv[1] == "update":
update()
+ elif sys.argv[1] == "cron":
+ cron_scheduling()
else:
show_help()
elif len(sys.argv) == 3: