aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorneodarz <neodarz@neodarz.net>2017-04-16 16:50:34 +0200
committerneodarz <neodarz@neodarz.net>2017-04-16 16:50:34 +0200
commit96fa8536264a39872c3674ad70472e843c5dc661 (patch)
treee072d13f19c83387175c5a055fb2e888e86a2fea
parent6e5a07e1f101b2f7d8d036d37ff99348cada0218 (diff)
downloadryzomcore_searx-96fa8536264a39872c3674ad70472e843c5dc661.tar.xz
ryzomcore_searx-96fa8536264a39872c3674ad70472e843c5dc661.zip
Update crawler for the Khaganat forum
-rw-r--r--README.md2
-rw-r--r--khaganatForum.py77
2 files changed, 47 insertions, 32 deletions
diff --git a/README.md b/README.md
index dc94957..a5b45d2 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@ Si "moteurs de recherche" est entre guillmet, c'est que ce ne sont pas des moteu
| [Wikhan](https://khaganat.net/wikhan) | OK | Sans date, mais possibilité de récupérer la date |
| [Blog](https://khaganat.net/blog/fr:start) | x | |
| [Ryzom dev](http://dev.ryzom.com/projects/ryzom/wiki) | OK | Récupération de la date |
-| [Forum khanagat](https://khaganat.net/forum/) | OK | Sans date et sans contenu |
+| [Forum khaganat](https://khaganat.net/forum/) | OK | Récupération de la date |
| [Forum ryzom](http://app.ryzom.com/app_forum/index.php) | X | |
| [Google Community](https://plus.google.com/u/0/communities/103798956862568269036) | X | |
| [nel-all archive](http://lists.nongnu.org/archive/html/nel-all/) | X | |
diff --git a/khaganatForum.py b/khaganatForum.py
index 75cd0be..781a1cf 100644
--- a/khaganatForum.py
+++ b/khaganatForum.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
# Doku Wiki
#
# @website https://www.dokuwiki.org/
@@ -56,11 +57,24 @@ def response(resp):
if not res_url:
continue
- #content = extract_text(r.xpath('.//div[@class="list_posts"]'))
- title = extract_text(r.xpath('.//div[@class="topic_details floatleft"]/h5'))
-
-
+ try:
+ title = extract_text(r.xpath('.//div[@class="topic_details floatleft"]/h5'))
+ except:
+ continue
+ try:
+ content = extract_text(r.xpath('.//div[@class="list_posts double_height"]'))
+ except:
+ content = ""
+ try:
+ dateBrut = extract_text(r.xpath('.//div[@class="topic_details floatleft"]/span/em')).encode('utf-8')
+ except:
+ dateBrut = "01 janvier 1970 à 01:01:01".encode('utf-8')
+ date = dateBrut.split(' ')
+ year = date[2]
+ day = date[0]
+ french_text_month = date[1]
+ time = date[4]
#dataBrut = extract_text(r.xpath('.//span[@class="date"]'))
#data = dataBrut.split('-')
@@ -75,36 +89,37 @@ def response(resp):
#TheDay = Thedate[1].split(',')
- #if Thedate[0] == "Jan":
- # ThedateMonth = 1
- #elif Thedate[0] == "Feb":
- # ThedateMonth = 2
- #elif Thedate[0] == "Mar":
- # ThedateMonth = 3
- #elif Thedate[0] == "Apr":
- # ThedateMonth = 4
- #elif Thedate[0] == "May":
- # ThedateMonth = 5
- #elif Thedate[0] == "Jun":
- # ThedateMonth = 6
- #elif Thedate[0] == "Jul":
- # ThedateMonth = 7
- #elif Thedate[0] == "Aug":
- # ThedateMonth = 8
- #elif Thedate[0] == "Sep":
- # ThedateMonth = 9
- #elif Thedate[0] == "Oct":
- # ThedateMonth = 10
- #elif Thedate[0] == "Nov":
- # ThedateMonth = 11
- #else:
- # ThedateMonth = 12
+ if french_text_month == "janvier":
+ Month = 1
+ elif french_text_month.decode('utf-8') == "février".decode('utf-8'):
+ Month = 2
+ elif french_text_month == "mars":
+ Month = 3
+ elif french_text_month == "avril":
+ Month = 4
+ elif french_text_month == "mai":
+ Month = 5
+ elif french_text_month == "juin":
+ Month = 6
+ elif french_text_month == "juillet":
+ Month = 7
+ elif french_text_month.decode('utf-8') == "août".decode('utf-8'):
+ Month = 8
+ elif french_text_month == "septembre":
+ Month = 9
+ elif french_text_month == "octobre":
+ Month = 10
+ elif french_text_month == "novembre":
+ Month = 11
+ else:
+ Month = 12
+
# append result
results.append({'title': title,
- 'content': "",
- 'url': base_url + res_url})
- #'publishedDate': datetime(int(Thedate[2]), ThedateMonth, int(TheDay[0]), 3, 1, 42)})
+ 'content': content,
+ 'url': base_url + res_url,
+ 'publishedDate': datetime(int(year), Month, int(day), 3, 1, 42)})