Update the Khaganat forum crawler

author: neodarz <neodarz@neodarz.net> 2017-04-16 16:50:34 +0200
committer: neodarz <neodarz@neodarz.net> 2017-04-16 16:50:34 +0200
commit: 96fa8536264a39872c3674ad70472e843c5dc661 (patch)
tree: e072d13f19c83387175c5a055fb2e888e86a2fea
parent: 6e5a07e1f101b2f7d8d036d37ff99348cada0218 (diff)
download: ryzomcore_searx-96fa8536264a39872c3674ad70472e843c5dc661.tar.xz
ryzomcore_searx-96fa8536264a39872c3674ad70472e843c5dc661.zip
2 files changed, 47 insertions, 32 deletions
diff --git a/README.md b/README.md
index dc94957..a5b45d2 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@ Si "moteurs de recherche" est entre guillmet, c'est que ce ne sont pas des moteu
 | [Wikhan](https://khaganat.net/wikhan) | OK | Sans date, mais possibilité de récupérer la date |
 | [Blog](https://khaganat.net/blog/fr:start) | x | |
 | [Ryzom dev](http://dev.ryzom.com/projects/ryzom/wiki) | OK | Récupération de la date |
-| [Forum khanagat](https://khaganat.net/forum/) | OK | Sans date et sans contenu |
+| [Forum khanagat](https://khaganat.net/forum/) | OK | Récupération de la date |
 | [Forum ryzom](http://app.ryzom.com/app_forum/index.php) | X | |
 | [Google Community](https://plus.google.com/u/0/communities/103798956862568269036) | X | |
 | [nel-all archive](http://lists.nongnu.org/archive/html/nel-all/) | X | |
diff --git a/khaganatForum.py b/khaganatForum.py
index 75cd0be..781a1cf 100644
--- a/khaganatForum.py
+++ b/khaganatForum.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 # Doku Wiki
 #
 # @website     https://www.dokuwiki.org/
@@ -56,11 +57,24 @@ def response(resp):
         if not res_url:
             continue
 
-        #content = extract_text(r.xpath('.//div[@class="list_posts"]'))
-        title = extract_text(r.xpath('.//div[@class="topic_details floatleft"]/h5')) 
-
-        
+        try:
+            title = extract_text(r.xpath('.//div[@class="topic_details floatleft"]/h5')) 
+        except:
+            continue
+        try:
+            content = extract_text(r.xpath('.//div[@class="list_posts double_height"]'))
+        except:
+            content = ""
 
+        try:
+            dateBrut = extract_text(r.xpath('.//div[@class="topic_details floatleft"]/span/em')).encode('utf-8')
+        except:
+            dateBrut = "01 janvier 1970 à 01:01:01".encode('utf-8')
+        date = dateBrut.split(' ')
+        year = date[2]
+        day = date[0]
+        french_text_month = date[1]
+        time = date[4]
         #dataBrut = extract_text(r.xpath('.//span[@class="date"]'))
 
         #data = dataBrut.split('-')
@@ -75,36 +89,37 @@ def response(resp):
         #TheDay = Thedate[1].split(',')
 
 
-        #if Thedate[0] == "Jan":
-        #    ThedateMonth = 1
-        #elif Thedate[0] == "Feb":
-        #    ThedateMonth = 2
-        #elif Thedate[0] == "Mar":
-        #    ThedateMonth = 3
-        #elif Thedate[0] == "Apr":
-        #    ThedateMonth = 4
-        #elif Thedate[0] == "May":
-        #    ThedateMonth = 5
-        #elif Thedate[0] == "Jun":
-        #    ThedateMonth = 6
-        #elif Thedate[0] == "Jul":
-        #    ThedateMonth = 7
-        #elif Thedate[0] == "Aug":
-        #    ThedateMonth = 8
-        #elif Thedate[0] == "Sep":
-        #    ThedateMonth = 9
-        #elif Thedate[0] == "Oct":
-        #    ThedateMonth = 10
-        #elif Thedate[0] == "Nov":
-        #    ThedateMonth = 11
-        #else:
-        #    ThedateMonth = 12
+        if french_text_month == "janvier":
+            Month = 1
+        elif french_text_month.decode('utf-8') == "février".decode('utf-8'):
+            Month = 2
+        elif french_text_month  == "mars":
+            Month = 3
+        elif french_text_month == "avril":
+            Month = 4
+        elif french_text_month == "mai":
+            Month = 5
+        elif french_text_month == "juin":
+            Month = 6
+        elif french_text_month == "juillet":
+            Month = 7
+        elif french_text_month.decode('utf-8') == "août".decode('utf-8'):
+            Month = 8
+        elif french_text_month == "septembre":
+            Month = 9
+        elif french_text_month == "octobre":
+             Month = 10
+        elif french_text_month == "novembre":
+            Month = 11
+        else:
+            Month = 12
+
 
         # append result
         results.append({'title': title,
-                        'content': "",
-                        'url': base_url + res_url})
-                        #'publishedDate': datetime(int(Thedate[2]), ThedateMonth, int(TheDay[0]), 3, 1, 42)})
+                        'content': content,
+                        'url': base_url + res_url,
+                        'publishedDate': datetime(int(year), Month, int(day), 3, 1, 42)})
author	neodarz <neodarz@neodarz.net>	2017-04-16 16:50:34 +0200
committer	neodarz <neodarz@neodarz.net>	2017-04-16 16:50:34 +0200
commit	96fa8536264a39872c3674ad70472e843c5dc661 (patch)
tree	e072d13f19c83387175c5a055fb2e888e86a2fea
parent	6e5a07e1f101b2f7d8d036d37ff99348cada0218 (diff)
download	ryzomcore_searx-96fa8536264a39872c3674ad70472e843c5dc661.tar.xz ryzomcore_searx-96fa8536264a39872c3674ad70472e843c5dc661.zip