From 96fa8536264a39872c3674ad70472e843c5dc661 Mon Sep 17 00:00:00 2001 From: neodarz Date: Sun, 16 Apr 2017 16:50:34 +0200 Subject: Update crawler for the Khaganat forum --- README.md | 2 +- khaganatForum.py | 77 +++++++++++++++++++++++++++++++++----------------------- 2 files changed, 47 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index dc94957..a5b45d2 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ Si "moteurs de recherche" est entre guillmet, c'est que ce ne sont pas des moteu | [Wikhan](https://khaganat.net/wikhan) | OK | Sans date, mais possibilité de récupérer la date | | [Blog](https://khaganat.net/blog/fr:start) | x | | | [Ryzom dev](http://dev.ryzom.com/projects/ryzom/wiki) | OK | Récupération de la date | -| [Forum khanagat](https://khaganat.net/forum/) | OK | Sans date et sans contenu | +| [Forum khanagat](https://khaganat.net/forum/) | OK | Récupération de la date | | [Forum ryzom](http://app.ryzom.com/app_forum/index.php) | X | | | [Google Community](https://plus.google.com/u/0/communities/103798956862568269036) | X | | | [nel-all archive](http://lists.nongnu.org/archive/html/nel-all/) | X | | diff --git a/khaganatForum.py b/khaganatForum.py index 75cd0be..781a1cf 100644 --- a/khaganatForum.py +++ b/khaganatForum.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- # Doku Wiki # # @website https://www.dokuwiki.org/ @@ -56,11 +57,24 @@ def response(resp): if not res_url: continue - #content = extract_text(r.xpath('.//div[@class="list_posts"]')) - title = extract_text(r.xpath('.//div[@class="topic_details floatleft"]/h5')) - - + try: + title = extract_text(r.xpath('.//div[@class="topic_details floatleft"]/h5')) + except: + continue + try: + content = extract_text(r.xpath('.//div[@class="list_posts double_height"]')) + except: + content = "" + try: + dateBrut = extract_text(r.xpath('.//div[@class="topic_details floatleft"]/span/em')).encode('utf-8') + except: + dateBrut = "01 janvier 1970 à 01:01:01".encode('utf-8') + date = 
dateBrut.split(' ') + year = date[2] + day = date[0] + french_text_month = date[1] + time = date[4] #dataBrut = extract_text(r.xpath('.//span[@class="date"]')) #data = dataBrut.split('-') @@ -75,36 +89,37 @@ def response(resp): #TheDay = Thedate[1].split(',') - #if Thedate[0] == "Jan": - # ThedateMonth = 1 - #elif Thedate[0] == "Feb": - # ThedateMonth = 2 - #elif Thedate[0] == "Mar": - # ThedateMonth = 3 - #elif Thedate[0] == "Apr": - # ThedateMonth = 4 - #elif Thedate[0] == "May": - # ThedateMonth = 5 - #elif Thedate[0] == "Jun": - # ThedateMonth = 6 - #elif Thedate[0] == "Jul": - # ThedateMonth = 7 - #elif Thedate[0] == "Aug": - # ThedateMonth = 8 - #elif Thedate[0] == "Sep": - # ThedateMonth = 9 - #elif Thedate[0] == "Oct": - # ThedateMonth = 10 - #elif Thedate[0] == "Nov": - # ThedateMonth = 11 - #else: - # ThedateMonth = 12 + if french_text_month == "janvier": + Month = 1 + elif french_text_month.decode('utf-8') == "février".decode('utf-8'): + Month = 2 + elif french_text_month == "mars": + Month = 3 + elif french_text_month == "avril": + Month = 4 + elif french_text_month == "mai": + Month = 5 + elif french_text_month == "juin": + Month = 6 + elif french_text_month == "juillet": + Month = 7 + elif french_text_month.decode('utf-8') == "août".decode('utf-8'): + Month = 8 + elif french_text_month == "septembre": + Month = 9 + elif french_text_month == "octobre": + Month = 10 + elif french_text_month == "novembre": + Month = 11 + else: + Month = 12 + # append result results.append({'title': title, - 'content': "", - 'url': base_url + res_url}) - #'publishedDate': datetime(int(Thedate[2]), ThedateMonth, int(TheDay[0]), 3, 1, 42)}) + 'content': content, + 'url': base_url + res_url, + 'publishedDate': datetime(int(year), Month, int(day), 3, 1, 42)}) -- cgit v1.2.1