blob: 305f39d576200d80807e13a78272da1b8e58552e (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
|
import re
import sys

import requests
from bs4 import BeautifulSoup

URL_PREFIX = 'http://fr.lolix.org/search/offre/'
# Fixed query-string tail appended to every search page request.
# BUG FIX: the original read '...posteid=0®ionid=0...' — the '&reg' of
# '&regionid' had been mangled into the '®' character, corrupting the URL.
DEFAULT_OPTION = '&mode=find&posteid=0&regionid=0&contratid=0'

# Links to individual offers carry an '?id=' query parameter.
# Raw string fixes the invalid '\?' escape sequence of the original.
OFFER_LINK_RE = re.compile(r'\?id')


def _fetch_search_soup(page_num):
    """Download search-result page *page_num* and return its parsed soup."""
    url = URL_PREFIX + 'search.php?page=' + str(page_num) + DEFAULT_OPTION
    return BeautifulSoup(requests.get(url).text, 'html.parser')


def _collect_offers(keyword):
    """Paginate through search results until the 'Aucune' marker page.

    For each offer link found, the offer page itself is fetched and kept
    only if its text matches *keyword* (a regex; "" matches everything).
    Returns the list of matching offer URLs.
    """
    offers = []
    page_num = 0
    while True:
        soup = _fetch_search_soup(page_num)
        last_page = False
        for contenu in soup.find_all(class_='Contenu'):
            for table in contenu.find_all('table'):
                # Only tables with a 'ListeTitre' header hold result rows.
                if not table.find_all(class_="ListeTitre"):
                    continue
                print("Parsing page " + str(page_num) + "...")
                if re.search('Aucune', table.text):
                    # "Aucune" == no results: this is past the last page.
                    last_page = True
                    continue
                for row in table.find_all("tr"):
                    for cell in row.find_all("td"):
                        for a in cell.find_all('a', href=True):
                            href = a.attrs['href']
                            if not OFFER_LINK_RE.search(href):
                                continue
                            offer_url = URL_PREFIX + str(href)
                            offer_soup = BeautifulSoup(
                                requests.get(offer_url).text, "html.parser")
                            if re.search(keyword, offer_soup.text):
                                offers.append(offer_url)
        if last_page:
            # Unlike the original, stop *before* fetching a useless extra page.
            break
        page_num += 1
    return offers


def main():
    """CLI entry point: optional single argument is the filter keyword."""
    keyword = sys.argv[1] if len(sys.argv) == 2 else ""
    offers = _collect_offers(keyword)
    if offers:
        print("\nOffres trouvée: \n")
        for offre in offers:
            print(offre)
    else:
        print("Aucune offre à ce jour ne correspond à vos critères de recherche.")


if __name__ == "__main__":
    main()
|