modules-dl.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69

#!/bin/python

import os
import re
import sys
import urllib.request

import lxml
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# A user id is mandatory: it selects whose module list gets scraped.
if len(sys.argv) < 2:
    print("Usage:")
    print("    {} <user_id>".format(sys.argv[0]))
    # sys.exit() instead of the bare exit(): the latter is injected by the
    # site module for interactive use and is missing under `python -S`.
    sys.exit(1)

# Artist id taken from the command line; it is spliced into the
# modarchive.org query URLs used by the rest of the script.
user = sys.argv[1]

class DownloadProgressBar(tqdm):
    """tqdm bar whose ``update_to`` is usable as a ``urlretrieve`` report hook."""

    def update_to(self, b=1, bsize=1, tsize=None):
        """Advance the bar to ``b * bsize`` units, adopting ``tsize`` as total.

        Args:
            b: multiplier for ``bsize`` (the block count when driven by
                ``urlretrieve``'s report hook).
            bsize: size of one block.
            tsize: total size; the bar's total is left alone when ``None``.
        """
        if tsize is not None:
            self.total = tsize
        # update() expects an increment, so turn the absolute position into
        # the delta from what the bar already displays (self.n).
        transferred = b * bsize
        self.update(transferred - self.n)

def download_url(url, out_path):
    """Fetch ``url`` into the file ``out_path`` while showing a progress bar.

    The bar is labelled with whatever follows the last ``/`` in ``url``.
    """
    label = url.rsplit('/', 1)[-1]
    progress = DownloadProgressBar(unit='B', unit_scale=True, miniters=1, desc=label)
    with progress as bar:
        urllib.request.urlretrieve(url, filename=out_path, reporthook=bar.update_to)

def _download_modules(soup):
    """Download every module linked from one parsed listing page.

    A link qualifies when its href contains "download". The text after the
    first ``#`` in the href is used as the local file name.
    """
    for link in soup.find_all('a', href=re.compile("download")):
        link_href = link.get("href")
        parts = link_href.split("#")
        # The name comes from scraped markup, so keep only its final path
        # component: a fragment such as "../../x" must not be able to write
        # outside the current directory.
        link_name = os.path.basename(parts[1]) if len(parts) > 1 else ""
        if not link_name:
            print("Can't get file name... :/")
            continue
        print("Downloading {} module...".format(link_name))
        download_url(link_href, link_name)


def _next_page(soup):
    """Return the page number following the selected one, or ``None``.

    Only pagination anchors whose text is purely digits are considered. If
    no anchor carries the ``pagination-selected`` class the current page is
    taken to be 0, so the candidate next page is 1.
    """
    page_selected = 0
    pages = set()
    for link in soup.find_all('a', class_=re.compile(".*pagination.*")):
        label = link.text.strip()
        # Require an all-digit label so int() below cannot raise on text
        # that merely contains a digit.
        if not re.fullmatch(r"[0-9]+", label):
            continue
        classes = link.get("class")
        if "pagination-selected" in classes:
            page_selected = int(label)
        if "pagination" in classes:
            pages.add(int(label))

    go_to = page_selected + 1
    return go_to if go_to in pages else None


def get_mods(url):
    """Download all modules listed at ``url`` and on the pages that follow it.

    Each page is fetched and parsed, its module links are downloaded, and the
    crawl continues with the page after the one marked as selected until no
    such page is listed. Follow-up page URLs are built from the module-level
    ``user`` id, not derived from ``url``.

    Args:
        url: address of the first listing page to process.

    Raises:
        requests.RequestException: if a listing page cannot be fetched, the
            request times out, or the server answers with an error status.
    """
    # Page numbers already queued; prevents looping forever when a page
    # never marks its selected link and keeps pointing back at page 1.
    visited = set()
    while True:
        # Without a timeout a stalled connection would hang the script.
        r = requests.get(url, timeout=30)
        # Fail loudly instead of parsing an error page as "no modules".
        r.raise_for_status()

        soup = BeautifulSoup(r.text, "lxml")
        _download_modules(soup)

        go_to = _next_page(soup)
        if go_to is None or go_to in visited:
            break
        visited.add(go_to)
        url = ("https://modarchive.org/index.php?request=view_artist_modules&query="
               + user + "&page=" + str(go_to) + "#mods")


# Start the crawl at the first page of the requested artist's module list.
url = "".join((
    "https://modarchive.org/index.php?request=view_artist_modules&query=",
    user,
    "&page=1#mods",
))
get_mods(url)