#!/usr/bin/env python3
"""Download every module listed on a modarchive.org artist page.

Usage:
    modules-dl.py <user>

The artist listing is fetched page by page.  Every link whose ``href``
contains "download" is saved into the current directory under the name
given in the link's URL fragment (the text after ``#``).
"""
# Provenance: this script was introduced as ``modules-dl.py`` by commit
# b06edf3abc5bd6b957456bf1a988912ec585a0e6 ("Add script for download
# modules", neodarz, 2019-07-07).

import os
import re
import sys
import urllib.request
from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

from bs4 import BeautifulSoup
import lxml  # noqa: F401 -- not referenced directly; backs the "lxml" parser below
import requests
from tqdm import tqdm

ARTIST_MODULES_URL = "https://modarchive.org/index.php"

# Seconds to wait on the listing request before giving up instead of hanging.
REQUEST_TIMEOUT = 30

# Compiled once; BeautifulSoup applies these with ``search`` semantics, so no
# surrounding ``.*`` is needed.
DOWNLOAD_HREF = re.compile(r"download")
PAGINATION_CLASS = re.compile(r"pagination")
PAGE_NUMBER = re.compile(r"[0-9]+")


class DownloadProgressBar(tqdm):
    """tqdm progress bar usable as a ``urlretrieve`` report hook."""

    def update_to(self, b=1, bsize=1, tsize=None):
        """Advance the bar to ``b * bsize`` units.

        Args:
            b: Number of blocks transferred so far.
            bsize: Size of each block.
            tsize: Total size; when not ``None`` it becomes the bar's total.
        """
        if tsize is not None:
            self.total = tsize
        # ``update`` takes an increment, so subtract what is already shown.
        self.update(b * bsize - self.n)


def download_url(url, out_path):
    """Download ``url`` to ``out_path`` while showing a progress bar.

    The bar is labelled with the text after the last ``/`` of ``url``.
    """
    with DownloadProgressBar(unit='B', unit_scale=True,
                             miniters=1, desc=url.split('/')[-1]) as t:
        urllib.request.urlretrieve(url, filename=out_path,
                                   reporthook=t.update_to)


def _page_url(url, page):
    """Return ``url`` with its ``page`` query parameter set to ``page``."""
    parts = urlsplit(url)
    params = [(key, value)
              for key, value in parse_qsl(parts.query, keep_blank_values=True)
              if key != "page"]
    params.append(("page", str(page)))
    return urlunsplit(parts._replace(query=urlencode(params)))


def _download_modules(soup):
    """Download the target of every download link found in ``soup``."""
    for link in soup.find_all('a', href=DOWNLOAD_HREF):
        href = link.get("href")
        fragment = href.partition("#")[2]
        # The fragment comes from a remote page: keep only its final path
        # component so a crafted link cannot write outside the current
        # directory.
        name = os.path.basename(fragment)
        if not name or name in (".", ".."):
            print("Can't get file name... :/")
            continue
        print("Downloading {} module...".format(name))
        download_url(href, name)


def _next_page(soup):
    """Return the page number following the selected one, or ``None``.

    A page number is returned only when a pagination link for it exists.
    When no selected page is found the selected page is taken to be 0.
    """
    selected = 0
    numbered = set()
    for link in soup.find_all('a', class_=PAGINATION_CLASS):
        label = link.text.strip()
        # Ignore non-numeric pagination links; ``int`` would raise on them.
        if not PAGE_NUMBER.fullmatch(label):
            continue
        classes = link.get("class") or []
        if "pagination-selected" in classes:
            selected = int(label)
        if "pagination" in classes:
            numbered.add(int(label))

    following = selected + 1
    return following if following in numbered else None


def get_mods(url):
    """Download all modules listed at ``url`` and on the pages after it.

    Args:
        url: Address of an artist listing page.  Following pages are reached
            by replacing the ``page`` query parameter of this address.

    Raises:
        requests.RequestException: If a listing page cannot be fetched, times
            out, or answers with an HTTP error status.
    """
    # Pages already requested as a "next" page.  Without this, a listing in
    # which the selected page cannot be detected would request page 1 forever.
    requested = set()
    while True:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "lxml")

        _download_modules(soup)

        target = _next_page(soup)
        if target is None or target in requested:
            return
        requested.add(target)
        url = _page_url(url, target)


def main(argv=None):
    """Run the downloader; return the process exit status.

    Args:
        argv: Argument vector, program name first.  Defaults to ``sys.argv``.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        print("Usage:")
        print("    {} <user>".format(argv[0]))
        return 1

    # urlencode escapes characters in the user name that are not URL-safe.
    query = urlencode({
        "request": "view_artist_modules",
        "query": argv[1],
        "page": 1,
    })
    get_mods("{}?{}#mods".format(ARTIST_MODULES_URL, query))
    return 0


if __name__ == "__main__":
    sys.exit(main())