diff options
Diffstat (limited to 'extractors')
-rw-r--r--  extractors/__init__.py                              0
-rw-r--r--  extractors/__pycache__/__init__.cpython-38.pyc      bin  0 -> 155 bytes
-rw-r--r--  extractors/__pycache__/bandcamp.cpython-38.pyc      bin  0 -> 1220 bytes
-rw-r--r--  extractors/__pycache__/common.cpython-38.pyc        bin  0 -> 1726 bytes
-rw-r--r--  extractors/__pycache__/job.cpython-38.pyc           bin  0 -> 1831 bytes
-rw-r--r--  extractors/bandcamp.py                              26
-rw-r--r--  extractors/common.py                                42
-rw-r--r--  extractors/job.py                                   47
8 files changed, 115 insertions, 0 deletions
# extractors/bandcamp.py
import re
import logging

import requests
from bs4 import BeautifulSoup

from .common import Extractor


class bandcamp(Extractor):
    """Extractor for bandcamp.com artist pages.

    Scrapes the artist root page for album links and queues each unique
    album URL in ``self._albums`` (populated by the base ``Extractor``).
    """

    # Fixed pattern: "https?" replaces the odd "(?:s|)" alternation, the
    # dot in "bandcamp.com" is escaped, and "[^/]*" restricts the match to
    # the host part so arbitrary URLs merely *containing* "bandcamp.com/"
    # in their path no longer match.
    pattern = re.compile(r'(https?://[^/]*bandcamp\.com/)')
    filename_template = "%(artist)s/%(playlist)s/%(playlist_index)s - %(title)s.%(ext)s"

    # NOTE: the original defined __init__(self, reg, url) that only called
    # super().__init__(reg, url); it is removed as redundant — Python falls
    # through to Extractor.__init__ with the same signature.

    def get_albums(self):
        """Collect unique album URLs linked from the artist root page."""
        response = requests.get(self.root)
        # Fail loudly on HTTP errors instead of silently parsing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        base = self.root.rstrip('/')
        for anchor in soup.select('a[href]'):
            href = anchor['href']
            # Album pages on bandcamp live under ".../album/<slug>".
            # NOTE(review): this assumes href is site-relative; an absolute
            # href would produce a malformed URL — same as the original.
            if 'album' in href:
                album_url = base + href
                if album_url not in self._albums:
                    self._albums.append(album_url)

        if not self._albums:
            logging.warning(f"No albums found at {self.root} ????")
# extractors/common.py
import os
from pathlib import Path

from sh import youtube_dl

from utils import read_file, write_file


class Extractor():
    """Base class for site extractors.

    Subclasses provide a compiled ``pattern`` and a ``filename_template``
    and fill ``self._albums`` in ``get_albums()``; this base handles the
    URL cache and the youtube-dl download loop.
    """

    def __init__(self, reg, url):
        # reg: the re.Match produced by the subclass pattern;
        # group(1) is the matched site root URL.
        self.root = reg.group(1)
        self._albums = []
        self.root_path = self._root_path()
        self._update_cache(self.root)

    def _root_path(self):
        """Return the project root (parent of the extractors/ package)."""
        # Equivalent to the original abspath/parent/parent dance, via pathlib.
        return Path(__file__).resolve().parent.parent

    def _update_cache(self, url):
        """Append *url* to .urls_cache.txt unless an entry already covers it."""
        cache_file = Path(self.root_path, '.urls_cache.txt')
        cached_urls = read_file(cache_file)
        # Bug fix: the original loop reused the name `url`, shadowing the
        # parameter (which was therefore dead) and comparing against
        # self.root instead. The loop variable is renamed and the parameter
        # is used; behavior is identical at the only call site
        # (_update_cache(self.root)).
        for cached in cached_urls:
            if cached.startswith(url):
                return
        write_file(cache_file, url)

    def _yt_wrapper(self, url, output):
        """Stream youtube-dl output for *url*, writing mp3s under *output*."""
        for line in youtube_dl(
                url, audio_format="mp3",
                add_metadata=True,
                o=output + self.filename_template,
                _iter=True):
            print(line.strip())

    def download_albums(self, output):
        """Download every queued album via youtube-dl."""
        for album in self._albums:
            print("Parsing " + album + "...")
            self._yt_wrapper(album, output)


# extractors/job.py
import logging
import re
import importlib
import sys

# Names of extractor submodules to probe, in order.
extrs = [
    'bandcamp'
]


class DlJob():
    """Resolve a URL to a matching extractor and drive the download."""

    def __init__(self, url, output):
        self.extr = self._find(url)
        self.output = output
        # Unused in this class (Extractor keeps its own _albums);
        # retained for backward compatibility with any external readers.
        self._albums = []
        if not self.extr:
            logging.error(url + " is not supported")
            sys.exit(1)

    def _find(self, url):
        """Return an instantiated extractor whose pattern matches *url*, else None."""
        for cls in self._list_extractors():
            match = cls.pattern.match(url)
            if match:
                return cls(match, url)
        return None

    def _list_extractors(self):
        """Yield extractor classes from every module listed in ``extrs``."""
        # Dropped the redundant iter(): a list is already iterable.
        for name in extrs:
            module = importlib.import_module('.' + name, __package__)
            yield from self._add_module(module)

    def _add_module(self, module):
        """Normalize and return the extractor classes defined in *module*."""
        classes = self._get_classes(module)
        for cls in classes:
            # Idempotent: re.compile() returns an already-compiled pattern
            # unchanged, and compiles plain string patterns.
            cls.pattern = re.compile(cls.pattern)
        return classes

    def _get_classes(self, module):
        """Return objects defined in *module* that carry a ``pattern`` attribute."""
        return [
            cls for cls in module.__dict__.values() if (
                hasattr(cls, "pattern") and cls.__module__ == module.__name__
            )
        ]

    def run(self):
        """Scrape album URLs, then download them to the configured output."""
        self.extr.get_albums()
        self.extr.download_albums(self.output)