
Commit

Merge pull request #537 from aliparlakci/development
Serene-Arc authored Oct 20, 2021
2 parents 2d6e25d + f716d98 commit 8104ce3
Showing 9 changed files with 71 additions and 63 deletions.
55 changes: 28 additions & 27 deletions bdfr/resource.py
@@ -30,33 +30,7 @@ def __init__(self, source_submission: Submission, url: str, download_function: C

@staticmethod
def retry_download(url: str) -> Callable:
max_wait_time = 300

def http_download(download_parameters: dict) -> Optional[bytes]:
current_wait_time = 60
if 'max_wait_time' in download_parameters:
max_wait_time = download_parameters['max_wait_time']
else:
max_wait_time = 300
while True:
try:
response = requests.get(url)
if re.match(r'^2\d{2}', str(response.status_code)) and response.content:
return response.content
elif response.status_code in (408, 429):
raise requests.exceptions.ConnectionError(f'Response code {response.status_code}')
else:
raise BulkDownloaderException(
f'Unrecoverable error requesting resource: HTTP Code {response.status_code}')
except (requests.exceptions.ConnectionError, requests.exceptions.ChunkedEncodingError) as e:
logger.warning(f'Error occurred downloading from {url}, waiting {current_wait_time} seconds: {e}')
time.sleep(current_wait_time)
if current_wait_time < max_wait_time:
current_wait_time += 60
else:
logger.error(f'Max wait time exceeded for resource at url {url}')
raise
return http_download
return lambda global_params: Resource.http_download(url, global_params)

def download(self, download_parameters: Optional[dict] = None):
if download_parameters is None:
@@ -82,3 +56,30 @@ def _determine_extension(self) -> Optional[str]:
match = re.search(extension_pattern, stripped_url)
if match:
return match.group(1)

@staticmethod
def http_download(url: str, download_parameters: dict) -> Optional[bytes]:
headers = download_parameters.get('headers')
current_wait_time = 60
if 'max_wait_time' in download_parameters:
max_wait_time = download_parameters['max_wait_time']
else:
max_wait_time = 300
while True:
try:
response = requests.get(url, headers=headers)
if re.match(r'^2\d{2}', str(response.status_code)) and response.content:
return response.content
elif response.status_code in (408, 429):
raise requests.exceptions.ConnectionError(f'Response code {response.status_code}')
else:
raise BulkDownloaderException(
f'Unrecoverable error requesting resource: HTTP Code {response.status_code}')
except (requests.exceptions.ConnectionError, requests.exceptions.ChunkedEncodingError) as e:
logger.warning(f'Error occurred downloading from {url}, waiting {current_wait_time} seconds: {e}')
time.sleep(current_wait_time)
if current_wait_time < max_wait_time:
current_wait_time += 60
else:
logger.error(f'Max wait time exceeded for resource at url {url}')
raise
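With this change, retry_download no longer closes over a nested downloader; it binds the URL and defers to the shared Resource.http_download, which now also honours an optional 'headers' entry. A minimal usage sketch, assuming an illustrative URL and parameter values that are not taken from this PR:

# Hypothetical sketch: retry_download returns a callable that forwards
# per-run parameters straight into Resource.http_download.
from bdfr.resource import Resource

fetcher = Resource.retry_download('https://example.com/file.jpg')  # placeholder URL
# Resource.download() would normally supply this dict; 'max_wait_time' caps the
# retry back-off and 'headers' is passed through to requests.get().
content = fetcher({'max_wait_time': 120, 'headers': None})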
15 changes: 13 additions & 2 deletions bdfr/site_downloaders/erome.py
@@ -2,7 +2,7 @@

import logging
import re
from typing import Optional
from typing import Callable, Optional

import bs4
from praw.models import Submission
@@ -29,7 +29,7 @@ def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> l
for link in links:
if not re.match(r'https?://.*', link):
link = 'https://' + link
out.append(Resource(self.post, link, Resource.retry_download(link)))
out.append(Resource(self.post, link, self.erome_download(link)))
return out

@staticmethod
@@ -43,3 +43,14 @@ def _get_links(url: str) -> set[str]:
out.extend([vid.get('src') for vid in videos])

return set(out)

@staticmethod
def erome_download(url: str) -> Callable:
download_parameters = {
'headers': {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
' Chrome/88.0.4324.104 Safari/537.36',
'Referer': 'https://www.erome.com/',
},
}
return lambda global_params: Resource.http_download(url, global_params | download_parameters)
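The merge in the returned lambda relies on Python 3.9's dict union operator (PEP 584): on duplicate keys the right-hand operand wins, so the Erome-specific user-agent and Referer headers override any 'headers' value already present in the global download parameters. A small illustrative snippet of that behaviour, with example values not taken from the PR:

# Dict-union semantics used by `global_params | download_parameters` (Python 3.9+):
# keys from the right-hand dict take precedence on collisions.
global_params = {'max_wait_time': 300, 'headers': None}
erome_params = {'headers': {'Referer': 'https://www.erome.com/'}}
merged = global_params | erome_params
assert merged['max_wait_time'] == 300  # preserved from the global parameters
assert merged['headers']['Referer'] == 'https://www.erome.com/'  # overridden by the Erome dict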
8 changes: 4 additions & 4 deletions bdfr/site_downloaders/youtube.py
@@ -5,7 +5,7 @@
from pathlib import Path
from typing import Callable, Optional

import youtube_dl
import yt_dlp
from praw.models import Submission

from bdfr.exceptions import NotADownloadableLinkError, SiteDownloaderError
@@ -45,9 +45,9 @@ def download(_: dict) -> bytes:
download_path = Path(temp_dir).resolve()
ytdl_options['outtmpl'] = str(download_path) + '/' + 'test.%(ext)s'
try:
with youtube_dl.YoutubeDL(ytdl_options) as ydl:
with yt_dlp.YoutubeDL(ytdl_options) as ydl:
ydl.download([self.post.url])
except youtube_dl.DownloadError as e:
except yt_dlp.DownloadError as e:
raise SiteDownloaderError(f'Youtube download failed: {e}')

downloaded_files = list(download_path.iterdir())
@@ -64,7 +64,7 @@ def download(_: dict) -> bytes:
def get_video_attributes(url: str) -> dict:
yt_logger = logging.getLogger('youtube-dl')
yt_logger.setLevel(logging.CRITICAL)
with youtube_dl.YoutubeDL({'logger': yt_logger, }) as ydl:
with yt_dlp.YoutubeDL({'logger': yt_logger, }) as ydl:
try:
result = ydl.extract_info(url, download=False)
return result
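yt-dlp keeps youtube-dl's public interface, so the swap above is limited to the module name: the YoutubeDL entry point, its options dict and the exception handling are otherwise unchanged. A minimal sketch of the metadata call mirroring get_video_attributes, reusing a URL from the test parameters further down purely for illustration:

# Sketch of the yt-dlp call pattern used above; prints a couple of fields
# from the extracted info dict without downloading the video.
import logging

import yt_dlp

yt_logger = logging.getLogger('youtube-dl')
yt_logger.setLevel(logging.CRITICAL)
with yt_dlp.YoutubeDL({'logger': yt_logger}) as ydl:
    info = ydl.extract_info('https://www.youtube.com/watch?v=uSm2VDgRIUs', download=False)
    print(info.get('title'), info.get('duration'))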
2 changes: 1 addition & 1 deletion requirements.txt
@@ -6,4 +6,4 @@ ffmpeg-python>=0.2.0
praw>=7.2.0
pyyaml>=5.4.1
requests>=2.25.1
youtube-dl>=2021.3.14
yt-dlp>=2021.9.25
2 changes: 1 addition & 1 deletion setup.cfg
@@ -4,7 +4,7 @@ description_file = README.md
description_content_type = text/markdown
home_page = https://github.com/aliparlakci/bulk-downloader-for-reddit
keywords = reddit, download, archive
version = 2.4.1
version = 2.4.2
author = Ali Parlakci
author_email = [email protected]
maintainer = Serene Arc
@@ -22,7 +22,7 @@ def test_can_handle_link(test_url: str, expected: bool):
@pytest.mark.online
@pytest.mark.slow
@pytest.mark.parametrize(('test_url', 'expected_hash'), (
('https://streamable.com/dt46y', '1e7f4928e55de6e3ca23d85cc9246bbb'),
('https://streamable.com/dt46y', 'b7e465adaade5f2b6d8c2b4b7d0a2878'),
('https://streamable.com/t8sem', '49b2d1220c485455548f1edbc05d4ecf'),
('https://www.reddit.com/r/specializedtools/comments/n2nw5m/bamboo_splitter/', '21968d3d92161ea5e0abdcaf6311b06c'),
('https://v.redd.it/9z1dnk3xr5k61', '351a2b57e888df5ccbc508056511f38d'),
@@ -34,4 +34,6 @@ def test_find_resources(test_url: str, expected_hash: str):
resources = downloader.find_resources()
assert len(resources) == 1
assert isinstance(resources[0], Resource)
for res in resources:
res.download()
assert resources[0].hash.hexdigest() == expected_hash
41 changes: 17 additions & 24 deletions tests/site_downloaders/test_erome.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
# coding=utf-8

import re
from unittest.mock import MagicMock

import pytest
@@ -11,44 +11,37 @@
@pytest.mark.online
@pytest.mark.parametrize(('test_url', 'expected_urls'), (
('https://www.erome.com/a/vqtPuLXh', (
'https://s11.erome.com/365/vqtPuLXh/KH2qBT99_480p.mp4',
r'https://s\d+.erome.com/365/vqtPuLXh/KH2qBT99_480p.mp4',
)),
('https://www.erome.com/a/ORhX0FZz', (
'https://s15.erome.com/355/ORhX0FZz/9IYQocM9_480p.mp4',
'https://s15.erome.com/355/ORhX0FZz/9eEDc8xm_480p.mp4',
'https://s15.erome.com/355/ORhX0FZz/EvApC7Rp_480p.mp4',
'https://s15.erome.com/355/ORhX0FZz/LruobtMs_480p.mp4',
'https://s15.erome.com/355/ORhX0FZz/TJNmSUU5_480p.mp4',
'https://s15.erome.com/355/ORhX0FZz/X11Skh6Z_480p.mp4',
'https://s15.erome.com/355/ORhX0FZz/bjlTkpn7_480p.mp4'
r'https://s\d+.erome.com/355/ORhX0FZz/9IYQocM9_480p.mp4',
r'https://s\d+.erome.com/355/ORhX0FZz/9eEDc8xm_480p.mp4',
r'https://s\d+.erome.com/355/ORhX0FZz/EvApC7Rp_480p.mp4',
r'https://s\d+.erome.com/355/ORhX0FZz/LruobtMs_480p.mp4',
r'https://s\d+.erome.com/355/ORhX0FZz/TJNmSUU5_480p.mp4',
r'https://s\d+.erome.com/355/ORhX0FZz/X11Skh6Z_480p.mp4',
r'https://s\d+.erome.com/355/ORhX0FZz/bjlTkpn7_480p.mp4'
)),
))
def test_get_link(test_url: str, expected_urls: tuple[str]):
result = Erome._get_links(test_url)
assert set(result) == set(expected_urls)
assert all([any([re.match(p, r) for r in result]) for p in expected_urls])


@pytest.mark.online
@pytest.mark.slow
@pytest.mark.parametrize(('test_url', 'expected_hashes'), (
('https://www.erome.com/a/vqtPuLXh', {
'5da2a8d60d87bed279431fdec8e7d72f'
}),
('https://www.erome.com/a/lGrcFxmb', {
'0e98f9f527a911dcedde4f846bb5b69f',
'25696ae364750a5303fc7d7dc78b35c1',
'63775689f438bd393cde7db6d46187de',
'a1abf398cfd4ef9cfaf093ceb10c746a',
'bd9e1a4ea5ef0d6ba47fb90e337c2d14'
}),
@pytest.mark.parametrize(('test_url', 'expected_hashes_len'), (
('https://www.erome.com/a/vqtPuLXh', 1),
('https://www.erome.com/a/4tP3KI6F', 1),
))
def test_download_resource(test_url: str, expected_hashes: tuple[str]):
def test_download_resource(test_url: str, expected_hashes_len: int):
# Can't compare hashes for this test; Erome doesn't return the exact same file from request to request, so the hash
# will change back and forth randomly
mock_submission = MagicMock()
mock_submission.url = test_url
test_site = Erome(mock_submission)
resources = test_site.find_resources()
[res.download() for res in resources]
for res in resources:
res.download()
resource_hashes = [res.hash.hexdigest() for res in resources]
assert len(resource_hashes) == len(expected_hashes)
assert len(resource_hashes) == expected_hashes_len
2 changes: 1 addition & 1 deletion tests/site_downloaders/test_pornhub.py
@@ -12,7 +12,7 @@
@pytest.mark.online
@pytest.mark.slow
@pytest.mark.parametrize(('test_url', 'expected_hash'), (
('https://www.pornhub.com/view_video.php?viewkey=ph5a2ee0461a8d0', '5f5294b9b97dbb7cb9cf8df278515621'),
('https://www.pornhub.com/view_video.php?viewkey=ph6074c59798497', 'd9b99e4ebecf2d8d67efe5e70d2acf8a'),
))
def test_find_resources_good(test_url: str, expected_hash: str):
test_submission = MagicMock()
5 changes: 3 additions & 2 deletions tests/site_downloaders/test_youtube.py
@@ -13,8 +13,9 @@
@pytest.mark.online
@pytest.mark.slow
@pytest.mark.parametrize(('test_url', 'expected_hash'), (
('https://www.youtube.com/watch?v=uSm2VDgRIUs', 'f70b704b4b78b9bb5cd032bfc26e4971'),
('https://www.youtube.com/watch?v=GcI7nxQj7HA', '2bfdbf434ed284623e46f3bf52c36166'),
('https://www.youtube.com/watch?v=uSm2VDgRIUs', '2d60b54582df5b95ec72bb00b580d2ff'),
('https://www.youtube.com/watch?v=GcI7nxQj7HA', '5db0fc92a0a7fb9ac91e63505eea9cf0'),
('https://youtu.be/TMqPOlp4tNo', 'f68c00b018162857f3df4844c45302e7'), # Age restricted
))
def test_find_resources_good(test_url: str, expected_hash: str):
test_submission = MagicMock()
