Merge branch 'yt-dlp:master' into master

Crypto90 · Dec 16, 2024 · 49c58f2
2 parents cf06307 + d298693
commit 49c58f2
Show file tree

Hide file tree

Showing 12 changed files with 132 additions and 27 deletions.
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
@@ -711,3 +711,5 @@ gitninja1234
 jkruse
 xiaomac
 wesson09
+Crypto90
+MutantPiggieGolem1
diff --git a/Changelog.md b/Changelog.md
@@ -4,6 +4,20 @@
 # To create a release, dispatch the https://github.com/yt-dlp/yt-dlp/actions/workflows/release.yml workflow on master
 -->
 
+### 2024.12.13
+
+#### Extractor changes
+- **patreon**: campaign: [Support /c/ URLs](https://github.com/yt-dlp/yt-dlp/commit/bc262bcad4d3683ceadf61a7eb87e233e72adef3) ([#11756](https://github.com/yt-dlp/yt-dlp/issues/11756)) by [bashonly](https://github.com/bashonly)
+- **soundcloud**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/f4d3e9e6dc25077b79849a31a2f67f93fdc01e62) ([#11777](https://github.com/yt-dlp/yt-dlp/issues/11777)) by [bashonly](https://github.com/bashonly)
+- **youtube**
+    - [Fix `release_date` extraction](https://github.com/yt-dlp/yt-dlp/commit/d5e2a379f2adcb28bc48c7d9e90716d7278f89d2) ([#11759](https://github.com/yt-dlp/yt-dlp/issues/11759)) by [MutantPiggieGolem1](https://github.com/MutantPiggieGolem1)
+    - [Fix signature function extraction for `2f1832d2`](https://github.com/yt-dlp/yt-dlp/commit/5460cd91891bf613a2065e2fc278d9903c37a127) ([#11801](https://github.com/yt-dlp/yt-dlp/issues/11801)) by [bashonly](https://github.com/bashonly)
+    - [Prioritize original language over auto-dubbed audio](https://github.com/yt-dlp/yt-dlp/commit/dc3c4fddcc653989dae71fc563d82a308fc898cc) ([#11803](https://github.com/yt-dlp/yt-dlp/issues/11803)) by [bashonly](https://github.com/bashonly)
+    - search_url: [Fix playlist searches](https://github.com/yt-dlp/yt-dlp/commit/f6c73aad5f1a67544bea137ebd9d1e22e0e56567) ([#11782](https://github.com/yt-dlp/yt-dlp/issues/11782)) by [Crypto90](https://github.com/Crypto90)
+
+#### Misc. changes
+- **cleanup**: [Make more playlist entries lazy](https://github.com/yt-dlp/yt-dlp/commit/54216696261bc07cacd9a837c501d9e0b7fed09e) ([#11763](https://github.com/yt-dlp/yt-dlp/issues/11763)) by [seproDev](https://github.com/seproDev)
+
 ### 2024.12.06
 
 #### Core changes

diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py
@@ -73,6 +73,11 @@
         '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',
         'MyOSJXtKI3m-uME_jv7-pT12gOFC02RFkGoqWpzE0Cs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',
     ),
+    (
+        'https://www.youtube.com/s/player/2f1832d2/player_ias.vflset/en_US/base.js',
+        '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',
+        '0QJ8wRAIgXmPlOPSBkkUs1bYFYlJCfe29xxAj7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJ2OySqa0q',
+    ),
 ]
 
 _NSIG_TESTS = [
@@ -192,6 +197,10 @@
         'https://www.youtube.com/s/player/3bb1f723/player_ias.vflset/en_US/base.js',
         'gK15nzVyaXE9RsMP3z', 'ZFFWFLPWx9DEgQ',
     ),
+    (
+        'https://www.youtube.com/s/player/2f1832d2/player_ias.vflset/en_US/base.js',
+        'YWt1qdbe8SAfkoPHW5d', 'RrRjWQOJmBiP',
+    ),
 ]
 
 

diff --git a/yt_dlp/extractor/brightcove.py b/yt_dlp/extractor/brightcove.py
@@ -31,6 +31,7 @@
     update_url_query,
     url_or_none,
 )
+from ..utils.traversal import traverse_obj
 
 
 class BrightcoveLegacyIE(InfoExtractor):
@@ -935,8 +936,8 @@ def extract_policy_key():
 
         if content_type == 'playlist':
             return self.playlist_result(
-                [self._parse_brightcove_metadata(vid, vid.get('id'), headers)
-                 for vid in json_data.get('videos', []) if vid.get('id')],
+                (self._parse_brightcove_metadata(vid, vid['id'], headers)
+                 for vid in traverse_obj(json_data, ('videos', lambda _, v: v['id']))),
                 json_data.get('id'), json_data.get('name'),
                 json_data.get('description'))
 

diff --git a/yt_dlp/extractor/dvtv.py b/yt_dlp/extractor/dvtv.py
@@ -162,7 +162,7 @@ def _real_extract(self, url):
         items = re.findall(r'(?s)playlist\.push\(({.+?})\);', webpage)
         if items:
             return self.playlist_result(
-                [self._parse_video_metadata(i, video_id, timestamp) for i in items],
+                (self._parse_video_metadata(i, video_id, timestamp) for i in items),
                 video_id, self._html_search_meta('twitter:title', webpage))
 
         item = self._search_regex(

diff --git a/yt_dlp/extractor/nytimes.py b/yt_dlp/extractor/nytimes.py
@@ -343,7 +343,7 @@ def _real_extract(self, url):
         if media_ids:
             media_ids.append(lead_video_id)
             return self.playlist_result(
-                [self._extract_video(media_id) for media_id in media_ids], page_id, title, description)
+                map(self._extract_video, media_ids), page_id, title, description)
 
         return {
             **self._extract_video(lead_video_id),

diff --git a/yt_dlp/extractor/patreon.py b/yt_dlp/extractor/patreon.py
@@ -457,7 +457,7 @@ class PatreonCampaignIE(PatreonBaseIE):
     _VALID_URL = r'''(?x)
         https?://(?:www\.)?patreon\.com/(?:
             (?:m|api/campaigns)/(?P<campaign_id>\d+)|
-            (?P<vanity>(?!creation[?/]|posts/|rss[?/])[\w-]+)
+            (?:c/)?(?P<vanity>(?!creation[?/]|posts/|rss[?/])[\w-]+)
         )(?:/posts)?/?(?:$|[?#])'''
     _TESTS = [{
         'url': 'https://www.patreon.com/dissonancepod/',
@@ -509,6 +509,26 @@ class PatreonCampaignIE(PatreonBaseIE):
             'thumbnail': r're:^https?://.*$',
         },
         'playlist_mincount': 201,
+    }, {
+        'url': 'https://www.patreon.com/c/OgSog',
+        'info_dict': {
+            'id': '8504388',
+            'title': 'OGSoG',
+            'description': r're:(?s)Hello and welcome to our Patreon page. We are Mari, Lasercorn, .+',
+            'channel': 'OGSoG',
+            'channel_id': '8504388',
+            'channel_url': 'https://www.patreon.com/OgSog',
+            'uploader_url': 'https://www.patreon.com/OgSog',
+            'uploader_id': '72323575',
+            'uploader': 'David Moss',
+            'thumbnail': r're:https?://.+/.+',
+            'channel_follower_count': int,
+            'age_limit': 0,
+        },
+        'playlist_mincount': 331,
+    }, {
+        'url': 'https://www.patreon.com/c/OgSog/posts',
+        'only_matching': True,
     }, {
         'url': 'https://www.patreon.com/dissonancepod/posts',
         'only_matching': True,

diff --git a/yt_dlp/extractor/soundcloud.py b/yt_dlp/extractor/soundcloud.py
@@ -210,6 +210,7 @@ def _extract_info_dict(self, info, full_title=None, secret_token=None, extract_f
 
         format_urls = set()
         formats = []
+        has_drm = False
         query = {'client_id': self._CLIENT_ID}
         if secret_token:
             query['secret_token'] = secret_token
@@ -245,6 +246,7 @@ def _extract_info_dict(self, info, full_title=None, secret_token=None, extract_f
                         'url': format_url,
                         'quality': 10,
                         'format_note': 'Original',
+                        'vcodec': 'none',
                     })
 
         def invalid_url(url):
@@ -259,6 +261,9 @@ def invalid_url(url):
             preset_base = preset.partition('_')[0]
 
             protocol = traverse_obj(t, ('format', 'protocol', {str})) or 'http'
+            if protocol.startswith(('ctr-', 'cbc-')):
+                has_drm = True
+                continue
             if protocol == 'progressive':
                 protocol = 'http'
             if protocol != 'hls' and '/hls' in format_url:
@@ -315,8 +320,11 @@ def invalid_url(url):
                 'preference': -10 if is_preview else None,
             })
 
-        if not formats and info.get('policy') == 'BLOCK':
-            self.raise_geo_restricted(metadata_available=True)
+        if not formats:
+            if has_drm:
+                self.report_drm(track_id)
+            if info.get('policy') == 'BLOCK':
+                self.raise_geo_restricted(metadata_available=True)
 
         user = info.get('user') or {}
 

diff --git a/yt_dlp/extractor/vidyard.py b/yt_dlp/extractor/vidyard.py
@@ -421,5 +421,5 @@ def _real_extract(self, url):
             return self._process_video_json(video_json['chapters'][0], video_id)
 
         return self.playlist_result(
-            [self._process_video_json(chapter, video_id) for chapter in video_json['chapters']],
+            (self._process_video_json(chapter, video_id) for chapter in video_json['chapters']),
             str(video_json['playerUuid']), video_json.get('name'))
diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py
@@ -518,11 +518,12 @@ def ucid_or_none(self, ucid):
         return self._search_regex(rf'^({self._YT_CHANNEL_UCID_RE})$', ucid, 'UC-id', default=None)
 
     def handle_or_none(self, handle):
-        return self._search_regex(rf'^({self._YT_HANDLE_RE})$', handle, '@-handle', default=None)
+        return self._search_regex(rf'^({self._YT_HANDLE_RE})$', urllib.parse.unquote(handle or ''),
+                                  '@-handle', default=None)
 
     def handle_from_url(self, url):
         return self._search_regex(rf'^(?:https?://(?:www\.)?youtube\.com)?/({self._YT_HANDLE_RE})',
-                                  url, 'channel handle', default=None)
+                                  urllib.parse.unquote(url or ''), 'channel handle', default=None)
 
     def ucid_from_url(self, url):
         return self._search_regex(rf'^(?:https?://(?:www\.)?youtube\.com)?/({self._YT_CHANNEL_UCID_RE})',
@@ -1496,7 +1497,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         },
         # Age-gate videos. See https://github.com/yt-dlp/yt-dlp/pull/575#issuecomment-888837000
         {
-            'note': 'Embed allowed age-gate video',
+            'note': 'Embed allowed age-gate video; works with web_embedded',
             'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
             'info_dict': {
                 'id': 'HtVdAasjOgU',
@@ -1526,7 +1527,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'heatmap': 'count:100',
                 'timestamp': 1401991663,
             },
-            'skip': 'Age-restricted; requires authentication',
         },
         {
             'note': 'Age-gate video with embed allowed in public site',
@@ -2802,6 +2802,35 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'extractor_args': {'youtube': {'player_client': ['ios'], 'player_skip': ['webpage']}},
             },
         },
+        {
+            # uploader_id has non-ASCII characters that are percent-encoded in YT's JSON
+            'url': 'https://www.youtube.com/shorts/18NGQq7p3LY',
+            'info_dict': {
+                'id': '18NGQq7p3LY',
+                'ext': 'mp4',
+                'title': '아이브 이서 장원영 리즈 삐끼삐끼 챌린지',
+                'description': '',
+                'uploader': 'ㅇㅇ',
+                'uploader_id': '@으아-v1k',
+                'uploader_url': 'https://www.youtube.com/@으아-v1k',
+                'channel': 'ㅇㅇ',
+                'channel_id': 'UCC25oTm2J7ZVoi5TngOHg9g',
+                'channel_url': 'https://www.youtube.com/channel/UCC25oTm2J7ZVoi5TngOHg9g',
+                'thumbnail': r're:https?://.+/.+\.jpg',
+                'playable_in_embed': True,
+                'age_limit': 0,
+                'duration': 3,
+                'timestamp': 1724306170,
+                'upload_date': '20240822',
+                'availability': 'public',
+                'live_status': 'not_live',
+                'view_count': int,
+                'like_count': int,
+                'channel_follower_count': int,
+                'categories': ['People & Blogs'],
+                'tags': [],
+            },
+        },
     ]
 
     _WEBPAGE_TESTS = [
@@ -3128,9 +3157,9 @@ def _parse_sig_js(self, jscode):
         # ;N&&(N=sig(decodeURIComponent(N)),J.set(R,encodeURIComponent(N)));return J};
         # {var H=u,k=f.sp,v=sig(decodeURIComponent(f.s));H.set(k,encodeURIComponent(v))}
         funcname = self._search_regex(
-            (r'\b(?P<var>[a-zA-Z0-9$]+)&&\((?P=var)=(?P<sig>[a-zA-Z0-9$]{2,})\(decodeURIComponent\((?P=var)\)\)',
-             r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*(?P<arg>[a-zA-Z0-9$]+)\s*\)\s*{\s*(?P=arg)\s*=\s*(?P=arg)\.split\(\s*""\s*\)\s*;\s*[^}]+;\s*return\s+(?P=arg)\.join\(\s*""\s*\)',
-             r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)(?:;[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\))?',
+            (r'\b(?P<var>[a-zA-Z0-9_$]+)&&\((?P=var)=(?P<sig>[a-zA-Z0-9_$]{2,})\(decodeURIComponent\((?P=var)\)\)',
+             r'(?P<sig>[a-zA-Z0-9_$]+)\s*=\s*function\(\s*(?P<arg>[a-zA-Z0-9_$]+)\s*\)\s*{\s*(?P=arg)\s*=\s*(?P=arg)\.split\(\s*""\s*\)\s*;\s*[^}]+;\s*return\s+(?P=arg)\.join\(\s*""\s*\)',
+             r'(?:\b|[^a-zA-Z0-9_$])(?P<sig>[a-zA-Z0-9_$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)(?:;[a-zA-Z0-9_$]{2}\.[a-zA-Z0-9_$]{2}\(a,\d+\))?',
              # Old patterns
              r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
              r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
@@ -3984,10 +4013,20 @@ def append_client(*client_names):
                 else:
                     prs.append(pr)
 
+            # web_embedded can work around age-gate and age-verification for some embeddable videos
+            if self._is_agegated(pr) and variant != 'web_embedded':
+                append_client(f'web_embedded.{base_client}')
+            # Unauthenticated users will only get web_embedded client formats if age-gated
+            if self._is_agegated(pr) and not self.is_authenticated:
+                self.to_screen(
+                    f'{video_id}: This video is age-restricted; some formats may be missing '
+                    f'without authentication. {self._login_hint()}', only_once=True)
+
             ''' This code is pointless while web_creator is in _DEFAULT_AUTHED_CLIENTS
             # EU countries require age-verification for accounts to access age-restricted videos
             # If account is not age-verified, _is_agegated() will be truthy for non-embedded clients
-            if self.is_authenticated and self._is_agegated(pr):
+            embedding_is_disabled = variant == 'web_embedded' and self._is_unplayable(pr)
+            if self.is_authenticated and (self._is_agegated(pr) or embedding_is_disabled):
                 self.to_screen(
                     f'{video_id}: This video is age-restricted and YouTube is requiring '
                     'account age-verification; some formats may be missing', only_once=True)
@@ -4068,10 +4107,12 @@ def build_fragments(f):
                 if height:
                     res_qualities[height] = quality
 
+            display_name = audio_track.get('displayName') or ''
+            is_original = 'original' in display_name.lower()
+            is_descriptive = 'descriptive' in display_name.lower()
             is_default = audio_track.get('audioIsDefault')
-            is_descriptive = 'descriptive' in (audio_track.get('displayName') or '').lower()
             language_code = audio_track.get('id', '').split('.')[0]
-            if language_code and is_default:
+            if language_code and (is_original or (is_default and not original_language)):
                 original_language = language_code
 
             # FORMAT_STREAM_TYPE_OTF(otf=1) requires downloading the init fragment
@@ -4152,7 +4193,7 @@ def build_fragments(f):
                 'filesize': int_or_none(fmt.get('contentLength')),
                 'format_id': f'{itag}{"-drc" if fmt.get("isDrc") else ""}',
                 'format_note': join_nonempty(
-                    join_nonempty(audio_track.get('displayName'), is_default and ' (default)', delim=''),
+                    join_nonempty(display_name, is_default and ' (default)', delim=''),
                     name, fmt.get('isDrc') and 'DRC',
                     try_get(fmt, lambda x: x['projectionType'].replace('RECTANGULAR', '').lower()),
                     try_get(fmt, lambda x: x['spatialAudioType'].replace('SPATIAL_AUDIO_TYPE_', '').lower()),
@@ -4171,7 +4212,7 @@ def build_fragments(f):
                 'url': fmt_url,
                 'width': int_or_none(fmt.get('width')),
                 'language': join_nonempty(language_code, 'desc' if is_descriptive else '') or None,
-                'language_preference': PREFERRED_LANG_VALUE if is_default else -10 if is_descriptive else -1,
+                'language_preference': PREFERRED_LANG_VALUE if is_original else 5 if is_default else -10 if is_descriptive else -1,
                 # Strictly de-prioritize broken, damaged and 3gp formats
                 'preference': -20 if is_broken else -10 if is_damaged else -2 if itag == '17' else None,
             }
@@ -4690,7 +4731,7 @@ def process_language(container, base_url, lang_code, sub_name, query):
                     (?=(?P<artist>[^\n]+))(?P=artist)\n+
                     (?=(?P<album>[^\n]+))(?P=album)\n
                     (?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?
-                    (?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?
+                    (?:.+?Released\ on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?
                     (.+?\nArtist\s*:\s*
                         (?=(?P<clean_artist>[^\n]+))(?P=clean_artist)\n
                     )?.+\nAuto-generated\ by\ YouTube\.\s*$

diff --git a/yt_dlp/update.py b/yt_dlp/update.py
@@ -65,9 +65,14 @@ def _get_variant_and_executable_path():
             machine = '_legacy' if version_tuple(platform.mac_ver()[0]) < (10, 15) else ''
         else:
             machine = f'_{platform.machine().lower()}'
+            is_64bits = sys.maxsize > 2**32
             # Ref: https://en.wikipedia.org/wiki/Uname#Examples
             if machine[1:] in ('x86', 'x86_64', 'amd64', 'i386', 'i686'):
-                machine = '_x86' if platform.architecture()[0][:2] == '32' else ''
+                machine = '_x86' if not is_64bits else ''
+            # platform.machine() on 32-bit raspbian OS may return 'aarch64', so check "64-bitness"
+            # See: https://github.com/yt-dlp/yt-dlp/issues/11813
+            elif machine[1:] == 'aarch64' and not is_64bits:
+                machine = '_armv7l'
             # sys.executable returns a /tmp/ path for staticx builds (linux_static)
             # Ref: https://staticx.readthedocs.io/en/latest/usage.html#run-time-information
             if static_exe_path := os.getenv('STATICX_PROG_PATH'):
@@ -525,11 +530,16 @@ def filename(self):
     @functools.cached_property
     def cmd(self):
         """The command-line to run the executable, if known"""
+        argv = None
         # There is no sys.orig_argv in py < 3.10. Also, it can be [] when frozen
         if getattr(sys, 'orig_argv', None):
-            return sys.orig_argv
+            argv = sys.orig_argv
         elif getattr(sys, 'frozen', False):
-            return sys.argv
+            argv = sys.argv
+        # linux_static exe's argv[0] will be /tmp/staticx-NNNN/yt-dlp_linux if we don't fixup here
+        if argv and os.getenv('STATICX_PROG_PATH'):
+            argv = [self.filename, *argv[1:]]
+        return argv
 
     def restart(self):
         """Restart the executable"""

diff --git a/yt_dlp/version.py b/yt_dlp/version.py
@@ -1,8 +1,8 @@
 # Autogenerated by devscripts/update-version.py
 
-__version__ = '2024.12.06'
+__version__ = '2024.12.13'
 
-RELEASE_GIT_HEAD = '4bd2655398aed450456197a6767639114a24eac2'
+RELEASE_GIT_HEAD = '54216696261bc07cacd9a837c501d9e0b7fed09e'
 
 VARIANT = None
 
@@ -12,4 +12,4 @@
 
 ORIGIN = 'yt-dlp/yt-dlp'
 
-_pkg_version = '2024.12.06'
+_pkg_version = '2024.12.13'