From 5310fa87f6cb7f66bf42e2520878952fbf6b1652 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 10 Oct 2024 16:29:22 -0500 Subject: [PATCH 01/23] [ie/weverse] Fix extractor (#11215) Closes #11213 Authored by: bashonly --- yt_dlp/extractor/weverse.py | 60 +++++++++++++++++++++++++++---------- 1 file changed, 45 insertions(+), 15 deletions(-) diff --git a/yt_dlp/extractor/weverse.py b/yt_dlp/extractor/weverse.py index c94ca9db9796..6f1a8b95d88d 100644 --- a/yt_dlp/extractor/weverse.py +++ b/yt_dlp/extractor/weverse.py @@ -27,8 +27,9 @@ class WeverseBaseIE(InfoExtractor): _NETRC_MACHINE = 'weverse' - _ACCOUNT_API_BASE = 'https://accountapi.weverse.io/web/api/v2' + _ACCOUNT_API_BASE = 'https://accountapi.weverse.io/web/api' _API_HEADERS = { + 'Accept': 'application/json', 'Referer': 'https://weverse.io/', 'WEV-device-Id': str(uuid.uuid4()), } @@ -39,14 +40,14 @@ def _perform_login(self, username, password): headers = { 'x-acc-app-secret': '5419526f1c624b38b10787e5c10b2a7a', - 'x-acc-app-version': '2.2.6', + 'x-acc-app-version': '3.3.6', 'x-acc-language': 'en', 'x-acc-service-id': 'weverse', 'x-acc-trace-id': str(uuid.uuid4()), 'x-clog-user-device-id': str(uuid.uuid4()), } valid_username = traverse_obj(self._download_json( - f'{self._ACCOUNT_API_BASE}/signup/email/status', None, note='Checking username', + f'{self._ACCOUNT_API_BASE}/v2/signup/email/status', None, note='Checking username', query={'email': username}, headers=headers, expected_status=(400, 404)), 'hasPassword') if not valid_username: raise ExtractorError('Invalid username provided', expected=True) @@ -54,8 +55,9 @@ def _perform_login(self, username, password): headers['content-type'] = 'application/json' try: auth = self._download_json( - f'{self._ACCOUNT_API_BASE}/auth/token/by-credentials', None, data=json.dumps({ + f'{self._ACCOUNT_API_BASE}/v3/auth/token/by-credentials', None, data=json.dumps({ 'email': username, + 'otpSessionId': 'BY_PASS', 'password': 
password, }, separators=(',', ':')).encode(), headers=headers, note='Logging in') except ExtractorError as e: @@ -78,8 +80,10 @@ def _call_api(self, ep, video_id, data=None, note='Downloading API JSON'): # From https://ssl.pstatic.net/static/wevweb/2_3_2_11101725/public/static/js/main.e206f7c1.js: key = b'1b9cb6378d959b45714bec49971ade22e6e24e42' api_path = update_url_query(ep, { + # 'gcc': 'US', 'appId': 'be4d79eb8fc7bd008ee82c8ec4ff6fd4', 'language': 'en', + 'os': 'WEB', 'platform': 'WEB', 'wpf': 'pc', }) @@ -152,7 +156,7 @@ def _parse_post_meta(self, metadata): 'description': ((('extension', 'mediaInfo', 'body'), 'body'), {str}), 'uploader': ('author', 'profileName', {str}), 'uploader_id': ('author', 'memberId', {str}), - 'creator': ('community', 'communityName', {str}), + 'creators': ('community', 'communityName', {str}, all), 'channel_id': (('community', 'author'), 'communityId', {str_or_none}), 'duration': ('extension', 'video', 'playTime', {float_or_none}), 'timestamp': ('publishedAt', {lambda x: int_or_none(x, 1000)}), @@ -196,7 +200,7 @@ class WeverseIE(WeverseBaseIE): 'channel': 'billlie', 'channel_id': '72', 'channel_url': 'https://weverse.io/billlie', - 'creator': 'Billlie', + 'creators': ['Billlie'], 'timestamp': 1666262062, 'upload_date': '20221020', 'release_timestamp': 1666262058, @@ -222,7 +226,7 @@ class WeverseIE(WeverseBaseIE): 'channel': 'lesserafim', 'channel_id': '47', 'channel_url': 'https://weverse.io/lesserafim', - 'creator': 'LE SSERAFIM', + 'creators': ['LE SSERAFIM'], 'timestamp': 1659353400, 'upload_date': '20220801', 'release_timestamp': 1659353400, @@ -286,7 +290,7 @@ def _real_extract(self, url): elif live_status == 'is_live': video_info = self._call_api( - f'/video/v1.0/lives/{api_video_id}/playInfo?preview.format=json&preview.version=v2', + f'/video/v1.2/lives/{api_video_id}/playInfo?preview.format=json&preview.version=v2', video_id, note='Downloading live JSON') playback = self._parse_json(video_info['lipPlayback'], video_id) 
m3u8_url = traverse_obj(playback, ( @@ -302,7 +306,7 @@ def _real_extract(self, url): else: infra_video_id = post['extension']['video']['infraVideoId'] in_key = self._call_api( - f'/video/v1.0/vod/{api_video_id}/inKey?preview=false', video_id, + f'/video/v1.1/vod/{api_video_id}/inKey?preview=false', video_id, data=b'{}', note='Downloading VOD API key')['inKey'] video_info = self._download_json( @@ -347,7 +351,6 @@ class WeverseMediaIE(WeverseBaseIE): _VALID_URL = r'https?://(?:www\.|m\.)?weverse\.io/(?P[^/?#]+)/media/(?P[\d-]+)' _TESTS = [{ 'url': 'https://weverse.io/billlie/media/4-116372884', - 'md5': '8efc9cfd61b2f25209eb1a5326314d28', 'info_dict': { 'id': 'e-C9wLSQs6o', 'ext': 'mp4', @@ -358,8 +361,9 @@ class WeverseMediaIE(WeverseBaseIE): 'channel_url': 'https://www.youtube.com/channel/UCyc9sUCxELTDK9vELO5Fzeg', 'uploader': 'Billlie', 'uploader_id': '@Billlie', - 'uploader_url': 'http://www.youtube.com/@Billlie', + 'uploader_url': 'https://www.youtube.com/@Billlie', 'upload_date': '20230403', + 'timestamp': 1680533992, 'duration': 211, 'age_limit': 0, 'playable_in_embed': True, @@ -372,6 +376,8 @@ class WeverseMediaIE(WeverseBaseIE): 'thumbnail': 'https://i.ytimg.com/vi/e-C9wLSQs6o/maxresdefault.jpg', 'categories': ['Entertainment'], 'tags': 'count:7', + 'channel_is_verified': True, + 'heatmap': 'count:100', }, }, { 'url': 'https://weverse.io/billlie/media/3-102914520', @@ -386,7 +392,7 @@ class WeverseMediaIE(WeverseBaseIE): 'channel': 'billlie', 'channel_id': '72', 'channel_url': 'https://weverse.io/billlie', - 'creator': 'Billlie', + 'creators': ['Billlie'], 'timestamp': 1662174000, 'upload_date': '20220903', 'release_timestamp': 1662174000, @@ -432,7 +438,7 @@ class WeverseMomentIE(WeverseBaseIE): 'uploader_id': '66a07e164b56a696ee71c99315ffe27b', 'channel': 'secretnumber', 'channel_id': '56', - 'creator': 'SECRET NUMBER', + 'creators': ['SECRET NUMBER'], 'duration': 10, 'upload_date': '20230405', 'timestamp': 1680653968, @@ -441,7 +447,6 @@ class 
WeverseMomentIE(WeverseBaseIE): 'comment_count': int, 'availability': 'needs_auth', }, - 'skip': 'Moment has expired', }] def _real_extract(self, url): @@ -571,7 +576,7 @@ class WeverseLiveIE(WeverseBaseIE): 'channel': 'purplekiss', 'channel_id': '35', 'channel_url': 'https://weverse.io/purplekiss', - 'creator': 'PURPLE KISS', + 'creators': ['PURPLE KISS'], 'timestamp': 1680780892, 'upload_date': '20230406', 'release_timestamp': 1680780883, @@ -584,6 +589,31 @@ class WeverseLiveIE(WeverseBaseIE): 'live_status': 'is_live', }, 'skip': 'Livestream has ended', + }, { + 'url': 'https://weverse.io/lesserafim', + 'info_dict': { + 'id': '4-181521628', + 'ext': 'mp4', + 'title': r're:심심해서요', + 'description': '', + 'uploader': '채채🤎', + 'uploader_id': 'd49b8b06f3cc1d92d655b25ab27ac2e7', + 'channel': 'lesserafim', + 'channel_id': '47', + 'creators': ['LE SSERAFIM'], + 'channel_url': 'https://weverse.io/lesserafim', + 'timestamp': 1728570273, + 'upload_date': '20241010', + 'release_timestamp': 1728570264, + 'release_date': '20241010', + 'thumbnail': r're:https://phinf\.wevpstatic\.net/.+\.png', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'availability': 'needs_auth', + 'live_status': 'is_live', + }, + 'skip': 'Livestream has ended', }, { 'url': 'https://weverse.io/billlie/', 'only_matching': True, From ceaea731b6e314dbbdfb2e358d7677785ed0b4fc Mon Sep 17 00:00:00 2001 From: vvto33 <54504675+vvto33@users.noreply.github.com> Date: Fri, 11 Oct 2024 06:42:34 +0900 Subject: [PATCH 02/23] [ie/TVer] Support series URLs (#9507) Also improve thumbnails extraction Authored by: vvto33, pzhlkj6612 Co-authored-by: Mozi <29089388+pzhlkj6612@users.noreply.github.com> --- yt_dlp/extractor/tver.py | 148 ++++++++++++++++++++++++++------------- 1 file changed, 101 insertions(+), 47 deletions(-) diff --git a/yt_dlp/extractor/tver.py b/yt_dlp/extractor/tver.py index c13832c6f508..a8865fe6498a 100644 --- a/yt_dlp/extractor/tver.py +++ b/yt_dlp/extractor/tver.py @@ -6,11 +6,12 
@@ str_or_none, strip_or_none, traverse_obj, + update_url_query, ) class TVerIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tver\.jp/(?:(?Plp|corner|series|episodes?|feature|tokyo2020/video|olympic/paris2024/video)/)+(?P[a-zA-Z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?tver\.jp/(?:(?Plp|corner|series|episodes?|feature)/)+(?P[a-zA-Z0-9]+)' _TESTS = [{ 'skip': 'videos are only available for 7 days', 'url': 'https://tver.jp/episodes/ep83nf3w4p', @@ -21,80 +22,115 @@ class TVerIE(InfoExtractor): 'episode': '売り場席巻のチーズSP&財前直見×森泉親子の脱東京暮らし密着!', 'alt_title': '売り場席巻のチーズSP&財前直見×森泉親子の脱東京暮らし密着!', 'channel': 'テレビ朝日', - }, - 'add_ie': ['BrightcoveNew'], - }, { - 'url': 'https://tver.jp/olympic/paris2024/video/6359578055112/', - 'info_dict': { - 'id': '6359578055112', + 'id': 'ep83nf3w4p', 'ext': 'mp4', - 'title': '堀米雄斗 金メダルで五輪連覇!「みんなの応援が最後に乗れたカギ」', - 'timestamp': 1722279928, - 'upload_date': '20240729', - 'tags': ['20240729', 'japanese', 'japanmedal', 'paris'], - 'uploader_id': '4774017240001', - 'thumbnail': r're:https?://[^/?#]+boltdns\.net/[^?#]+/1920x1080/match/image\.jpg', - 'duration': 670.571, + 'onair_label': '5月3日(火)放送分', + 'ext_title': '家事ヤロウ!!! 売り場席巻のチーズSP&財前直見×森泉親子の脱東京暮らし密着! 
テレビ朝日 5月3日(火)放送分', }, - 'params': {'skip_download': 'm3u8'}, + 'add_ie': ['BrightcoveNew'], }, { 'url': 'https://tver.jp/corner/f0103888', 'only_matching': True, }, { 'url': 'https://tver.jp/lp/f0033031', 'only_matching': True, + }, { + 'url': 'https://tver.jp/series/srtxft431v', + 'info_dict': { + 'id': 'srtxft431v', + 'title': '名探偵コナン', + }, + 'playlist': [ + { + 'md5': '779ffd97493ed59b0a6277ea726b389e', + 'info_dict': { + 'id': 'ref:conan-1137-241005', + 'ext': 'mp4', + 'title': '名探偵コナン #1137「行列店、味変の秘密」', + 'uploader_id': '5330942432001', + 'tags': [], + 'channel': '読売テレビ', + 'series': '名探偵コナン', + 'description': 'md5:601fccc1d2430d942a2c8068c4b33eb5', + 'episode': '#1137「行列店、味変の秘密」', + 'duration': 1469.077, + 'timestamp': 1728030405, + 'upload_date': '20241004', + 'alt_title': '名探偵コナン #1137「行列店、味変の秘密」 読売テレビ 10月5日(土)放送分', + 'thumbnail': r're:https://.+\.jpg', + }, + }], + }, { + 'url': 'https://tver.jp/series/sru35hwdd2', + 'info_dict': { + 'id': 'sru35hwdd2', + 'title': '神回だけ見せます!', + }, + 'playlist_count': 11, + }, { + 'url': 'https://tver.jp/series/srkq2shp9d', + 'only_matching': True, }] BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' - _PLATFORM_UID = None - _PLATFORM_TOKEN = None + _HEADERS = {'x-tver-platform-type': 'web'} + _PLATFORM_QUERY = {} def _real_initialize(self): - create_response = self._download_json( - 'https://platform-api.tver.jp/v2/api/platform_users/browser/create', None, - note='Creating session', data=b'device_type=pc', headers={ - 'Origin': 'https://s.tver.jp', - 'Referer': 'https://s.tver.jp/', - 'Content-Type': 'application/x-www-form-urlencoded', + session_info = self._download_json( + 'https://platform-api.tver.jp/v2/api/platform_users/browser/create', + None, 'Creating session', data=b'device_type=pc') + self._PLATFORM_QUERY = traverse_obj(session_info, ('result', { + 'platform_uid': 'platform_uid', + 'platform_token': 'platform_token', + })) + + def _call_platform_api(self, path, 
video_id, note=None, fatal=True, query=None): + return self._download_json( + f'https://platform-api.tver.jp/service/api/{path}', video_id, note, + fatal=fatal, headers=self._HEADERS, query={ + **self._PLATFORM_QUERY, + **(query or {}), }) - self._PLATFORM_UID = traverse_obj(create_response, ('result', 'platform_uid')) - self._PLATFORM_TOKEN = traverse_obj(create_response, ('result', 'platform_token')) + + def _yield_episode_ids_for_series(self, series_id): + seasons_info = self._download_json( + f'https://service-api.tver.jp/api/v1/callSeriesSeasons/{series_id}', + series_id, 'Downloading seasons info', headers=self._HEADERS) + for season_id in traverse_obj( + seasons_info, ('result', 'contents', lambda _, v: v['type'] == 'season', 'content', 'id', {str})): + episodes_info = self._call_platform_api( + f'v1/callSeasonEpisodes/{season_id}', series_id, f'Downloading season {season_id} episodes info') + yield from traverse_obj(episodes_info, ( + 'result', 'contents', lambda _, v: v['type'] == 'episode', 'content', 'id', {str})) def _real_extract(self, url): video_id, video_type = self._match_valid_url(url).group('id', 'type') - if video_type == 'olympic/paris2024/video': - # Player ID is taken from .content.brightcove.E200.pro.pc.account_id: - # https://tver.jp/olympic/paris2024/req/api/hook?q=https%3A%2F%2Folympic-assets.tver.jp%2Fweb-static%2Fjson%2Fconfig.json&d= - return self.url_result(smuggle_url( - self.BRIGHTCOVE_URL_TEMPLATE % ('4774017240001', video_id), - {'geo_countries': ['JP']}), 'BrightcoveNew') + if video_type == 'series': + series_info = self._call_platform_api( + f'v2/callSeries/{video_id}', video_id, 'Downloading series info') + return self.playlist_from_matches( + self._yield_episode_ids_for_series(video_id), video_id, + traverse_obj(series_info, ('result', 'content', 'content', 'title', {str})), + ie=TVerIE, getter=lambda x: f'https://tver.jp/episodes/{x}') - elif video_type not in {'series', 'episodes'}: + if video_type != 'episodes': webpage = 
self._download_webpage(url, video_id, note='Resolving to new URL') video_id = self._match_id(self._search_regex( (r'canonical"\s*href="(https?://tver\.jp/[^"]+)"', r'&link=(https?://tver\.jp/[^?&]+)[?&]'), webpage, 'url regex')) - episode_info = self._download_json( - f'https://platform-api.tver.jp/service/api/v1/callEpisode/{video_id}?require_data=mylist,later[epefy106ur],good[epefy106ur],resume[epefy106ur]', - video_id, fatal=False, - query={ - 'platform_uid': self._PLATFORM_UID, - 'platform_token': self._PLATFORM_TOKEN, - }, headers={ - 'x-tver-platform-type': 'web', + episode_info = self._call_platform_api( + f'v1/callEpisode/{video_id}', video_id, 'Downloading episode info', fatal=False, query={ + 'require_data': 'mylist,later[epefy106ur],good[epefy106ur],resume[epefy106ur]', }) episode_content = traverse_obj( episode_info, ('result', 'episode', 'content')) or {} + version = traverse_obj(episode_content, ('version', {str_or_none}), default='5') video_info = self._download_json( - f'https://statics.tver.jp/content/episode/{video_id}.json', video_id, - query={ - 'v': str_or_none(episode_content.get('version')) or '5', - }, headers={ - 'Origin': 'https://tver.jp', - 'Referer': 'https://tver.jp/', - }) + f'https://statics.tver.jp/content/episode/{video_id}.json', video_id, 'Downloading video info', + query={'v': version}, headers={'Referer': 'https://tver.jp/'}) p_id = video_info['video']['accountID'] r_id = traverse_obj(video_info, ('video', ('videoRefID', 'videoID')), get_all=False) if not r_id: @@ -110,6 +146,23 @@ def _real_extract(self, url): provider = str_or_none(episode_content.get('productionProviderName')) onair_label = str_or_none(episode_content.get('broadcastDateLabel')) + thumbnails = [ + { + 'id': quality, + 'url': update_url_query( + f'https://statics.tver.jp/images/content/thumbnail/episode/{quality}/{video_id}.jpg', + {'v': version}), + 'width': width, + 'height': height, + } + for quality, width, height in [ + ('small', 480, 270), + ('medium', 
640, 360), + ('large', 960, 540), + ('xlarge', 1280, 720), + ] + ] + return { '_type': 'url_transparent', 'title': title, @@ -119,6 +172,7 @@ def _real_extract(self, url): 'alt_title': join_nonempty(title, provider, onair_label, delim=' '), 'channel': provider, 'description': str_or_none(video_info.get('description')), + 'thumbnails': thumbnails, 'url': smuggle_url( self.BRIGHTCOVE_URL_TEMPLATE % (p_id, r_id), {'geo_countries': ['JP']}), 'ie_key': 'BrightcoveNew', From f4338714241b11d9d43768ae71a25f5e952f677d Mon Sep 17 00:00:00 2001 From: 444995 <113297002+444995@users.noreply.github.com> Date: Sat, 12 Oct 2024 00:39:00 +0200 Subject: [PATCH 03/23] [ie/drtv] Fix extractor (#11141) Closes #11137 Authored by: 444995 --- yt_dlp/extractor/drtv.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/drtv.py b/yt_dlp/extractor/drtv.py index 32b684552856..ba86eb2b49a0 100644 --- a/yt_dlp/extractor/drtv.py +++ b/yt_dlp/extractor/drtv.py @@ -139,12 +139,11 @@ def _real_initialize(self): return token_response = self._download_json( - 'https://production.dr-massive.com/api/authorization/anonymous-sso', None, + 'https://isl.dr-massive.com/api/authorization/anonymous-sso', None, note='Downloading anonymous token', headers={ 'content-type': 'application/json', }, query={ - 'device': 'web_browser', - 'ff': 'idp,ldp,rpt', + 'device': 'phone_android', 'lang': 'da', 'supportFallbackToken': 'true', }, data=json.dumps({ From 9d43dcb2c5c38f443f84dfc126cd32720e1a1ad6 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 12 Oct 2024 17:47:37 +0000 Subject: [PATCH 04/23] [ie/cwtv] Fix extraction (#11230) Closes #9935 Authored by: bashonly --- yt_dlp/extractor/cwtv.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/cwtv.py b/yt_dlp/extractor/cwtv.py index 4559d3cd680a..cb432e616045 100644 --- a/yt_dlp/extractor/cwtv.py +++ b/yt_dlp/extractor/cwtv.py @@ -6,6 +6,7 @@ 
parse_iso8601, smuggle_url, str_or_none, + update_url_query, ) @@ -98,7 +99,9 @@ def _real_extract(self, url): raise ExtractorError(data['msg'], expected=True) video_data = data['video'] title = video_data['title'] - mpx_url = video_data.get('mpx_url') or f'https://link.theplatform.com/s/cwtv/media/guid/2703454149/{video_id}?formats=M3U' + mpx_url = update_url_query( + video_data.get('mpx_url') or f'https://link.theplatform.com/s/cwtv/media/guid/2703454149/{video_id}', + {'formats': 'M3U+none'}) season = str_or_none(video_data.get('season')) episode = str_or_none(video_data.get('episode')) From babb70960595e2146f06f81affc29c7e713e34e2 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 12 Oct 2024 23:23:03 +0000 Subject: [PATCH 05/23] [ie/patreon:campaign] Stricter URL matching (#11235) Redefinition of suitable() is no longer necessary Closes #11233 Authored by: bashonly --- yt_dlp/extractor/patreon.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/yt_dlp/extractor/patreon.py b/yt_dlp/extractor/patreon.py index f5cb2a5d6569..4d668cd37dc0 100644 --- a/yt_dlp/extractor/patreon.py +++ b/yt_dlp/extractor/patreon.py @@ -55,6 +55,7 @@ def _call_api(self, ep, item_id, query=None, headers=None, fatal=True, note=None class PatreonIE(PatreonBaseIE): + IE_NAME = 'patreon' _VALID_URL = r'https?://(?:www\.)?patreon\.com/(?:creation\?hid=|posts/(?:[\w-]+-)?)(?P\d+)' _TESTS = [{ 'url': 'http://www.patreon.com/creation?hid=743933', @@ -433,8 +434,12 @@ def _get_comments(self, post_id): class PatreonCampaignIE(PatreonBaseIE): - - _VALID_URL = r'https?://(?:www\.)?patreon\.com/(?!rss)(?:(?:m|api/campaigns)/(?P\d+)|(?P[-\w]+))' + IE_NAME = 'patreon:campaign' + _VALID_URL = r'''(?x) + https?://(?:www\.)?patreon\.com/(?: + (?:m|api/campaigns)/(?P\d+)| + (?P(?!creation[?/]|posts/|rss[?/])[\w-]+) + )(?:/posts)?/?(?:$|[?#])''' _TESTS = [{ 'url': 'https://www.patreon.com/dissonancepod/', 'info_dict': { @@ -496,10 
+501,6 @@ class PatreonCampaignIE(PatreonBaseIE): 'only_matching': True, }] - @classmethod - def suitable(cls, url): - return False if PatreonIE.suitable(url) else super().suitable(url) - def _entries(self, campaign_id): cursor = None params = { From c5f0f58efd8c3930de8202c15a5c53b1b635bd51 Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Sun, 13 Oct 2024 03:38:09 +0200 Subject: [PATCH 06/23] [cookies] Fix compatibility for Python <=3.9 in traceback Authored by: Grub4K --- yt_dlp/YoutubeDL.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 4f45d7faf6be..9ac6ca0d0c93 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -4120,7 +4120,8 @@ def cookiejar(self): self.params.get('cookiefile'), self.params.get('cookiesfrombrowser'), self) except CookieLoadError as error: cause = error.__context__ - self.report_error(str(cause), tb=''.join(traceback.format_exception(cause))) + # compat: <=py3.9: `traceback.format_exception` has a different signature + self.report_error(str(cause), tb=''.join(traceback.format_exception(None, cause, cause.__traceback__))) raise @property From edfd095b1917701c5046bd51f9542897c17d41a7 Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Sun, 13 Oct 2024 03:42:43 +0200 Subject: [PATCH 07/23] [ie/generic] Impersonate browser by default (#11206) Also adds `impersonate` extractor arg Authored by: Grub4K --- README.md | 1 + yt_dlp/extractor/generic.py | 9 ++++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index fbf50072dbfe..4b1ada82eda4 100644 --- a/README.md +++ b/README.md @@ -1795,6 +1795,7 @@ The following extractors use this feature: * `key_query`: Passthrough the master m3u8 URL query to its HLS AES-128 decryption key URI if no value is provided, or else apply the query string given as `key_query=VALUE`. Note that this will have no effect if the key URI is provided via the `hls_key` extractor-arg. 
Does not apply to ffmpeg * `hls_key`: An HLS AES-128 key URI *or* key (as hex), and optionally the IV (as hex), in the form of `(URI|KEY)[,IV]`; e.g. `generic:hls_key=ABCDEF1234567980,0xFEDCBA0987654321`. Passing any of these values will force usage of the native HLS downloader and override the corresponding values found in the m3u8 playlist * `is_live`: Bypass live HLS detection and manually set `live_status` - a value of `false` will set `not_live`, any other value (or no value) will set `is_live` +* `impersonate`: Target(s) to try and impersonate with the initial webpage request; e.g. `safari,chrome-110`. By default any available target will be used. Use `false` to disable impersonation #### funimation * `language`: Audio languages to extract, e.g. `funimation:language=english,japanese` diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 592800287ab2..9b5421e41d77 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -8,6 +8,7 @@ from .commonprotocols import RtmpIE from .youtube import YoutubeIE from ..compat import compat_etree_fromstring +from ..networking.impersonate import ImpersonateTarget from ..utils import ( KNOWN_EXTENSIONS, MEDIA_EXTENSIONS, @@ -2373,6 +2374,12 @@ def _real_extract(self, url): else: video_id = self._generic_id(url) + # Try to impersonate a web-browser by default if possible + # Skip impersonation if not available to omit the warning + impersonate = self._configuration_arg('impersonate', ['']) + if 'false' in impersonate or not self._downloader._impersonate_target_available(ImpersonateTarget()): + impersonate = None + # Some webservers may serve compressed content of rather big size (e.g. gzipped flac) # making it impossible to download only chunk of the file (yet we need only 512kB to # test whether it's HTML or not). 
According to yt-dlp default Accept-Encoding @@ -2384,7 +2391,7 @@ def _real_extract(self, url): full_response = self._request_webpage(url, video_id, headers=filter_dict({ 'Accept-Encoding': 'identity', 'Referer': smuggled_data.get('referer'), - })) + }), impersonate=impersonate) new_url = full_response.url if new_url != extract_basic_auth(url)[0]: self.report_following_redirect(new_url) From 1a830394a21a81a3e9918f9e175abc9fbb21f089 Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Sun, 13 Oct 2024 03:50:31 +0200 Subject: [PATCH 08/23] [build] `make_lazy_extractors`: Force running without plugins (#11205) Authored by: Grub4K --- README.md | 3 ++- devscripts/make_lazy_extractors.py | 27 ++++----------------------- yt_dlp/YoutubeDL.py | 4 ++++ yt_dlp/plugins.py | 3 +++ 4 files changed, 13 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 4b1ada82eda4..1cafe51d5191 100644 --- a/README.md +++ b/README.md @@ -278,7 +278,7 @@ py -m bundle.py2exe * **`devscripts/update-version.py`** - Update the version number based on the current date. * **`devscripts/set-variant.py`** - Set the build variant of the executable. * **`devscripts/make_changelog.py`** - Create a markdown changelog using short commit messages and update `CONTRIBUTORS` file. -* **`devscripts/make_lazy_extractors.py`** - Create lazy extractors. Running this before building the binaries (any variant) will improve their startup performance. Set the environment variable `YTDLP_NO_LAZY_EXTRACTORS=1` if you wish to forcefully disable lazy extractor loading. +* **`devscripts/make_lazy_extractors.py`** - Create lazy extractors. Running this before building the binaries (any variant) will improve their startup performance. Set the environment variable `YTDLP_NO_LAZY_EXTRACTORS` to something nonempty to forcefully disable lazy extractor loading. Note: See their `--help` for more info. 
@@ -1898,6 +1898,7 @@ In other words, the file structure on the disk looks something like: myplugin.py yt-dlp looks for these `yt_dlp_plugins` namespace folders in many locations (see below) and loads in plugins from **all** of them. +Set the environment variable `YTDLP_NO_PLUGINS` to something nonempty to disable loading plugins entirely. See the [wiki for some known plugins](https://github.com/yt-dlp/yt-dlp/wiki/Plugins) diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py index d74ea202f08b..d288d842960c 100644 --- a/devscripts/make_lazy_extractors.py +++ b/devscripts/make_lazy_extractors.py @@ -2,7 +2,6 @@ # Allow direct execution import os -import shutil import sys sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) @@ -34,18 +33,14 @@ class {name}({bases}): def main(): - lazy_extractors_filename = get_filename_args(default_outfile='yt_dlp/extractor/lazy_extractors.py') - if os.path.exists(lazy_extractors_filename): - os.remove(lazy_extractors_filename) + os.environ['YTDLP_NO_PLUGINS'] = 'true' + os.environ['YTDLP_NO_LAZY_EXTRACTORS'] = 'true' - _ALL_CLASSES = get_all_ies() # Must be before import + lazy_extractors_filename = get_filename_args(default_outfile='yt_dlp/extractor/lazy_extractors.py') - import yt_dlp.plugins + from yt_dlp.extractor.extractors import _ALL_CLASSES from yt_dlp.extractor.common import InfoExtractor, SearchInfoExtractor - # Filter out plugins - _ALL_CLASSES = [cls for cls in _ALL_CLASSES if not cls.__module__.startswith(f'{yt_dlp.plugins.PACKAGE_NAME}.')] - DummyInfoExtractor = type('InfoExtractor', (InfoExtractor,), {'IE_NAME': NO_ATTR}) module_src = '\n'.join(( MODULE_TEMPLATE, @@ -58,20 +53,6 @@ def main(): write_file(lazy_extractors_filename, f'{module_src}\n') -def get_all_ies(): - PLUGINS_DIRNAME = 'ytdlp_plugins' - BLOCKED_DIRNAME = f'{PLUGINS_DIRNAME}_blocked' - if os.path.exists(PLUGINS_DIRNAME): - # os.rename cannot be used, e.g. in Docker. 
See https://github.com/yt-dlp/yt-dlp/pull/4958 - shutil.move(PLUGINS_DIRNAME, BLOCKED_DIRNAME) - try: - from yt_dlp.extractor.extractors import _ALL_CLASSES - finally: - if os.path.exists(BLOCKED_DIRNAME): - shutil.move(BLOCKED_DIRNAME, PLUGINS_DIRNAME) - return _ALL_CLASSES - - def extra_ie_code(ie, base=None): for var in STATIC_CLASS_PROPERTIES: val = getattr(ie, var) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 9ac6ca0d0c93..eea10650363e 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -4070,6 +4070,10 @@ def get_encoding(stream): write_debug(f'Proxy map: {self.proxies}') write_debug(f'Request Handlers: {", ".join(rh.RH_NAME for rh in self._request_director.handlers.values())}') + if os.environ.get('YTDLP_NO_PLUGINS'): + write_debug('Plugins are forcibly disabled') + return + for plugin_type, plugins in {'Extractor': plugin_ies, 'Post-Processor': plugin_pps}.items(): display_list = ['{}{}'.format( klass.__name__, '' if klass.__name__ == name else f' as {name}') diff --git a/yt_dlp/plugins.py b/yt_dlp/plugins.py index 3cc879fd7e02..d777d14e71a6 100644 --- a/yt_dlp/plugins.py +++ b/yt_dlp/plugins.py @@ -5,6 +5,7 @@ import importlib.util import inspect import itertools +import os import pkgutil import sys import traceback @@ -137,6 +138,8 @@ def load_module(module, module_name, suffix): def load_plugins(name, suffix): classes = {} + if os.environ.get('YTDLP_NO_PLUGINS'): + return classes for finder, module_name, _ in iter_modules(name): if any(x.startswith('_') for x in module_name.split('.')): From 16eb28026a2ddf5608d0a628ef15949b8d3805a9 Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Sun, 13 Oct 2024 04:01:26 +0200 Subject: [PATCH 09/23] [test] Allow running tests explicitly (#11203) Authored by: Grub4K --- devscripts/run_tests.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/devscripts/run_tests.py b/devscripts/run_tests.py index c605aa62cfc8..eb614fe591ab 100755 --- a/devscripts/run_tests.py +++ 
b/devscripts/run_tests.py @@ -16,7 +16,7 @@ def parse_args(): parser = argparse.ArgumentParser(description='Run selected yt-dlp tests') parser.add_argument( - 'test', help='a extractor tests, or one of "core" or "download"', nargs='*') + 'test', help='an extractor test, test path, or one of "core" or "download"', nargs='*') parser.add_argument( '-k', help='run a test matching EXPRESSION. Same as "pytest -k"', metavar='EXPRESSION') parser.add_argument( @@ -27,7 +27,6 @@ def parse_args(): def run_tests(*tests, pattern=None, ci=False): run_core = 'core' in tests or (not pattern and not tests) run_download = 'download' in tests - tests = list(map(fix_test_name, tests)) pytest_args = args.pytest_args or os.getenv('HATCH_TEST_ARGS', '') arguments = ['pytest', '-Werror', '--tb=short', *shlex.split(pytest_args)] @@ -41,7 +40,9 @@ def run_tests(*tests, pattern=None, ci=False): arguments.extend(['-m', 'download']) else: arguments.extend( - f'test/test_download.py::TestDownload::test_{test}' for test in tests) + test if '/' in test + else f'test/test_download.py::TestDownload::test_{fix_test_name(test)}' + for test in tests) print(f'Running {arguments}', flush=True) try: From 85b87c991af25dcb35630fa94580fd418e78ee33 Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Sun, 13 Oct 2024 04:10:12 +0200 Subject: [PATCH 10/23] [utils] `sanitize_path`: Reimplement function (#11198) Authored by: Grub4K --- test/test_utils.py | 10 +++++-- yt_dlp/utils/_utils.py | 62 ++++++++++++++++++++++++++++-------------- 2 files changed, 49 insertions(+), 23 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 4f5fa1e100af..d4b846f56fba 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -221,9 +221,10 @@ def test_sanitize_ids(self): self.assertEqual(sanitize_filename('N0Y__7-UOdI', is_id=True), 'N0Y__7-UOdI') def test_sanitize_path(self): - if sys.platform != 'win32': - return + with unittest.mock.patch('sys.platform', 'win32'): + self._test_sanitize_path() + def 
_test_sanitize_path(self): self.assertEqual(sanitize_path('abc'), 'abc') self.assertEqual(sanitize_path('abc/def'), 'abc\\def') self.assertEqual(sanitize_path('abc\\def'), 'abc\\def') @@ -256,6 +257,11 @@ def test_sanitize_path(self): self.assertEqual(sanitize_path('./abc'), 'abc') self.assertEqual(sanitize_path('./../abc'), '..\\abc') + self.assertEqual(sanitize_path('\\abc'), '\\abc') + self.assertEqual(sanitize_path('C:abc'), 'C:abc') + self.assertEqual(sanitize_path('C:abc\\..\\'), 'C:..') + self.assertEqual(sanitize_path('C:\\abc:%(title)s.%(ext)s'), 'C:\\abc#%(title)s.%(ext)s') + def test_sanitize_url(self): self.assertEqual(sanitize_url('//foo.bar'), 'http://foo.bar') self.assertEqual(sanitize_url('httpss://foo.bar'), 'https://foo.bar') diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index e1b3c48d6339..967f01fdf941 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -664,31 +664,51 @@ def replace_insane(char): return result +def _sanitize_path_parts(parts): + sanitized_parts = [] + for part in parts: + if not part or part == '.': + continue + elif part == '..': + if sanitized_parts and sanitized_parts[-1] != '..': + sanitized_parts.pop() + sanitized_parts.append('..') + continue + # Replace invalid segments with `#` + # - trailing dots and spaces (`asdf...` => `asdf..#`) + # - invalid chars (`<>` => `##`) + sanitized_part = re.sub(r'[/<>:"\|\\?\*]|[\s.]$', '#', part) + sanitized_parts.append(sanitized_part) + + return sanitized_parts + + def sanitize_path(s, force=False): """Sanitizes and normalizes path on Windows""" - # XXX: this handles drive relative paths (c:sth) incorrectly - if sys.platform == 'win32': - force = False - drive_or_unc, _ = os.path.splitdrive(s) - elif force: - drive_or_unc = '' + if sys.platform != 'win32': + if not force: + return s + root = '/' if s.startswith('/') else '' + return root + '/'.join(_sanitize_path_parts(s.split('/'))) + + normed = s.replace('/', '\\') + + if normed.startswith('\\\\'): + # 
UNC path (`\\SERVER\SHARE`) or device path (`\\.`, `\\?`) + parts = normed.split('\\') + root = '\\'.join(parts[:4]) + '\\' + parts = parts[4:] + elif normed[1:2] == ':': + # absolute path or drive relative path + offset = 3 if normed[2:3] == '\\' else 2 + root = normed[:offset] + parts = normed[offset:].split('\\') else: - return s + # relative/drive root relative path + root = '\\' if normed[:1] == '\\' else '' + parts = normed.split('\\') - norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep) - if drive_or_unc: - norm_path.pop(0) - sanitized_path = [ - path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part) - for path_part in norm_path] - if drive_or_unc: - sanitized_path.insert(0, drive_or_unc + os.path.sep) - elif force and s and s[0] == os.path.sep: - sanitized_path.insert(0, os.path.sep) - # TODO: Fix behavioral differences <3.12 - # The workaround using `normpath` only superficially passes tests - # Ref: https://github.com/python/cpython/pull/100351 - return os.path.normpath(os.path.join(*sanitized_path)) + return root + '\\'.join(_sanitize_path_parts(parts)) def sanitize_url(url, *, scheme='http'): From d710a6ca7c622705c0c8c8a3615916f531137d5d Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Sun, 13 Oct 2024 05:14:32 +0200 Subject: [PATCH 11/23] Add extractor helpers (#10653) Authored by: Grub4K --- test/test_traversal.py | 79 ++++++++++++++++++- yt_dlp/extractor/common.py | 8 +- yt_dlp/utils/_utils.py | 27 ++++++- yt_dlp/utils/traversal.py | 158 ++++++++++++++++++++++++++++++++++++- 4 files changed, 261 insertions(+), 11 deletions(-) diff --git a/test/test_traversal.py b/test/test_traversal.py index 5d9fbe1d162a..9179dadda47c 100644 --- a/test/test_traversal.py +++ b/test/test_traversal.py @@ -4,8 +4,18 @@ import pytest -from yt_dlp.utils import dict_get, int_or_none, str_or_none -from yt_dlp.utils.traversal import traverse_obj +from yt_dlp.utils import ( + ExtractorError, + 
determine_ext, + dict_get, + int_or_none, + str_or_none, +) +from yt_dlp.utils.traversal import ( + traverse_obj, + require, + subs_list_to_dict, +) _TEST_DATA = { 100: 100, @@ -420,6 +430,71 @@ def test_traversal_morsel(self): assert traverse_obj(morsel, [(None,), any]) == morsel, \ 'Morsel should not be implicitly changed to dict on usage' + def test_traversal_filter(self): + data = [None, False, True, 0, 1, 0.0, 1.1, '', 'str', {}, {0: 0}, [], [1]] + + assert traverse_obj(data, [..., filter]) == [True, 1, 1.1, 'str', {0: 0}, [1]], \ + '`filter` should filter falsy values' + + +class TestTraversalHelpers: + def test_traversal_require(self): + with pytest.raises(ExtractorError): + traverse_obj(_TEST_DATA, ['None', {require('value')}]) + assert traverse_obj(_TEST_DATA, ['str', {require('value')}]) == 'str', \ + '`require` should pass through non `None` values' + + def test_subs_list_to_dict(self): + assert traverse_obj([ + {'name': 'de', 'url': 'https://example.com/subs/de.vtt'}, + {'name': 'en', 'url': 'https://example.com/subs/en1.ass'}, + {'name': 'en', 'url': 'https://example.com/subs/en2.ass'}, + ], [..., { + 'id': 'name', + 'url': 'url', + }, all, {subs_list_to_dict}]) == { + 'de': [{'url': 'https://example.com/subs/de.vtt'}], + 'en': [ + {'url': 'https://example.com/subs/en1.ass'}, + {'url': 'https://example.com/subs/en2.ass'}, + ], + }, 'function should build subtitle dict from list of subtitles' + assert traverse_obj([ + {'name': 'de', 'url': 'https://example.com/subs/de.ass'}, + {'name': 'de'}, + {'name': 'en', 'content': 'content'}, + {'url': 'https://example.com/subs/en'}, + ], [..., { + 'id': 'name', + 'data': 'content', + 'url': 'url', + }, all, {subs_list_to_dict}]) == { + 'de': [{'url': 'https://example.com/subs/de.ass'}], + 'en': [{'data': 'content'}], + }, 'subs with mandatory items missing should be filtered' + assert traverse_obj([ + {'url': 'https://example.com/subs/de.ass', 'name': 'de'}, + {'url': 'https://example.com/subs/en', 'name': 'en'}, 
+ ], [..., { + 'id': 'name', + 'ext': ['url', {lambda x: determine_ext(x, default_ext=None)}], + 'url': 'url', + }, all, {subs_list_to_dict(ext='ext')}]) == { + 'de': [{'url': 'https://example.com/subs/de.ass', 'ext': 'ass'}], + 'en': [{'url': 'https://example.com/subs/en', 'ext': 'ext'}], + }, '`ext` should set default ext but leave existing value untouched' + assert traverse_obj([ + {'name': 'en', 'url': 'https://example.com/subs/en2', 'prio': True}, + {'name': 'en', 'url': 'https://example.com/subs/en1', 'prio': False}, + ], [..., { + 'id': 'name', + 'quality': ['prio', {int}], + 'url': 'url', + }, all, {subs_list_to_dict(ext='ext')}]) == {'en': [ + {'url': 'https://example.com/subs/en1', 'ext': 'ext'}, + {'url': 'https://example.com/subs/en2', 'ext': 'ext'}, + ]}, '`quality` key should sort subtitle list accordingly' + class TestDictGet: def test_dict_get(self): diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 3430036f4bdc..812fbfa9f9c2 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -573,13 +573,13 @@ class InfoExtractor: def _login_hint(self, method=NO_DEFAULT, netrc=None): password_hint = f'--username and --password, --netrc-cmd, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials' + cookies_hint = 'See https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp for how to manually pass cookies' return { None: '', - 'any': f'Use --cookies, --cookies-from-browser, {password_hint}', + 'any': f'Use --cookies, --cookies-from-browser, {password_hint}. {cookies_hint}', 'password': f'Use {password_hint}', - 'cookies': ( - 'Use --cookies-from-browser or --cookies for the authentication. ' - 'See https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp for how to manually pass cookies'), + 'cookies': f'Use --cookies-from-browser or --cookies for the authentication. 
{cookies_hint}', + 'session_cookies': f'Use --cookies for the authentication (--cookies-from-browser might not work). {cookies_hint}', }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies'] def __init__(self, downloader=None): diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 967f01fdf941..dd12466b89a1 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -1984,11 +1984,30 @@ def urljoin(base, path): return urllib.parse.urljoin(base, path) -def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1): +def partial_application(func): + sig = inspect.signature(func) + + @functools.wraps(func) + def wrapped(*args, **kwargs): + try: + sig.bind(*args, **kwargs) + except TypeError: + return functools.partial(func, *args, **kwargs) + else: + return func(*args, **kwargs) + + return wrapped + + +@partial_application +def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1, base=None): if get_attr and v is not None: v = getattr(v, get_attr, None) + if invscale == 1 and scale < 1: + invscale = int(1 / scale) + scale = 1 try: - return int(v) * invscale // scale + return (int(v) if base is None else int(v, base=base)) * invscale // scale except (ValueError, TypeError, OverflowError): return default @@ -2006,9 +2025,13 @@ def str_to_int(int_str): return int_or_none(int_str) +@partial_application def float_or_none(v, scale=1, invscale=1, default=None): if v is None: return default + if invscale == 1 and scale < 1: + invscale = int(1 / scale) + scale = 1 try: return float(v) * invscale / scale except (ValueError, TypeError): diff --git a/yt_dlp/utils/traversal.py b/yt_dlp/utils/traversal.py index 96eb2eddf529..b918487f98d8 100644 --- a/yt_dlp/utils/traversal.py +++ b/yt_dlp/utils/traversal.py @@ -1,18 +1,35 @@ +from __future__ import annotations + +import collections import collections.abc import contextlib +import functools import http.cookies import inspect import itertools import re +import 
typing import xml.etree.ElementTree from ._utils import ( IDENTITY, NO_DEFAULT, + ExtractorError, LazyList, deprecation_warning, + get_elements_html_by_class, + get_elements_html_by_attribute, + get_elements_by_attribute, + get_element_html_by_attribute, + get_element_by_attribute, + get_element_html_by_id, + get_element_by_id, + get_element_html_by_class, + get_elements_by_class, + get_element_text_and_html_by_tag, is_iterable_like, try_call, + url_or_none, variadic, ) @@ -54,6 +71,7 @@ def traverse_obj( Read as: `{key: traverse_obj(obj, path) for key, path in dct.items()}`. - `any`-builtin: Take the first matching object and return it, resetting branching. - `all`-builtin: Take all matching objects and return them as a list, resetting branching. + - `filter`-builtin: Return the value if it is truthy, `None` otherwise. `tuple`, `list`, and `dict` all support nested paths and branches. @@ -247,6 +265,10 @@ def apply_path(start_obj, path, test_type): objs = (list(filtered_objs),) continue + if key is filter: + objs = filter(None, objs) + continue + if __debug__ and callable(key): # Verify function signature inspect.signature(key).bind(None, None) @@ -277,13 +299,143 @@ def _traverse_obj(obj, path, allow_empty, test_type): return results[0] if results else {} if allow_empty and is_dict else None for index, path in enumerate(paths, 1): - result = _traverse_obj(obj, path, index == len(paths), True) - if result is not None: - return result + is_last = index == len(paths) + try: + result = _traverse_obj(obj, path, is_last, True) + if result is not None: + return result + except _RequiredError as e: + if is_last: + # Reraise to get cleaner stack trace + raise ExtractorError(e.orig_msg, expected=e.expected) from None return None if default is NO_DEFAULT else default +def value(value, /): + return lambda _: value + + +def require(name, /, *, expected=False): + def func(value): + if value is None: + raise _RequiredError(f'Unable to extract {name}', expected=expected) + + 
return value + + return func + + +class _RequiredError(ExtractorError): + pass + + +@typing.overload +def subs_list_to_dict(*, ext: str | None = None) -> collections.abc.Callable[[list[dict]], dict[str, list[dict]]]: ... + + +@typing.overload +def subs_list_to_dict(subs: list[dict] | None, /, *, ext: str | None = None) -> dict[str, list[dict]]: ... + + +def subs_list_to_dict(subs: list[dict] | None = None, /, *, ext=None): + """ + Convert subtitles from a traversal into a subtitle dict. + The path should have an `all` immediately before this function. + + Arguments: + `ext` The default value for `ext` in the subtitle dict + + In the dict you can set the following additional items: + `id` The subtitle id to sort the dict into + `quality` The sort order for each subtitle + """ + if subs is None: + return functools.partial(subs_list_to_dict, ext=ext) + + result = collections.defaultdict(list) + + for sub in subs: + if not url_or_none(sub.get('url')) and not sub.get('data'): + continue + sub_id = sub.pop('id', None) + if sub_id is None: + continue + if ext is not None and not sub.get('ext'): + sub['ext'] = ext + result[sub_id].append(sub) + result = dict(result) + + for subs in result.values(): + subs.sort(key=lambda x: x.pop('quality', 0) or 0) + + return result + + +@typing.overload +def find_element(*, attr: str, value: str, tag: str | None = None, html=False): ... + + +@typing.overload +def find_element(*, cls: str, html=False): ... + + +@typing.overload +def find_element(*, id: str, tag: str | None = None, html=False): ... + + +@typing.overload +def find_element(*, tag: str, html=False): ... 
+ + +def find_element(*, tag=None, id=None, cls=None, attr=None, value=None, html=False): + # deliberately using `id=` and `cls=` for ease of readability + assert tag or id or cls or (attr and value), 'One of tag, id, cls or (attr AND value) is required' + if not tag: + tag = r'[\w:.-]+' + + if attr and value: + assert not cls, 'Cannot match both attr and cls' + assert not id, 'Cannot match both attr and id' + func = get_element_html_by_attribute if html else get_element_by_attribute + return functools.partial(func, attr, value, tag=tag) + + elif cls: + assert not id, 'Cannot match both cls and id' + assert tag is None, 'Cannot match both cls and tag' + func = get_element_html_by_class if html else get_elements_by_class + return functools.partial(func, cls) + + elif id: + func = get_element_html_by_id if html else get_element_by_id + return functools.partial(func, id, tag=tag) + + index = int(bool(html)) + return lambda html: get_element_text_and_html_by_tag(tag, html)[index] + + +@typing.overload +def find_elements(*, cls: str, html=False): ... + + +@typing.overload +def find_elements(*, attr: str, value: str, tag: str | None = None, html=False): ... 
+ + +def find_elements(*, tag=None, cls=None, attr=None, value=None, html=False): + # deliberately using `cls=` for ease of readability + assert cls or (attr and value), 'One of cls or (attr AND value) is required' + + if attr and value: + assert not cls, 'Cannot match both attr and cls' + func = get_elements_html_by_attribute if html else get_elements_by_attribute + return functools.partial(func, attr, value, tag=tag or r'[\w:.-]+') + + assert not tag, 'Cannot match both cls and tag' + func = get_elements_html_by_class if html else get_elements_by_class + return functools.partial(func, cls) + + def get_first(obj, *paths, **kwargs): return traverse_obj(obj, *((..., *variadic(keys)) for keys in paths), **kwargs, get_all=False) From cba7868502f04175fecf9ab3e363296aee7ebec2 Mon Sep 17 00:00:00 2001 From: Mozi <29089388+pzhlkj6612@users.noreply.github.com> Date: Sun, 13 Oct 2024 14:27:01 +0800 Subject: [PATCH 12/23] [ie/reddit] Detect and raise when login is required (#11202) Closes #10924 Authored by: pzhlkj6612 --- yt_dlp/extractor/reddit.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/yt_dlp/extractor/reddit.py b/yt_dlp/extractor/reddit.py index bc3e5f7eeed6..b633dc48affa 100644 --- a/yt_dlp/extractor/reddit.py +++ b/yt_dlp/extractor/reddit.py @@ -1,3 +1,4 @@ +import json import urllib.parse from .common import InfoExtractor @@ -17,7 +18,7 @@ class RedditIE(InfoExtractor): _NETRC_MACHINE = 'reddit' - _VALID_URL = r'https?://(?P(?:\w+\.)?reddit(?:media)?\.com)/(?P(?:(?:r|user)/[^/]+/)?comments/(?P[^/?#&]+))' + _VALID_URL = r'https?://(?:\w+\.)?reddit(?:media)?\.com/(?P(?:(?:r|user)/[^/]+/)?comments/(?P[^/?#&]+))' _TESTS = [{ 'url': 'https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/', 'info_dict': { @@ -251,15 +252,15 @@ def _get_subtitles(self, video_id): return {'en': [{'url': caption_url}]} def _real_extract(self, url): - host, slug, video_id = self._match_valid_url(url).group('host', 'slug', 'id') + 
slug, video_id = self._match_valid_url(url).group('slug', 'id') - data = self._download_json( - f'https://{host}/{slug}/.json', video_id, fatal=False, expected_status=403) - if not data: - fallback_host = 'old.reddit.com' if host != 'old.reddit.com' else 'www.reddit.com' - self.to_screen(f'{host} request failed, retrying with {fallback_host}') + try: data = self._download_json( - f'https://{fallback_host}/{slug}/.json', video_id, expected_status=403) + f'https://www.reddit.com/{slug}/.json', video_id, expected_status=403) + except ExtractorError as e: + if isinstance(e.cause, json.JSONDecodeError): + self.raise_login_required('Account authentication is required') + raise if traverse_obj(data, 'error') == 403: reason = data.get('reason') From dcfeea4dd5e5686821350baa6c7767a011944867 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Mon, 14 Oct 2024 22:19:26 +0000 Subject: [PATCH 13/23] [ie/adobepass] Use newer user-agent for provider redirect request (#11250) Closes #10848 Authored by: bashonly --- yt_dlp/extractor/adobepass.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/adobepass.py b/yt_dlp/extractor/adobepass.py index eb7e597e52d7..7cc15ec7b6f2 100644 --- a/yt_dlp/extractor/adobepass.py +++ b/yt_dlp/extractor/adobepass.py @@ -1355,6 +1355,7 @@ class AdobePassIE(InfoExtractor): # XXX: Conventionally, base classes should end with BaseIE/InfoExtractor _SERVICE_PROVIDER_TEMPLATE = 'https://sp.auth.adobe.com/adobe-services/%s' _USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0' + _MODERN_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; rv:131.0) Gecko/20100101 Firefox/131.0' _MVPD_CACHE = 'ap-mvpd' _DOWNLOADING_LOGIN_PAGE = 'Downloading Provider Login Page' @@ -1454,7 +1455,11 @@ def extract_redirect_url(html, url=None, fatal=False): 'no_iframe': 'false', 'domain_name': 'adobe.com', 'redirect_url': url, - }) + }, headers={ + # yt-dlp's default user-agent is 
usually too old for Comcast_SSO + # See: https://github.com/yt-dlp/yt-dlp/issues/10848 + 'User-Agent': self._MODERN_USER_AGENT, + } if mso_id == 'Comcast_SSO' else None) elif not self._cookies_passed: raise_mvpd_required() From 64d84d75ca8c19ec06558cc7c511f5f4f7a822bc Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Tue, 15 Oct 2024 07:07:42 +0000 Subject: [PATCH 14/23] [build] Use `macos-13` image for macOS builds (#11236) Authored by: bashonly --- .github/workflows/build.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index bd2e42d9af81..495d3c63060b 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -240,7 +240,7 @@ jobs: permissions: contents: read actions: write # For cleaning up cache - runs-on: macos-12 + runs-on: macos-13 steps: - uses: actions/checkout@v4 @@ -346,7 +346,7 @@ jobs: macos_legacy: needs: process if: inputs.macos_legacy - runs-on: macos-12 + runs-on: macos-13 steps: - uses: actions/checkout@v4 From fbc66e3ab35743cc847a21223c67d88bb463cd9c Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Wed, 16 Oct 2024 03:53:53 +0000 Subject: [PATCH 15/23] [utils] `Popen`: Reset PyInstaller environment (#11258) - Forces spawning independent subprocesses for exes bundled with PyInstaller>=6.10 - Fixes regression introduced in fb8b7f226d251e521a89b23c415e249e5b788e5c - Ref: https://pyinstaller.org/en/v6.10.0/CHANGES.html#incompatible-changes Closes #11259 Authored by: bashonly, Grub4K Co-authored-by: Simon Sawicki --- yt_dlp/utils/_utils.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index dd12466b89a1..27ebfefbcb92 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -824,14 +824,18 @@ class Popen(subprocess.Popen): _startupinfo = None @staticmethod - def 
_fix_pyinstaller_ld_path(env): - """Restore LD_LIBRARY_PATH when using PyInstaller - Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations - https://github.com/yt-dlp/yt-dlp/issues/4573 - """ + def _fix_pyinstaller_issues(env): if not hasattr(sys, '_MEIPASS'): return + # Force spawning independent subprocesses for exes bundled with PyInstaller>=6.10 + # Ref: https://pyinstaller.org/en/v6.10.0/CHANGES.html#incompatible-changes + # https://github.com/yt-dlp/yt-dlp/issues/11259 + env['PYINSTALLER_RESET_ENVIRONMENT'] = '1' + + # Restore LD_LIBRARY_PATH when using PyInstaller + # Ref: https://pyinstaller.org/en/v6.10.0/runtime-information.html#ld-library-path-libpath-considerations + # https://github.com/yt-dlp/yt-dlp/issues/4573 def _fix(key): orig = env.get(f'{key}_ORIG') if orig is None: @@ -845,7 +849,7 @@ def _fix(key): def __init__(self, args, *remaining, env=None, text=False, shell=False, **kwargs): if env is None: env = os.environ.copy() - self._fix_pyinstaller_ld_path(env) + self._fix_pyinstaller_issues(env) self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines') if text is True: From 7af1ddaaf2a6a0a750373a9ab53c7770af4f9fe4 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 19 Oct 2024 21:40:20 +0000 Subject: [PATCH 16/23] [ie/youtube] Fix `comment_count` extraction (#11274) Authored by: bashonly --- yt_dlp/extractor/youtube.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 6acc42fc0a46..f41f57ed16a0 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -4701,11 +4701,12 @@ def process_language(container, base_url, lang_code, sub_name, query): headers=self.generate_api_headers(ytcfg=master_ytcfg), note='Downloading initial data API JSON') + COMMENTS_SECTION_IDS = 
('comment-item-section', 'engagement-panel-comments-section') info['comment_count'] = traverse_obj(initial_data, ( 'contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', ..., 'itemSectionRenderer', 'contents', ..., 'commentsEntryPointHeaderRenderer', 'commentCount', ), ( - 'engagementPanels', lambda _, v: v['engagementPanelSectionListRenderer']['panelIdentifier'] == 'comment-item-section', + 'engagementPanels', lambda _, v: v['engagementPanelSectionListRenderer']['panelIdentifier'] in COMMENTS_SECTION_IDS, 'engagementPanelSectionListRenderer', 'header', 'engagementPanelTitleHeaderRenderer', 'contextualInfo', ), expected_type=self._get_count, get_all=False) From 3148c1822f66533998278f0a1cf842b9bea1526a Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 19 Oct 2024 21:41:14 +0000 Subject: [PATCH 17/23] [ie/substack] Resolve podcast file extensions (#11275) Closes #4601 Authored by: bashonly --- yt_dlp/extractor/substack.py | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/substack.py b/yt_dlp/extractor/substack.py index 30cb322dc250..b70d40f2cad9 100644 --- a/yt_dlp/extractor/substack.py +++ b/yt_dlp/extractor/substack.py @@ -2,7 +2,13 @@ import urllib.parse from .common import InfoExtractor -from ..utils import js_to_json, str_or_none, traverse_obj +from ..networking import HEADRequest +from ..utils import ( + determine_ext, + js_to_json, + str_or_none, +) +from ..utils.traversal import traverse_obj class SubstackIE(InfoExtractor): @@ -43,6 +49,19 @@ class SubstackIE(InfoExtractor): 'uploader': "Andrew Zimmern's Spilled Milk ", 'uploader_id': '577659', }, + }, { + # Podcast that needs its file extension resolved to mp3 + 'url': 'https://persuasion1.substack.com/p/summers', + 'md5': '1456a755d46084744facdfac9edf900f', + 'info_dict': { + 'id': '141970405', + 'ext': 'mp3', + 'title': 'Larry Summers on What Went Wrong on Campus', + 
'description': 'Yascha Mounk and Larry Summers also discuss the promise and perils of artificial intelligence.', + 'thumbnail': r're:https://substackcdn\.com/image/.+\.jpeg', + 'uploader': 'Persuasion', + 'uploader_id': '61579', + }, }] @classmethod @@ -89,7 +108,15 @@ def _real_extract(self, url): post_type = webpage_info['post']['type'] formats, subtitles = [], {} if post_type == 'podcast': - formats, subtitles = [{'url': webpage_info['post']['podcast_url']}], {} + fmt = {'url': webpage_info['post']['podcast_url']} + if not determine_ext(fmt['url'], default_ext=None): + # The redirected format URL expires but the original URL doesn't, + # so we only want to extract the extension from this request + fmt['ext'] = determine_ext(self._request_webpage( + HEADRequest(fmt['url']), display_id, + 'Resolving podcast file extension', + 'Podcast URL is invalid').url) + formats.append(fmt) elif post_type == 'video': formats, subtitles = self._extract_video_formats(webpage_info['post']['videoUpload']['id'], canonical_url) else: From 679c68240a26481ea7c07cc0c014745631ea8481 Mon Sep 17 00:00:00 2001 From: rubyevadestaxes <147743127+rubyevadestaxes@users.noreply.github.com> Date: Sat, 19 Oct 2024 23:51:47 +0200 Subject: [PATCH 18/23] [ie/twitter:spaces] Allow extraction when not logged in (#11289) Closes #11288 Authored by: rubyevadestaxes --- yt_dlp/extractor/twitter.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py index aca94df2dd92..5adaf16393c5 100644 --- a/yt_dlp/extractor/twitter.py +++ b/yt_dlp/extractor/twitter.py @@ -934,14 +934,13 @@ class TwitterIE(TwitterBaseIE): 'uploader_id': 'MoniqueCamarra', 'live_status': 'was_live', 'release_timestamp': 1658417414, - 'description': 'md5:acce559345fd49f129c20dbcda3f1201', + 'description': r're:Twitter Space participated by Sergej Sumlenny.+', 'timestamp': 1658407771, 'release_date': '20220721', 'upload_date': '20220721', }, 'add_ie': 
['TwitterSpaces'], 'params': {'skip_download': 'm3u8'}, - 'skip': 'Requires authentication', }, { # URL specifies video number but --yes-playlist 'url': 'https://twitter.com/CTVJLaidlaw/status/1600649710662213632/video/1', @@ -1856,8 +1855,6 @@ def _build_graphql_query(self, space_id): def _real_extract(self, url): space_id = self._match_id(url) - if not self.is_logged_in: - self.raise_login_required('Twitter Spaces require authentication') space_data = self._call_graphql_api('HPEisOmj1epUNLCWTYhUWw/AudioSpaceById', space_id)['audioSpace'] if not space_data: raise ExtractorError('Twitter Space not found', expected=True) From 8de431ec97a4b62b73df8f686b6e21e462775336 Mon Sep 17 00:00:00 2001 From: sepro Date: Sun, 20 Oct 2024 15:18:15 +0200 Subject: [PATCH 19/23] [ie/Funk] Extend `_VALID_URL` (#11269) Authored by: seproDev --- yt_dlp/extractor/funk.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/funk.py b/yt_dlp/extractor/funk.py index 8bdea3fce793..ef8ea72a8cda 100644 --- a/yt_dlp/extractor/funk.py +++ b/yt_dlp/extractor/funk.py @@ -3,7 +3,7 @@ class FunkIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.|origin\.)?funk\.net/(?:channel|playlist)/[^/]+/(?P[0-9a-z-]+)-(?P\d+)' + _VALID_URL = r'https?://(?:(?:www|origin|play)\.)?funk\.net/(?:channel|playlist)/[^/?#]+/(?P[0-9a-z-]+)-(?P\d+)' _TESTS = [{ 'url': 'https://www.funk.net/channel/ba-793/die-lustigsten-instrumente-aus-dem-internet-teil-2-1155821', 'md5': '8610449476156f338761a75391b0017d', @@ -27,6 +27,9 @@ class FunkIE(InfoExtractor): }, { 'url': 'https://www.funk.net/playlist/neuesteVideos/kameras-auf-dem-fusion-festival-1618699', 'only_matching': True, + }, { + 'url': 'https://play.funk.net/playlist/neuesteVideos/george-floyd-wenn-die-polizei-toetet-der-fall-2004391', + 'only_matching': True, }] def _real_extract(self, url): From 0f593dca9fa995d88eb763170a932da61c8f24dc Mon Sep 17 00:00:00 2001 From: Imran Hussain Date: Sun, 20 Oct 2024 18:10:26 +0100 Subject: 
[PATCH 20/23] Add option `--plugin-dirs` (#11277) Closes #3260 Authored by: imranh2, coletdjnz Co-authored-by: coletdjnz --- README.md | 7 +++++++ test/test_plugins.py | 19 +++++++++++++++++++ .../yt_dlp_plugins/extractor/package.py | 5 +++++ yt_dlp/__init__.py | 5 +++++ yt_dlp/options.py | 8 ++++++++ yt_dlp/plugins.py | 7 +++++++ yt_dlp/utils/_utils.py | 4 ++++ 7 files changed, 55 insertions(+) create mode 100644 test/testdata/plugin_packages/testpackage/yt_dlp_plugins/extractor/package.py diff --git a/README.md b/README.md index 1cafe51d5191..fc38a529a77a 100644 --- a/README.md +++ b/README.md @@ -348,6 +348,13 @@ If you fork the project on GitHub, you can run your fork's [build workflow](.git containing directory ("-" for stdin). Can be used multiple times and inside other configuration files + --plugin-dirs PATH Path to an additional directory to search + for plugins. This option can be used + multiple times to add multiple directories. + Note that this currently only works for + extractor plugins; postprocessor plugins can + only be loaded from the default plugin + directories --flat-playlist Do not extract the videos of a playlist, only list them --no-flat-playlist Fully extract the videos of a playlist diff --git a/test/test_plugins.py b/test/test_plugins.py index c82158e9fc35..77545d136ceb 100644 --- a/test/test_plugins.py +++ b/test/test_plugins.py @@ -10,6 +10,7 @@ sys.path.append(str(TEST_DATA_DIR)) importlib.invalidate_caches() +from yt_dlp.utils import Config from yt_dlp.plugins import PACKAGE_NAME, directories, load_plugins @@ -68,6 +69,24 @@ def test_importing_zipped_module(self): os.remove(zip_path) importlib.invalidate_caches() # reset the import caches + def test_plugin_dirs(self): + # Internal plugin dirs hack for CLI --plugin-dirs + # To be replaced with proper system later + custom_plugin_dir = TEST_DATA_DIR / 'plugin_packages' + Config._plugin_dirs = [str(custom_plugin_dir)] + importlib.invalidate_caches() # reset the import caches + + try: + 
package = importlib.import_module(f'{PACKAGE_NAME}.extractor') + self.assertIn(custom_plugin_dir / 'testpackage' / PACKAGE_NAME / 'extractor', map(Path, package.__path__)) + + plugins_ie = load_plugins('extractor', 'IE') + self.assertIn('PackagePluginIE', plugins_ie.keys()) + + finally: + Config._plugin_dirs = [] + importlib.invalidate_caches() # reset the import caches + if __name__ == '__main__': unittest.main() diff --git a/test/testdata/plugin_packages/testpackage/yt_dlp_plugins/extractor/package.py b/test/testdata/plugin_packages/testpackage/yt_dlp_plugins/extractor/package.py new file mode 100644 index 000000000000..b860300d8de5 --- /dev/null +++ b/test/testdata/plugin_packages/testpackage/yt_dlp_plugins/extractor/package.py @@ -0,0 +1,5 @@ +from yt_dlp.extractor.common import InfoExtractor + + +class PackagePluginIE(InfoExtractor): + pass diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index f598b6c2fecd..d976f5bbcb7d 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -34,6 +34,7 @@ ) from .update import Updater from .utils import ( + Config, NO_DEFAULT, POSTPROCESS_WHEN, DateRange, @@ -967,6 +968,10 @@ def _real_main(argv=None): parser, opts, all_urls, ydl_opts = parse_options(argv) + # HACK: Set the plugin dirs early on + # TODO(coletdjnz): remove when plugin globals system is implemented + Config._plugin_dirs = opts.plugin_dirs + # Dump user agent if opts.dump_user_agent: ua = traverse_obj(opts.headers, 'User-Agent', casesense=False, default=std_headers['User-Agent']) diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 9980b7fc3f05..c3a647da773b 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -408,6 +408,14 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): help=( 'Location of the main configuration file; either the path to the config or its containing directory ' '("-" for stdin). 
Can be used multiple times and inside other configuration files')) + general.add_option( + '--plugin-dirs', + dest='plugin_dirs', metavar='PATH', action='append', + help=( + 'Path to an additional directory to search for plugins. ' + 'This option can be used multiple times to add multiple directories. ' + 'Note that this currently only works for extractor plugins; ' + 'postprocessor plugins can only be loaded from the default plugin directories')) general.add_option( '--flat-playlist', action='store_const', dest='extract_flat', const='in_playlist', default=False, diff --git a/yt_dlp/plugins.py b/yt_dlp/plugins.py index d777d14e71a6..204558d60348 100644 --- a/yt_dlp/plugins.py +++ b/yt_dlp/plugins.py @@ -15,6 +15,7 @@ from .compat import functools # isort: split from .utils import ( + Config, get_executable_path, get_system_config_dirs, get_user_config_dirs, @@ -84,6 +85,12 @@ def _get_package_paths(*root_paths, containing_folder='plugins'): with contextlib.suppress(ValueError): # Added when running __main__.py directly candidate_locations.remove(Path(__file__).parent) + # TODO(coletdjnz): remove when plugin globals system is implemented + if Config._plugin_dirs: + candidate_locations.extend(_get_package_paths( + *Config._plugin_dirs, + containing_folder='')) + parts = Path(*fullname.split('.')) for path in orderedSet(candidate_locations, lazy=True): candidate = path / parts diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 27ebfefbcb92..ea748898f200 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -4897,6 +4897,10 @@ class Config: filename = None __initialized = False + # Internal only, do not use! 
Hack to enable --plugin-dirs + # TODO(coletdjnz): remove when plugin globals system is implemented + _plugin_dirs = None + def __init__(self, parser, label=None): self.parser, self.label = parser, label self._loaded_paths, self.configs = set(), [] From 5af774d7a36c00bea618c7047c9326532cd3f616 Mon Sep 17 00:00:00 2001 From: Deer-Spangle <60626596+Deer-Spangle@users.noreply.github.com> Date: Sun, 20 Oct 2024 21:58:53 +0100 Subject: [PATCH 21/23] [ie/imgur] Support new URL format (#11075) Authored by: Deer-Spangle --- yt_dlp/extractor/imgur.py | 68 ++++++++++++++++++++++++++++++++------- 1 file changed, 56 insertions(+), 12 deletions(-) diff --git a/yt_dlp/extractor/imgur.py b/yt_dlp/extractor/imgur.py index f0c3419d49ea..2a5a1c9e8480 100644 --- a/yt_dlp/extractor/imgur.py +++ b/yt_dlp/extractor/imgur.py @@ -37,7 +37,7 @@ def get_description(s): class ImgurIE(ImgurBaseIE): - _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?!(?:a|gallery|t|topic|r)/)(?P<id>[a-zA-Z0-9]+)' + _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?!(?:a|gallery|t|topic|r)/)(?:[^/?#]+-)?(?P<id>[a-zA-Z0-9]+)' _TESTS = [{ 'url': 'https://imgur.com/A61SaA1', @@ -54,6 +54,22 @@ class ImgurIE(ImgurBaseIE): 'like_count': int, 'thumbnail': 'https://i.imgur.com/A61SaA1h.jpg', }, + }, { + # Test with URL slug + 'url': 'https://imgur.com/mrw-gifv-is-up-running-without-any-bugs-A61SaA1', + 'info_dict': { + 'id': 'A61SaA1', + 'ext': 'mp4', + 'title': 'MRW gifv is up and running without any bugs', + 'timestamp': 1416446068, + 'upload_date': '20141120', + 'dislike_count': int, + 'comment_count': int, + 'release_timestamp': 1416446068, + 'release_date': '20141120', + 'like_count': int, + 'thumbnail': 'https://i.imgur.com/A61SaA1h.jpg', + }, }, { 'url': 'https://i.imgur.com/A61SaA1.gifv', 'only_matching': True, @@ -92,6 +108,7 @@ class ImgurIE(ImgurBaseIE): 'comment_count': int, 'release_timestamp': 1710491255, 'release_date': '20240315', + 'thumbnail': 'https://i.imgur.com/zV03bd5h.jpg', }, }] @@ -252,17 +269,9 @@ def
_real_extract(self, url): class ImgurGalleryIE(ImgurGalleryBaseIE): IE_NAME = 'imgur:gallery' - _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:gallery|(?:t(?:opic)?|r)/[^/?#]+)/(?P<id>[a-zA-Z0-9]+)' + _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:gallery|(?:t(?:opic)?|r)/[^/?#]+)/(?:[^/?#]+-)?(?P<id>[a-zA-Z0-9]+)' _TESTS = [{ - 'url': 'http://imgur.com/gallery/Q95ko', - 'info_dict': { - 'id': 'Q95ko', - 'title': 'Adding faces make every GIF better', - }, - 'playlist_count': 25, - 'skip': 'Zoinks! You\'ve taken a wrong turn.', - }, { # TODO: static images - replace with animated/video gallery 'url': 'http://imgur.com/topic/Aww/ll5Vk', 'only_matching': True, @@ -280,7 +289,27 @@ class ImgurGalleryIE(ImgurGalleryBaseIE): 'release_timestamp': 1358554297, 'thumbnail': 'https://i.imgur.com/YcAQlkxh.jpg', 'release_date': '20130119', - 'uploader_url': 'https://i.imgur.com/u3R4I2S_d.png?maxwidth=290&fidelity=grand', + 'uploader_url': 'https://i.imgur.com/N5Flb2v_d.png?maxwidth=290&fidelity=grand', 'comment_count': int, 'dislike_count': int, 'like_count': int, + }, + }, { + # Test with slug + 'url': 'https://imgur.com/gallery/classic-steve-carell-gif-cracks-me-up-everytime-repost-downvotes-YcAQlkx', + 'add_ies': ['Imgur'], + 'info_dict': { + 'id': 'YcAQlkx', + 'ext': 'mp4', + 'title': 'Classic Steve Carell gif...cracks me up everytime....damn the repost downvotes....', + 'timestamp': 1358554297, + 'upload_date': '20130119', + 'uploader_id': '1648642', + 'uploader': 'wittyusernamehere', + 'release_timestamp': 1358554297, + 'release_date': '20130119', + 'thumbnail': 'https://i.imgur.com/YcAQlkxh.jpg', + 'uploader_url': 'https://i.imgur.com/N5Flb2v_d.png?maxwidth=290&fidelity=grand', 'comment_count': int, 'dislike_count': int, 'like_count': int, @@ -317,6 +346,13 @@ class ImgurGalleryIE(ImgurGalleryBaseIE): 'title': 'Penguins !', }, 'playlist_count': 3, + }, { + 'url': 'https://imgur.com/t/unmuted/penguins-penguins-6lAn9VQ', + 'info_dict': { + 'id': '6lAn9VQ', + 'title': 'Penguins
!', + }, + 'playlist_count': 3, }, { 'url': 'https://imgur.com/t/unmuted/kx2uD3C', 'add_ies': ['Imgur'], @@ -357,7 +393,7 @@ class ImgurGalleryIE(ImgurGalleryBaseIE): class ImgurAlbumIE(ImgurGalleryBaseIE): IE_NAME = 'imgur:album' - _VALID_URL = r'https?://(?:i\.)?imgur\.com/a/(?P<id>[a-zA-Z0-9]+)' + _VALID_URL = r'https?://(?:i\.)?imgur\.com/a/(?:[^/?#]+-)?(?P<id>[a-zA-Z0-9]+)' _GALLERY = False _TESTS = [{ # TODO: only static images - replace with animated/video gallery @@ -372,6 +408,14 @@ class ImgurAlbumIE(ImgurGalleryBaseIE): 'title': 'enen-no-shouboutai', }, 'playlist_count': 2, + }, { + # Test with URL slug + 'url': 'https://imgur.com/a/enen-no-shouboutai-iX265HX', + 'info_dict': { + 'id': 'iX265HX', + 'title': 'enen-no-shouboutai', + }, + 'playlist_count': 2, }, { 'url': 'https://imgur.com/a/8pih2Ed', 'info_dict': { From c4d95f67ddc522297bb1fea875255cf94b34d595 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Kotiuk?= Date: Sun, 20 Oct 2024 23:16:22 +0200 Subject: [PATCH 22/23] [ie/cda] Support folders (#10786) Closes #5429 Authored by: pktiuk --- yt_dlp/extractor/_extractors.py | 5 +++- yt_dlp/extractor/cda.py | 48 +++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 4b1f4c316d75..8d59360949f9 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -363,7 +363,10 @@ ) from .ccma import CCMAIE from .cctv import CCTVIE -from .cda import CDAIE +from .cda import ( + CDAIE, + CDAFolderIE, +) from .cellebrite import CellebriteIE from .ceskatelevize import CeskaTelevizeIE from .cgtn import CGTNIE diff --git a/yt_dlp/extractor/cda.py b/yt_dlp/extractor/cda.py index 62ee8b17f1e7..b2738e492f70 100644 --- a/yt_dlp/extractor/cda.py +++ b/yt_dlp/extractor/cda.py @@ -12,6 +12,7 @@ from ..compat import compat_ord from ..utils import ( ExtractorError, + OnDemandPagedList, float_or_none, int_or_none, merge_dicts, @@ -351,3 +352,50 @@
def extract_format(page, version): extract_format(webpage, resolution) return merge_dicts(info_dict, info) + + +class CDAFolderIE(InfoExtractor): + _MAX_PAGE_SIZE = 36 + _VALID_URL = r'https?://(?:www\.)?cda\.pl/(?P<channel>\w+)/folder/(?P<id>\d+)' + _TESTS = [ + { + 'url': 'https://www.cda.pl/domino264/folder/31188385', + 'info_dict': { + 'id': '31188385', + 'title': 'SERIA DRUGA', + }, + 'playlist_mincount': 13, + }, + { + 'url': 'https://www.cda.pl/smiechawaTV/folder/2664592/vfilm', + 'info_dict': { + 'id': '2664592', + 'title': 'VideoDowcipy - wszystkie odcinki', + }, + 'playlist_mincount': 71, + }, + { + 'url': 'https://www.cda.pl/DeliciousBeauty/folder/19129979/vfilm', + 'info_dict': { + 'id': '19129979', + 'title': 'TESTY KOSMETYKÓW', + }, + 'playlist_mincount': 139, + }] + + def _real_extract(self, url): + folder_id, channel = self._match_valid_url(url).group('id', 'channel') + + webpage = self._download_webpage(url, folder_id) + + def extract_page_entries(page): + webpage = self._download_webpage( + f'https://www.cda.pl/{channel}/folder/{folder_id}/vfilm/{page + 1}', folder_id, + f'Downloading page {page + 1}', expected_status=404) + items = re.findall(r'<a[^>]+href="/video/([0-9a-z]+)"', webpage) + for video_id in items: + yield self.url_result(f'https://www.cda.pl/video/{video_id}', CDAIE, video_id) + + return self.playlist_result( + OnDemandPagedList(extract_page_entries, self._MAX_PAGE_SIZE), + folder_id, self._og_search_title(webpage)) From 87408ccfd772ddf31a8323d8151c24f9577cbc9f Mon Sep 17 00:00:00 2001 From: sepro Date: Sun, 20 Oct 2024 23:18:11 +0200 Subject: [PATCH 23/23] [ie/imgur] Fix thumbnail extraction (#11298) Authored by: seproDev --- yt_dlp/extractor/imgur.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/imgur.py b/yt_dlp/extractor/imgur.py index 2a5a1c9e8480..e2644e6a40f8 100644 --- a/yt_dlp/extractor/imgur.py +++ b/yt_dlp/extractor/imgur.py @@ -225,7 +225,10 @@ def og_get_size(media_type): }), get_all=False),
'id': video_id, 'formats': formats, - 'thumbnail': url_or_none(search('thumbnailUrl')), + 'thumbnails': [{ + 'url': thumbnail_url, + 'http_headers': {'Accept': '*/*'}, + }] if (thumbnail_url := search(['thumbnailUrl', 'twitter:image', 'og:image'])) else None, 'http_headers': {'Accept': '*/*'}, }