From b9c979461b244713bf42691a5bc02834e2ba4b2c Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Mon, 24 Mar 2025 16:18:51 -0500 Subject: [PATCH 01/44] [ie/youtube] Fix signature and nsig extraction for player `363db69b` (#12725) Closes #12724 Authored by: bashonly --- test/test_youtube_signature.py | 9 +++++++++ yt_dlp/extractor/youtube/_video.py | 7 +++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 45dc9113bd..453caacd65 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -83,6 +83,11 @@ _SIG_TESTS = [ '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', 'AAOAOq0QJ8wRAIgXmPlOPSBkkUs1bYFYlJCfe29xx8j7vgpDL0QwbdV06sCIEzpWqMGkFR20CFOS21Tp-7vj_EMu-m37KtXJoOy1', ), + ( + 'https://www.youtube.com/s/player/363db69b/player_ias.vflset/en_US/base.js', + '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + '0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpz2ICs6EVdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + ), ] _NSIG_TESTS = [ @@ -234,6 +239,10 @@ _NSIG_TESTS = [ 'https://www.youtube.com/s/player/643afba4/tv-player-ias.vflset/tv-player-ias.js', 'ir9-V6cdbCiyKxhr', '2PL7ZDYAALMfmA', ), + ( + 'https://www.youtube.com/s/player/363db69b/player_ias.vflset/en_US/base.js', + 'eWYu5d5YeY_4LyEDc', 'XJQqf-N7Xra3gg', + ), ] diff --git a/yt_dlp/extractor/youtube/_video.py b/yt_dlp/extractor/youtube/_video.py index c773ba2f11..ee93a599a4 100644 --- a/yt_dlp/extractor/youtube/_video.py +++ b/yt_dlp/extractor/youtube/_video.py @@ -2176,10 +2176,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): """Returns tuple of strings: variable assignment code, variable name, variable value code""" return self._search_regex( r'''(?x) - \'use\s+strict\';\s* + (?P["\'])use\s+strict(?P=q1);\s* (?P 
var\s+(?P[a-zA-Z0-9_$]+)\s*=\s* - (?P"(?:[^"\\]|\\.)+"\.split\("[^"]+"\)) + (?P + (?P["\'])(?:(?!(?P=q2)).|\\.)+(?P=q2) + \.split\((?P["\'])(?:(?!(?P=q3)).)+(?P=q3)\) + ) )[;,] ''', jscode, 'global variable', group=('code', 'name', 'value'), default=(None, None, None)) From 4054a2b623bd1e277b49d2e9abc3d112a4b1c7be Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Mon, 24 Mar 2025 16:22:25 -0500 Subject: [PATCH 02/44] [ie/youtube] Fix PhantomJS nsig fallback (#12728) Also fixes the NSigDeno plugin Closes #12724 Authored by: bashonly --- yt_dlp/extractor/youtube/_video.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/youtube/_video.py b/yt_dlp/extractor/youtube/_video.py index ee93a599a4..b8cc72ab1d 100644 --- a/yt_dlp/extractor/youtube/_video.py +++ b/yt_dlp/extractor/youtube/_video.py @@ -2190,7 +2190,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): global_var, varname, _ = self._extract_player_js_global_var(full_code) if global_var: self.write_debug(f'Prepending n function code with global array variable "{varname}"') - code = global_var + ', ' + code + code = global_var + '; ' + code else: self.write_debug('No global array variable found in player JS') return argnames, re.sub( @@ -2199,7 +2199,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _extract_n_function_code(self, video_id, player_url): player_id = self._extract_player_info(player_url) - func_code = self.cache.load('youtube-nsig', player_id, min_ver='2025.03.21') + func_code = self.cache.load('youtube-nsig', player_id, min_ver='2025.03.24') jscode = func_code or self._load_player(video_id, player_url) jsi = JSInterpreter(jscode) From b7fbb5a0a16a8e8d3e29c29e26ebed677d0d6ea3 Mon Sep 17 00:00:00 2001 From: doe1080 <98906116+doe1080@users.noreply.github.com> Date: Tue, 25 Mar 2025 06:28:09 +0900 Subject: [PATCH 03/44] [ie/vrsquare] Add extractors (#12515) Authored by: doe1080 --- yt_dlp/extractor/_extractors.py | 6 ++ 
yt_dlp/extractor/vrsquare.py | 185 ++++++++++++++++++++++++++++++++ 2 files changed, 191 insertions(+) create mode 100644 yt_dlp/extractor/vrsquare.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index c56ec9df6a..eb914d2eb7 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2392,6 +2392,12 @@ from .voxmedia import ( VoxMediaIE, VoxMediaVolumeIE, ) +from .vrsquare import ( + VrSquareChannelIE, + VrSquareIE, + VrSquareSearchIE, + VrSquareSectionIE, +) from .vrt import ( VRTIE, DagelijkseKostIE, diff --git a/yt_dlp/extractor/vrsquare.py b/yt_dlp/extractor/vrsquare.py new file mode 100644 index 0000000000..9e8740b421 --- /dev/null +++ b/yt_dlp/extractor/vrsquare.py @@ -0,0 +1,185 @@ +import itertools + +from .common import InfoExtractor +from ..networking.exceptions import HTTPError +from ..utils import ( + ExtractorError, + clean_html, + extract_attributes, + parse_duration, + parse_qs, +) +from ..utils.traversal import ( + find_element, + find_elements, + traverse_obj, +) + + +class VrSquareIE(InfoExtractor): + IE_NAME = 'vrsquare' + IE_DESC = 'VR SQUARE' + + _BASE_URL = 'https://livr.jp' + _VALID_URL = r'https?://livr\.jp/contents/(?P[\w-]+)' + _TESTS = [{ + 'url': 'https://livr.jp/contents/P470896661', + 'info_dict': { + 'id': 'P470896661', + 'ext': 'mp4', + 'title': 'そこ曲がったら、櫻坂? 
7年間お疲れ様!菅井友香の卒業を祝う会!前半 2022年11月6日放送分', + 'description': 'md5:523726dc835aa8014dfe1e2b38d36cd1', + 'duration': 1515.0, + 'tags': 'count:2', + 'thumbnail': r're:https?://media\.livr\.jp/vod/img/.+\.jpg', + }, + }, { + 'url': 'https://livr.jp/contents/P589523973', + 'info_dict': { + 'id': 'P589523973', + 'ext': 'mp4', + 'title': '薄闇に仰ぐ しだれ桜の妖艶', + 'description': 'md5:a042f517b2cbb4ed6746707afec4d306', + 'duration': 1084.0, + 'tags': list, + 'thumbnail': r're:https?://media\.livr\.jp/vod/img/.+\.jpg', + }, + 'skip': 'Paid video', + }, { + 'url': 'https://livr.jp/contents/P316939908', + 'info_dict': { + 'id': 'P316939908', + 'ext': 'mp4', + 'title': '2024年5月16日(木) 「今日は誰に恋をする?」公演 小栗有以 生誕祭', + 'description': 'md5:2110bdcf947f28bd7d06ec420e51b619', + 'duration': 8559.0, + 'tags': list, + 'thumbnail': r're:https?://media\.livr\.jp/vod/img/.+\.jpg', + }, + 'skip': 'Premium channel subscribers only', + }, { + # Accessible only in the VR SQUARE app + 'url': 'https://livr.jp/contents/P126481458', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + status = self._download_json( + f'{self._BASE_URL}/webApi/contentsStatus/{video_id}', + video_id, 'Checking contents status', fatal=False) + if traverse_obj(status, 'result_code') == '40407': + self.raise_login_required('Unable to access this video') + + try: + web_api = self._download_json( + f'{self._BASE_URL}/webApi/play/url/{video_id}', video_id) + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 500: + raise ExtractorError('VR SQUARE app-only videos are not supported', expected=True) + raise + + return { + 'id': video_id, + 'title': self._html_search_meta(['og:title', 'twitter:title'], webpage), + 'description': self._html_search_meta('description', webpage), + 'formats': self._extract_m3u8_formats(traverse_obj(web_api, ( + 'urls', ..., 'url', any)), video_id, 'mp4', fatal=False), + 'thumbnail': 
self._html_search_meta('og:image', webpage), + **traverse_obj(webpage, { + 'duration': ({find_element(cls='layout-product-data-time')}, {parse_duration}), + 'tags': ({find_elements(cls='search-tag')}, ..., {clean_html}), + }), + } + + +class VrSquarePlaylistBaseIE(InfoExtractor): + _BASE_URL = 'https://livr.jp' + + def _fetch_vids(self, source, keys=()): + for url_path in traverse_obj(source, ( + *keys, {find_elements(cls='video', html=True)}, ..., + {extract_attributes}, 'data-url', {str}, filter), + ): + yield self.url_result( + f'{self._BASE_URL}/contents/{url_path.removeprefix("/contents/")}', VrSquareIE) + + def _entries(self, path, display_id, query=None): + for page in itertools.count(1): + ajax = self._download_json( + f'{self._BASE_URL}{path}', display_id, + f'Downloading playlist JSON page {page}', + query={'p': page, **(query or {})}) + yield from self._fetch_vids(ajax, ('contents_render_list', ...)) + if not traverse_obj(ajax, (('has_next', 'hasNext'), {bool}, any)): + break + + +class VrSquareChannelIE(VrSquarePlaylistBaseIE): + IE_NAME = 'vrsquare:channel' + + _VALID_URL = r'https?://livr\.jp/channel/(?P\w+)' + _TESTS = [{ + 'url': 'https://livr.jp/channel/H372648599', + 'info_dict': { + 'id': 'H372648599', + 'title': 'AKB48+チャンネル', + }, + 'playlist_mincount': 502, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + + return self.playlist_result( + self._entries(f'/ajax/channel/{playlist_id}', playlist_id), + playlist_id, self._html_search_meta('og:title', webpage)) + + +class VrSquareSearchIE(VrSquarePlaylistBaseIE): + IE_NAME = 'vrsquare:search' + + _VALID_URL = r'https?://livr\.jp/web-search/?\?(?:[^#]+&)?w=[^#]+' + _TESTS = [{ + 'url': 'https://livr.jp/web-search?w=%23%E5%B0%8F%E6%A0%97%E6%9C%89%E4%BB%A5', + 'info_dict': { + 'id': '#小栗有以', + }, + 'playlist_mincount': 60, + }] + + def _real_extract(self, url): + search_query = parse_qs(url)['w'][0] + + return 
self.playlist_result( + self._entries('/ajax/web-search', search_query, {'w': search_query}), search_query) + + +class VrSquareSectionIE(VrSquarePlaylistBaseIE): + IE_NAME = 'vrsquare:section' + + _VALID_URL = r'https?://livr\.jp/(?:category|headline)/(?P\w+)' + _TESTS = [{ + 'url': 'https://livr.jp/category/C133936275', + 'info_dict': { + 'id': 'C133936275', + 'title': 'そこ曲がったら、櫻坂?VR', + }, + 'playlist_mincount': 308, + }, { + 'url': 'https://livr.jp/headline/A296449604', + 'info_dict': { + 'id': 'A296449604', + 'title': 'AKB48 アフターVR', + }, + 'playlist_mincount': 22, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + + return self.playlist_result( + self._fetch_vids(webpage), playlist_id, self._html_search_meta('og:title', webpage)) From 9491b44032b330e05bd5eaa546187005d1e8538e Mon Sep 17 00:00:00 2001 From: sepro Date: Mon, 24 Mar 2025 22:28:47 +0100 Subject: [PATCH 04/44] [utils] `js_to_json`: Make function less fatal (#12715) Authored by: seproDev --- test/test_utils.py | 1 + yt_dlp/utils/_utils.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index 42dc7f937e..e60ceed8fd 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1260,6 +1260,7 @@ class TestUtil(unittest.TestCase): def test_js_to_json_malformed(self): self.assertEqual(js_to_json('42a1'), '42"a1"') self.assertEqual(js_to_json('42a-1'), '42"a"-1') + self.assertEqual(js_to_json('{a: `${e("")}`}'), '{"a": "\\"e\\"(\\"\\")"}') def test_js_to_json_template_literal(self): self.assertEqual(js_to_json('`Hello ${name}`', {'name': '"world"'}), '"Hello world"') diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 0140acaa3a..24525560ef 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -2767,7 +2767,8 @@ def js_to_json(code, vars={}, *, strict=False): def template_substitute(match): evaluated = js_to_json(match.group(1), vars, 
strict=strict) if evaluated[0] == '"': - return json.loads(evaluated) + with contextlib.suppress(json.JSONDecodeError): + return json.loads(evaluated) return evaluated def fix_kv(m): From 5086d4aed6aeb3908c62f49e2d8f74cc0cb05110 Mon Sep 17 00:00:00 2001 From: fireattack Date: Tue, 25 Mar 2025 06:24:09 +0800 Subject: [PATCH 05/44] [ie/generic] Fix MPD base URL parsing (#12718) Closes #12709 Authored by: fireattack --- yt_dlp/extractor/generic.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 67c224e502..c144069b3f 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -16,6 +16,7 @@ from ..utils import ( MEDIA_EXTENSIONS, ExtractorError, UnsupportedError, + base_url, determine_ext, determine_protocol, dict_get, @@ -2531,7 +2532,7 @@ class GenericIE(InfoExtractor): elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag): info_dict['formats'], info_dict['subtitles'] = self._parse_mpd_formats_and_subtitles( doc, - mpd_base_url=full_response.url.rpartition('/')[0], + mpd_base_url=base_url(full_response.url), mpd_url=url) info_dict['live_status'] = 'is_live' if doc.get('type') == 'dynamic' else None self._extra_manifest_info(info_dict, url) From 3396eb50dcd245b49c0f4aecd6e80ec914095d16 Mon Sep 17 00:00:00 2001 From: Subrat Lima <74418100+subrat-lima@users.noreply.github.com> Date: Tue, 25 Mar 2025 03:56:45 +0530 Subject: [PATCH 06/44] [ie/17live:vod] Add extractor (#12723) Closes #12570 Authored by: subrat-lima --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/ichinanalive.py | 58 +++++++++++++++++++++++++++++++- 2 files changed, 58 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index eb914d2eb7..28d410fa86 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -839,6 +839,7 @@ from .icareus import IcareusIE from .ichinanalive import ( IchinanaLiveClipIE, IchinanaLiveIE, + 
IchinanaLiveVODIE, ) from .idolplus import IdolPlusIE from .ign import ( diff --git a/yt_dlp/extractor/ichinanalive.py b/yt_dlp/extractor/ichinanalive.py index a37cfe77bd..475d33593d 100644 --- a/yt_dlp/extractor/ichinanalive.py +++ b/yt_dlp/extractor/ichinanalive.py @@ -1,5 +1,13 @@ + from .common import InfoExtractor -from ..utils import ExtractorError, str_or_none, traverse_obj, unified_strdate +from ..utils import ( + ExtractorError, + int_or_none, + str_or_none, + traverse_obj, + unified_strdate, + url_or_none, +) class IchinanaLiveIE(InfoExtractor): @@ -157,3 +165,51 @@ class IchinanaLiveClipIE(InfoExtractor): 'description': view_data.get('caption'), 'upload_date': unified_strdate(str_or_none(view_data.get('createdAt'))), } + + +class IchinanaLiveVODIE(InfoExtractor): + IE_NAME = '17live:vod' + _VALID_URL = r'https?://(?:www\.)?17\.live/ja/vod/[^/?#]+/(?P[^/?#]+)' + _TESTS = [{ + 'url': 'https://17.live/ja/vod/27323042/2cf84520-e65e-4b22-891e-1d3a00b0f068', + 'md5': '3299b930d7457b069639486998a89580', + 'info_dict': { + 'id': '2cf84520-e65e-4b22-891e-1d3a00b0f068', + 'ext': 'mp4', + 'title': 'md5:b5f8cbf497d54cc6a60eb3b480182f01', + 'uploader': 'md5:29fb12122ab94b5a8495586e7c3085a5', + 'uploader_id': '27323042', + 'channel': '🌟オールナイトニッポン アーカイブ🌟', + 'channel_id': '2b4f85f1-d61e-429d-a901-68d32bdd8645', + 'like_count': int, + 'view_count': int, + 'thumbnail': r're:https?://.+/.+\.(?:jpe?g|png)', + 'duration': 549, + 'description': 'md5:116f326579700f00eaaf5581aae1192e', + 'timestamp': 1741058645, + 'upload_date': '20250304', + }, + }, { + 'url': 'https://17.live/ja/vod/27323042/0de11bac-9bea-40b8-9eab-0239a7d88079', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + json_data = self._download_json(f'https://wap-api.17app.co/api/v1/vods/{video_id}', video_id) + + return traverse_obj(json_data, { + 'id': ('vodID', {str}), + 'title': ('title', {str}), + 'formats': ('vodURL', {lambda x: self._extract_m3u8_formats(x, 
video_id)}), + 'uploader': ('userInfo', 'displayName', {str}), + 'uploader_id': ('userInfo', 'roomID', {int}, {str_or_none}), + 'channel': ('userInfo', 'name', {str}), + 'channel_id': ('userInfo', 'userID', {str}), + 'like_count': ('likeCount', {int_or_none}), + 'view_count': ('viewCount', {int_or_none}), + 'thumbnail': ('imageURL', {url_or_none}), + 'duration': ('duration', {int_or_none}), + 'description': ('description', {str}), + 'timestamp': ('createdAt', {int_or_none}), + }) From 86ab79e1a5182092321102adf6ca34195803b878 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Mon, 24 Mar 2025 17:38:22 -0500 Subject: [PATCH 07/44] [ie] Fix sorting of HLS audio formats by `GROUP-ID` (#12714) Closes #11178 Authored by: bashonly --- test/test_InfoExtractor.py | 22 ++++++++++++++-------- yt_dlp/extractor/common.py | 33 ++++++++++++++++++++++++++++++++- 2 files changed, 46 insertions(+), 9 deletions(-) diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 54f35ef552..c6ff6209a8 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -638,6 +638,7 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'img_bipbop_adv_example_fmp4', 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8', [{ + # 60kbps (bitrate not provided in m3u8); sorted as worst because it's grouped with lowest bitrate video track 'format_id': 'aud1-English', 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/a1/prog_index.m3u8', 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8', @@ -645,15 +646,9 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'ext': 'mp4', 'protocol': 'm3u8_native', 'audio_ext': 'mp4', + 'source_preference': 0, }, { - 'format_id': 'aud2-English', - 'url': 
'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/a2/prog_index.m3u8', - 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8', - 'language': 'en', - 'ext': 'mp4', - 'protocol': 'm3u8_native', - 'audio_ext': 'mp4', - }, { + # 192kbps (bitrate not provided in m3u8) 'format_id': 'aud3-English', 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/a3/prog_index.m3u8', 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8', @@ -661,6 +656,17 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'ext': 'mp4', 'protocol': 'm3u8_native', 'audio_ext': 'mp4', + 'source_preference': 1, + }, { + # 384kbps (bitrate not provided in m3u8); sorted as best because it's grouped with the highest bitrate video track + 'format_id': 'aud2-English', + 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/a2/prog_index.m3u8', + 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8', + 'language': 'en', + 'ext': 'mp4', + 'protocol': 'm3u8_native', + 'audio_ext': 'mp4', + 'source_preference': 2, }, { 'format_id': '530', 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/v2/prog_index.m3u8', diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 0119111816..4c1bc4cf47 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -78,6 +78,7 @@ from ..utils import ( parse_iso8601, parse_m3u8_attributes, parse_resolution, + qualities, sanitize_url, smuggle_url, str_or_none, @@ -2177,6 +2178,8 @@ class InfoExtractor: media_url = media.get('URI') if media_url: manifest_url = format_url(media_url) + is_audio = media_type == 'AUDIO' + is_alternate = media.get('DEFAULT') == 
'NO' or media.get('AUTOSELECT') == 'NO' formats.extend({ 'format_id': join_nonempty(m3u8_id, group_id, name, idx), 'format_note': name, @@ -2189,7 +2192,11 @@ class InfoExtractor: 'preference': preference, 'quality': quality, 'has_drm': has_drm, - 'vcodec': 'none' if media_type == 'AUDIO' else None, + 'vcodec': 'none' if is_audio else None, + # Alternate audio formats (e.g. audio description) should be deprioritized + 'source_preference': -2 if is_audio and is_alternate else None, + # Save this to assign source_preference based on associated video stream + '_audio_group_id': group_id if is_audio and not is_alternate else None, } for idx in _extract_m3u8_playlist_indices(manifest_url)) def build_stream_name(): @@ -2284,6 +2291,8 @@ class InfoExtractor: # ignore references to rendition groups and treat them # as complete formats. if audio_group_id and codecs and f.get('vcodec') != 'none': + # Save this to determine quality of audio formats that only have a GROUP-ID + f['_audio_group_id'] = audio_group_id audio_group = groups.get(audio_group_id) if audio_group and audio_group[0].get('URI'): # TODO: update acodec for audio only formats with @@ -2306,6 +2315,28 @@ class InfoExtractor: formats.append(http_f) last_stream_inf = {} + + # Some audio-only formats only have a GROUP-ID without any other quality/bitrate/codec info + # Each audio GROUP-ID corresponds with one or more video formats' AUDIO attribute + # For sorting purposes, set source_preference based on the quality of the video formats they are grouped with + # See https://github.com/yt-dlp/yt-dlp/issues/11178 + audio_groups_by_quality = orderedSet(f['_audio_group_id'] for f in sorted( + traverse_obj(formats, lambda _, v: v.get('vcodec') != 'none' and v['_audio_group_id']), + key=lambda x: (x.get('tbr') or 0, x.get('width') or 0))) + audio_quality_map = { + audio_groups_by_quality[0]: 'low', + audio_groups_by_quality[-1]: 'high', + } if len(audio_groups_by_quality) > 1 else None + audio_preference = 
qualities(audio_groups_by_quality) + for fmt in formats: + audio_group_id = fmt.pop('_audio_group_id', None) + if not audio_quality_map or not audio_group_id or fmt.get('vcodec') != 'none': + continue + # Use source_preference since quality and preference are set by params + fmt['source_preference'] = audio_preference(audio_group_id) + fmt['format_note'] = join_nonempty( + fmt.get('format_note'), audio_quality_map.get(audio_group_id), delim=', ') + return formats, subtitles def _extract_m3u8_vod_duration( From 801afeac91f97dc0b58cd39cc7e8c50f619dc4e1 Mon Sep 17 00:00:00 2001 From: doe1080 <98906116+doe1080@users.noreply.github.com> Date: Tue, 25 Mar 2025 08:12:09 +0900 Subject: [PATCH 08/44] [ie/streaks] Add extractor (#12679) Authored by: doe1080 --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/streaks.py | 236 ++++++++++++++++++++++++++++++++ 2 files changed, 237 insertions(+) create mode 100644 yt_dlp/extractor/streaks.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 28d410fa86..a44601b14d 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1986,6 +1986,7 @@ from .storyfire import ( StoryFireSeriesIE, StoryFireUserIE, ) +from .streaks import StreaksIE from .streamable import StreamableIE from .streamcz import StreamCZIE from .streetvoice import StreetVoiceIE diff --git a/yt_dlp/extractor/streaks.py b/yt_dlp/extractor/streaks.py new file mode 100644 index 0000000000..1b3718473d --- /dev/null +++ b/yt_dlp/extractor/streaks.py @@ -0,0 +1,236 @@ +import json +import urllib.parse + +from .common import InfoExtractor +from ..networking.exceptions import HTTPError +from ..utils import ( + ExtractorError, + filter_dict, + float_or_none, + join_nonempty, + mimetype2ext, + parse_iso8601, + unsmuggle_url, + update_url_query, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class StreaksBaseIE(InfoExtractor): + _API_URL_TEMPLATE = 
'https://{}.api.streaks.jp/v1/projects/{}/medias/{}{}' + _GEO_BYPASS = False + _GEO_COUNTRIES = ['JP'] + + def _extract_from_streaks_api(self, project_id, media_id, headers=None, query=None, ssai=False): + try: + response = self._download_json( + self._API_URL_TEMPLATE.format('playback', project_id, media_id, ''), + media_id, 'Downloading STREAKS playback API JSON', headers={ + 'Accept': 'application/json', + 'Origin': 'https://players.streaks.jp', + **self.geo_verification_headers(), + **(headers or {}), + }) + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status in {403, 404}: + error = self._parse_json(e.cause.response.read().decode(), media_id, fatal=False) + message = traverse_obj(error, ('message', {str})) + code = traverse_obj(error, ('code', {str})) + if code == 'REQUEST_FAILED': + self.raise_geo_restricted(message, countries=self._GEO_COUNTRIES) + elif code == 'MEDIA_NOT_FOUND': + raise ExtractorError(message, expected=True) + elif code or message: + raise ExtractorError(join_nonempty(code, message, delim=': ')) + raise + + streaks_id = response['id'] + live_status = { + 'clip': 'was_live', + 'file': 'not_live', + 'linear': 'is_live', + 'live': 'is_live', + }.get(response.get('type')) + + formats, subtitles = [], {} + drm_formats = False + + for source in traverse_obj(response, ('sources', lambda _, v: v['src'])): + if source.get('key_systems'): + drm_formats = True + continue + + src_url = source['src'] + is_live = live_status == 'is_live' + ext = mimetype2ext(source.get('type')) + if ext != 'm3u8': + self.report_warning(f'Unsupported stream type: {ext}') + continue + + if is_live and ssai: + session_params = traverse_obj(self._download_json( + self._API_URL_TEMPLATE.format('ssai', project_id, streaks_id, '/ssai/session'), + media_id, 'Downloading session parameters', + headers={'Content-Type': 'application/json', 'Accept': 'application/json'}, + data=json.dumps({'id': source['id']}).encode(), + ), (0, 'query', 
{urllib.parse.parse_qs})) + src_url = update_url_query(src_url, session_params) + + fmts, subs = self._extract_m3u8_formats_and_subtitles( + src_url, media_id, 'mp4', m3u8_id='hls', fatal=False, live=is_live, query=query) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + if not formats and drm_formats: + self.report_drm(media_id) + self._remove_duplicate_formats(formats) + + for subs in traverse_obj(response, ( + 'tracks', lambda _, v: v['kind'] in ('captions', 'subtitles') and url_or_none(v['src']), + )): + lang = traverse_obj(subs, ('srclang', {str.lower})) or 'ja' + subtitles.setdefault(lang, []).append({'url': subs['src']}) + + return { + 'id': streaks_id, + 'display_id': media_id, + 'formats': formats, + 'live_status': live_status, + 'subtitles': subtitles, + 'uploader_id': project_id, + **traverse_obj(response, { + 'title': ('name', {str}), + 'description': ('description', {str}, filter), + 'duration': ('duration', {float_or_none}), + 'modified_timestamp': ('updated_at', {parse_iso8601}), + 'tags': ('tags', ..., {str}), + 'thumbnails': (('poster', 'thumbnail'), 'src', {'url': {url_or_none}}), + 'timestamp': ('created_at', {parse_iso8601}), + }), + } + + +class StreaksIE(StreaksBaseIE): + _VALID_URL = [ + r'https?://players\.streaks\.jp/(?P[\w-]+)/[\da-f]+/index\.html\?(?:[^#]+&)?m=(?P(?:ref:)?[\w-]+)', + r'https?://playback\.api\.streaks\.jp/v1/projects/(?P[\w-]+)/medias/(?P(?:ref:)?[\w-]+)', + ] + _EMBED_REGEX = [rf']*\bsrc\s*=\s*["\'](?P{_VALID_URL[0]})'] + _TESTS = [{ + 'url': 'https://players.streaks.jp/tipness/08155cd19dc14c12bebefb69b92eafcc/index.html?m=dbdf2df35b4d483ebaeeaeb38c594647', + 'info_dict': { + 'id': 'dbdf2df35b4d483ebaeeaeb38c594647', + 'ext': 'mp4', + 'title': '3shunenCM_edit.mp4', + 'display_id': 'dbdf2df35b4d483ebaeeaeb38c594647', + 'duration': 47.533, + 'live_status': 'not_live', + 'modified_date': '20230726', + 'modified_timestamp': 1690356180, + 'timestamp': 1690355996, + 'upload_date': '20230726', + 
'uploader_id': 'tipness', + }, + }, { + 'url': 'https://players.streaks.jp/ktv-web/0298e8964c164ab384c07ef6e08c444b/index.html?m=ref:mycoffeetime_250317', + 'info_dict': { + 'id': 'dccdc079e3fd41f88b0c8435e2d453ab', + 'ext': 'mp4', + 'title': 'わたしの珈琲時間_250317', + 'display_id': 'ref:mycoffeetime_250317', + 'duration': 122.99, + 'live_status': 'not_live', + 'modified_date': '20250310', + 'modified_timestamp': 1741586302, + 'thumbnail': r're:https?://.+\.jpg', + 'timestamp': 1741585839, + 'upload_date': '20250310', + 'uploader_id': 'ktv-web', + }, + }, { + 'url': 'https://playback.api.streaks.jp/v1/projects/ktv-web/medias/b5411938e1e5435dac71edf829dd4813', + 'info_dict': { + 'id': 'b5411938e1e5435dac71edf829dd4813', + 'ext': 'mp4', + 'title': 'KANTELE_SYUSEi_0630', + 'display_id': 'b5411938e1e5435dac71edf829dd4813', + 'live_status': 'not_live', + 'modified_date': '20250122', + 'modified_timestamp': 1737522999, + 'thumbnail': r're:https?://.+\.jpg', + 'timestamp': 1735205137, + 'upload_date': '20241226', + 'uploader_id': 'ktv-web', + }, + }, { + # TVer Olympics: website already down, but api remains accessible + 'url': 'https://playback.api.streaks.jp/v1/projects/tver-olympic/medias/ref:sp_240806_1748_dvr', + 'info_dict': { + 'id': 'c10f7345adb648cf804d7578ab93b2e3', + 'ext': 'mp4', + 'title': 'サッカー 男子 準決勝_dvr', + 'display_id': 'ref:sp_240806_1748_dvr', + 'duration': 12960.0, + 'live_status': 'was_live', + 'modified_date': '20240805', + 'modified_timestamp': 1722896263, + 'timestamp': 1722777618, + 'upload_date': '20240804', + 'uploader_id': 'tver-olympic', + }, + }, { + # TBS FREE: 24-hour stream + 'url': 'https://playback.api.streaks.jp/v1/projects/tbs/medias/ref:simul-02', + 'info_dict': { + 'id': 'c4e83a7b48f4409a96adacec674b4e22', + 'ext': 'mp4', + 'title': str, + 'display_id': 'ref:simul-02', + 'live_status': 'is_live', + 'modified_date': '20241031', + 'modified_timestamp': 1730339858, + 'timestamp': 1705466840, + 'upload_date': '20240117', + 'uploader_id': 
'tbs', + }, + }, { + # DRM protected + 'url': 'https://players.streaks.jp/sp-jbc/a12d7ee0f40c49d6a0a2bff520639677/index.html?m=5f89c62f37ee4a68be8e6e3b1396c7d8', + 'only_matching': True, + }] + _WEBPAGE_TESTS = [{ + 'url': 'https://event.play.jp/playnext2023/', + 'info_dict': { + 'id': '2d975178293140dc8074a7fc536a7604', + 'ext': 'mp4', + 'title': 'PLAY NEXTキームービー(本番)', + 'uploader_id': 'play', + 'duration': 17.05, + 'thumbnail': r're:https?://.+\.jpg', + 'timestamp': 1668387517, + 'upload_date': '20221114', + 'modified_timestamp': 1739411523, + 'modified_date': '20250213', + 'live_status': 'not_live', + }, + }, { + 'url': 'https://wowshop.jp/Page/special/cooking_goods/?bid=wowshop&srsltid=AfmBOor_phUNoPEE_UCPiGGSCMrJE5T2US397smvsbrSdLqUxwON0el4', + 'playlist_mincount': 2, + 'info_dict': { + 'id': '?bid=wowshop&srsltid=AfmBOor_phUNoPEE_UCPiGGSCMrJE5T2US397smvsbrSdLqUxwON0el4', + 'title': 'ワンランク上の料理道具でとびきりの“おいしい”を食卓へ|wowshop', + 'description': 'md5:914b5cb8624fc69274c7fb7b2342958f', + 'age_limit': 0, + 'thumbnail': 'https://wowshop.jp/Page/special/cooking_goods/images/ogp.jpg', + }, + }] + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + project_id, media_id = self._match_valid_url(url).group('project_id', 'id') + + return self._extract_from_streaks_api( + project_id, media_id, headers=filter_dict({ + 'X-Streaks-Api-Key': smuggled_data.get('api_key'), + })) From 66e0bab814e4a52ef3e12d81123ad992a29df50e Mon Sep 17 00:00:00 2001 From: Abdulmohsen <1621552+arabcoders@users.noreply.github.com> Date: Tue, 25 Mar 2025 03:00:22 +0300 Subject: [PATCH 09/44] [ie/TVer] Fix extractor (#12659) Closes #12643, Closes #12282 Authored by: arabcoders, bashonly Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com> --- README.md | 3 + yt_dlp/extractor/tver.py | 149 +++++++++++++++++++++++++++------------ 2 files changed, 106 insertions(+), 46 deletions(-) diff --git a/README.md b/README.md index 6e27b0a348..1ee4223856 100644 --- 
a/README.md +++ b/README.md @@ -1866,6 +1866,9 @@ The following extractors use this feature: #### sonylivseries * `sort_order`: Episode sort order for series extraction - one of `asc` (ascending, oldest first) or `desc` (descending, newest first). Default is `asc` +#### tver +* `backend`: Backend API to use for extraction - one of `streaks` (default) or `brightcove` (deprecated) + **Note**: These options may be changed/removed in the future without concern for backward compatibility diff --git a/yt_dlp/extractor/tver.py b/yt_dlp/extractor/tver.py index f3daf89469..805150db45 100644 --- a/yt_dlp/extractor/tver.py +++ b/yt_dlp/extractor/tver.py @@ -1,31 +1,70 @@ -from .common import InfoExtractor +from .streaks import StreaksBaseIE from ..utils import ( ExtractorError, + int_or_none, join_nonempty, + make_archive_id, smuggle_url, str_or_none, strip_or_none, - traverse_obj, update_url_query, ) +from ..utils.traversal import require, traverse_obj -class TVerIE(InfoExtractor): +class TVerIE(StreaksBaseIE): _VALID_URL = r'https?://(?:www\.)?tver\.jp/(?:(?Plp|corner|series|episodes?|feature)/)+(?P[a-zA-Z0-9]+)' + _GEO_COUNTRIES = ['JP'] + _GEO_BYPASS = False _TESTS = [{ - 'skip': 'videos are only available for 7 days', - 'url': 'https://tver.jp/episodes/ep83nf3w4p', + # via Streaks backend + 'url': 'https://tver.jp/episodes/epc1hdugbk', 'info_dict': { - 'title': '家事ヤロウ!!! 売り場席巻のチーズSP&財前直見×森泉親子の脱東京暮らし密着!', - 'description': 'md5:dc2c06b6acc23f1e7c730c513737719b', - 'series': '家事ヤロウ!!!', - 'episode': '売り場席巻のチーズSP&財前直見×森泉親子の脱東京暮らし密着!', - 'alt_title': '売り場席巻のチーズSP&財前直見×森泉親子の脱東京暮らし密着!', - 'channel': 'テレビ朝日', - 'id': 'ep83nf3w4p', + 'id': 'epc1hdugbk', 'ext': 'mp4', + 'display_id': 'ref:baeebeac-a2a6-4dbf-9eb3-c40d59b40068', + 'title': '神回だけ見せます! #2 壮烈!車大騎馬戦(木曜スペシャル)', + 'alt_title': '神回だけ見せます! 
#2 壮烈!車大騎馬戦(木曜スペシャル) 日テレ', + 'description': 'md5:2726f742d5e3886edeaf72fb6d740fef', + 'uploader_id': 'tver-ntv', + 'channel': '日テレ', + 'duration': 1158.024, + 'thumbnail': 'https://statics.tver.jp/images/content/thumbnail/episode/xlarge/epc1hdugbk.jpg?v=16', + 'series': '神回だけ見せます!', + 'episode': '#2 壮烈!車大騎馬戦(木曜スペシャル)', + 'episode_number': 2, + 'timestamp': 1736486036, + 'upload_date': '20250110', + 'modified_timestamp': 1736870264, + 'modified_date': '20250114', + 'live_status': 'not_live', + 'release_timestamp': 1651453200, + 'release_date': '20220502', + '_old_archive_ids': ['brightcovenew ref:baeebeac-a2a6-4dbf-9eb3-c40d59b40068'], }, - 'add_ie': ['BrightcoveNew'], + }, { + # via Brightcove backend (deprecated) + 'url': 'https://tver.jp/episodes/epc1hdugbk', + 'info_dict': { + 'id': 'ref:baeebeac-a2a6-4dbf-9eb3-c40d59b40068', + 'ext': 'mp4', + 'title': '神回だけ見せます! #2 壮烈!車大騎馬戦(木曜スペシャル)', + 'alt_title': '神回だけ見せます! #2 壮烈!車大騎馬戦(木曜スペシャル) 日テレ', + 'description': 'md5:2726f742d5e3886edeaf72fb6d740fef', + 'uploader_id': '4394098882001', + 'channel': '日テレ', + 'duration': 1158.101, + 'thumbnail': 'https://statics.tver.jp/images/content/thumbnail/episode/xlarge/epc1hdugbk.jpg?v=16', + 'tags': [], + 'series': '神回だけ見せます!', + 'episode': '#2 壮烈!車大騎馬戦(木曜スペシャル)', + 'episode_number': 2, + 'timestamp': 1651388531, + 'upload_date': '20220501', + 'release_timestamp': 1651453200, + 'release_date': '20220502', + }, + 'params': {'extractor_args': {'tver': {'backend': ['brightcove']}}}, }, { 'url': 'https://tver.jp/corner/f0103888', 'only_matching': True, @@ -38,26 +77,7 @@ class TVerIE(InfoExtractor): 'id': 'srtxft431v', 'title': '名探偵コナン', }, - 'playlist': [ - { - 'md5': '779ffd97493ed59b0a6277ea726b389e', - 'info_dict': { - 'id': 'ref:conan-1137-241005', - 'ext': 'mp4', - 'title': '名探偵コナン #1137「行列店、味変の秘密」', - 'uploader_id': '5330942432001', - 'tags': [], - 'channel': '読売テレビ', - 'series': '名探偵コナン', - 'description': 'md5:601fccc1d2430d942a2c8068c4b33eb5', - 'episode': '#1137「行列店、味変の秘密」', 
- 'duration': 1469.077, - 'timestamp': 1728030405, - 'upload_date': '20241004', - 'alt_title': '名探偵コナン #1137「行列店、味変の秘密」 読売テレビ 10月5日(土)放送分', - 'thumbnail': r're:https://.+\.jpg', - }, - }], + 'playlist_mincount': 21, }, { 'url': 'https://tver.jp/series/sru35hwdd2', 'info_dict': { @@ -70,7 +90,11 @@ class TVerIE(InfoExtractor): 'only_matching': True, }] BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' - _HEADERS = {'x-tver-platform-type': 'web'} + _HEADERS = { + 'x-tver-platform-type': 'web', + 'Origin': 'https://tver.jp', + 'Referer': 'https://tver.jp/', + } _PLATFORM_QUERY = {} def _real_initialize(self): @@ -103,6 +127,9 @@ class TVerIE(InfoExtractor): def _real_extract(self, url): video_id, video_type = self._match_valid_url(url).group('id', 'type') + backend = self._configuration_arg('backend', ['streaks'])[0] + if backend not in ('brightcove', 'streaks'): + raise ExtractorError(f'Invalid backend value: {backend}', expected=True) if video_type == 'series': series_info = self._call_platform_api( @@ -129,12 +156,6 @@ class TVerIE(InfoExtractor): video_info = self._download_json( f'https://statics.tver.jp/content/episode/{video_id}.json', video_id, 'Downloading video info', query={'v': version}, headers={'Referer': 'https://tver.jp/'}) - p_id = video_info['video']['accountID'] - r_id = traverse_obj(video_info, ('video', ('videoRefID', 'videoID')), get_all=False) - if not r_id: - raise ExtractorError('Failed to extract reference ID for Brightcove') - if not r_id.isdigit(): - r_id = f'ref:{r_id}' episode = strip_or_none(episode_content.get('title')) series = str_or_none(episode_content.get('seriesTitle')) @@ -161,17 +182,53 @@ class TVerIE(InfoExtractor): ] ] - return { - '_type': 'url_transparent', + metadata = { 'title': title, 'series': series, 'episode': episode, # an another title which is considered "full title" for some viewers 'alt_title': join_nonempty(title, provider, onair_label, delim=' '), 'channel': 
provider, - 'description': str_or_none(video_info.get('description')), 'thumbnails': thumbnails, - 'url': smuggle_url( - self.BRIGHTCOVE_URL_TEMPLATE % (p_id, r_id), {'geo_countries': ['JP']}), - 'ie_key': 'BrightcoveNew', + **traverse_obj(video_info, { + 'description': ('description', {str}), + 'release_timestamp': ('viewStatus', 'startAt', {int_or_none}), + 'episode_number': ('no', {int_or_none}), + }), + } + + brightcove_id = traverse_obj(video_info, ('video', ('videoRefID', 'videoID'), {str}, any)) + if brightcove_id and not brightcove_id.isdecimal(): + brightcove_id = f'ref:{brightcove_id}' + + streaks_id = traverse_obj(video_info, ('streaks', 'videoRefID', {str})) + if streaks_id and not streaks_id.startswith('ref:'): + streaks_id = f'ref:{streaks_id}' + + # Deprecated Brightcove extraction reachable w/extractor-arg or fallback; errors are expected + if backend == 'brightcove' or not streaks_id: + if backend != 'brightcove': + self.report_warning( + 'No STREAKS ID found; falling back to Brightcove extraction', video_id=video_id) + if not brightcove_id: + raise ExtractorError('Unable to extract brightcove reference ID', expected=True) + account_id = traverse_obj(video_info, ( + 'video', 'accountID', {str}, {require('brightcove account ID', expected=True)})) + return { + **metadata, + '_type': 'url_transparent', + 'url': smuggle_url( + self.BRIGHTCOVE_URL_TEMPLATE % (account_id, brightcove_id), + {'geo_countries': ['JP']}), + 'ie_key': 'BrightcoveNew', + } + + return { + **self._extract_from_streaks_api(video_info['streaks']['projectID'], streaks_id, { + 'Origin': 'https://tver.jp', + 'Referer': 'https://tver.jp/', + }), + **metadata, + 'id': video_id, + '_old_archive_ids': [make_archive_id('BrightcoveNew', brightcove_id)] if brightcove_id else None, } From 9dde546e7ee3e1515d88ee3af08b099351455dc0 Mon Sep 17 00:00:00 2001 From: sepro Date: Tue, 25 Mar 2025 01:05:02 +0100 Subject: [PATCH 10/44] [cleanup] Misc (#12694) Authored by: seproDev --- 
yt_dlp/extractor/bokecc.py | 2 +- yt_dlp/extractor/hse.py | 6 +++--- yt_dlp/extractor/moviepilot.py | 4 ++-- yt_dlp/extractor/polskieradio.py | 12 ++++++------ yt_dlp/extractor/redgifs.py | 8 ++++---- yt_dlp/extractor/senategov.py | 4 ++-- yt_dlp/extractor/taptap.py | 6 +++--- yt_dlp/extractor/wykop.py | 10 +++++----- yt_dlp/extractor/youporn.py | 14 +++++++------- yt_dlp/extractor/youtube/_video.py | 2 +- 10 files changed, 34 insertions(+), 34 deletions(-) diff --git a/yt_dlp/extractor/bokecc.py b/yt_dlp/extractor/bokecc.py index 5fe937a6ac..42047aced1 100644 --- a/yt_dlp/extractor/bokecc.py +++ b/yt_dlp/extractor/bokecc.py @@ -24,7 +24,7 @@ class BokeCCBaseIE(InfoExtractor): class BokeCCIE(BokeCCBaseIE): - _IE_DESC = 'CC视频' + IE_DESC = 'CC视频' _VALID_URL = r'https?://union\.bokecc\.com/playvideo\.bo\?(?P.*)' _TESTS = [{ diff --git a/yt_dlp/extractor/hse.py b/yt_dlp/extractor/hse.py index d9004293ff..c3c7bb32e2 100644 --- a/yt_dlp/extractor/hse.py +++ b/yt_dlp/extractor/hse.py @@ -6,7 +6,7 @@ from ..utils import ( ) -class HSEShowBaseInfoExtractor(InfoExtractor): +class HSEShowBaseIE(InfoExtractor): _GEO_COUNTRIES = ['DE'] def _extract_redux_data(self, url, video_id): @@ -28,7 +28,7 @@ class HSEShowBaseInfoExtractor(InfoExtractor): return formats, subtitles -class HSEShowIE(HSEShowBaseInfoExtractor): +class HSEShowIE(HSEShowBaseIE): _VALID_URL = r'https?://(?:www\.)?hse\.de/dpl/c/tv-shows/(?P[0-9]+)' _TESTS = [{ 'url': 'https://www.hse.de/dpl/c/tv-shows/505350', @@ -64,7 +64,7 @@ class HSEShowIE(HSEShowBaseInfoExtractor): } -class HSEProductIE(HSEShowBaseInfoExtractor): +class HSEProductIE(HSEShowBaseIE): _VALID_URL = r'https?://(?:www\.)?hse\.de/dpl/p/product/(?P[0-9]+)' _TESTS = [{ 'url': 'https://www.hse.de/dpl/p/product/408630', diff --git a/yt_dlp/extractor/moviepilot.py b/yt_dlp/extractor/moviepilot.py index ed5be4fa65..3b20232263 100644 --- a/yt_dlp/extractor/moviepilot.py +++ b/yt_dlp/extractor/moviepilot.py @@ -3,8 +3,8 @@ from .dailymotion import 
DailymotionIE class MoviepilotIE(InfoExtractor): - _IE_NAME = 'moviepilot' - _IE_DESC = 'Moviepilot trailer' + IE_NAME = 'moviepilot' + IE_DESC = 'Moviepilot trailer' _VALID_URL = r'https?://(?:www\.)?moviepilot\.de/movies/(?P[^/]+)' _TESTS = [{ diff --git a/yt_dlp/extractor/polskieradio.py b/yt_dlp/extractor/polskieradio.py index 6fb21e156d..9d0496bdf0 100644 --- a/yt_dlp/extractor/polskieradio.py +++ b/yt_dlp/extractor/polskieradio.py @@ -22,7 +22,7 @@ from ..utils import ( ) -class PolskieRadioBaseExtractor(InfoExtractor): +class PolskieRadioBaseIE(InfoExtractor): def _extract_webpage_player_entries(self, webpage, playlist_id, base_data): media_urls = set() @@ -47,7 +47,7 @@ class PolskieRadioBaseExtractor(InfoExtractor): yield entry -class PolskieRadioLegacyIE(PolskieRadioBaseExtractor): +class PolskieRadioLegacyIE(PolskieRadioBaseIE): # legacy sites IE_NAME = 'polskieradio:legacy' _VALID_URL = r'https?://(?:www\.)?polskieradio(?:24)?\.pl/\d+/\d+/[Aa]rtykul/(?P\d+)' @@ -127,7 +127,7 @@ class PolskieRadioLegacyIE(PolskieRadioBaseExtractor): return self.playlist_result(entries, playlist_id, title, description) -class PolskieRadioIE(PolskieRadioBaseExtractor): +class PolskieRadioIE(PolskieRadioBaseIE): # new next.js sites _VALID_URL = r'https?://(?:[^/]+\.)?(?:polskieradio(?:24)?|radiokierowcow)\.pl/artykul/(?P\d+)' _TESTS = [{ @@ -519,7 +519,7 @@ class PolskieRadioPlayerIE(InfoExtractor): } -class PolskieRadioPodcastBaseExtractor(InfoExtractor): +class PolskieRadioPodcastBaseIE(InfoExtractor): _API_BASE = 'https://apipodcasts.polskieradio.pl/api' def _parse_episode(self, data): @@ -539,7 +539,7 @@ class PolskieRadioPodcastBaseExtractor(InfoExtractor): } -class PolskieRadioPodcastListIE(PolskieRadioPodcastBaseExtractor): +class PolskieRadioPodcastListIE(PolskieRadioPodcastBaseIE): IE_NAME = 'polskieradio:podcast:list' _VALID_URL = r'https?://podcasty\.polskieradio\.pl/podcast/(?P\d+)' _TESTS = [{ @@ -578,7 +578,7 @@ class 
PolskieRadioPodcastListIE(PolskieRadioPodcastBaseExtractor): } -class PolskieRadioPodcastIE(PolskieRadioPodcastBaseExtractor): +class PolskieRadioPodcastIE(PolskieRadioPodcastBaseIE): IE_NAME = 'polskieradio:podcast' _VALID_URL = r'https?://podcasty\.polskieradio\.pl/track/(?P[a-f\d]{8}(?:-[a-f\d]{4}){4}[a-f\d]{8})' _TESTS = [{ diff --git a/yt_dlp/extractor/redgifs.py b/yt_dlp/extractor/redgifs.py index 870e33253f..cd3cd323e5 100644 --- a/yt_dlp/extractor/redgifs.py +++ b/yt_dlp/extractor/redgifs.py @@ -12,7 +12,7 @@ from ..utils import ( ) -class RedGifsBaseInfoExtractor(InfoExtractor): +class RedGifsBaseIE(InfoExtractor): _FORMATS = { 'gif': 250, 'sd': 480, @@ -113,7 +113,7 @@ class RedGifsBaseInfoExtractor(InfoExtractor): return page_fetcher(page) if page else OnDemandPagedList(page_fetcher, self._PAGE_SIZE) -class RedGifsIE(RedGifsBaseInfoExtractor): +class RedGifsIE(RedGifsBaseIE): _VALID_URL = r'https?://(?:(?:www\.)?redgifs\.com/(?:watch|ifr)/|thumbs2\.redgifs\.com/)(?P[^-/?#\.]+)' _TESTS = [{ 'url': 'https://www.redgifs.com/watch/squeakyhelplesswisent', @@ -172,7 +172,7 @@ class RedGifsIE(RedGifsBaseInfoExtractor): return self._parse_gif_data(video_info['gif']) -class RedGifsSearchIE(RedGifsBaseInfoExtractor): +class RedGifsSearchIE(RedGifsBaseIE): IE_DESC = 'Redgifs search' _VALID_URL = r'https?://(?:www\.)?redgifs\.com/browse\?(?P[^#]+)' _PAGE_SIZE = 80 @@ -226,7 +226,7 @@ class RedGifsSearchIE(RedGifsBaseInfoExtractor): entries, query_str, tags, f'RedGifs search for {tags}, ordered by {order}') -class RedGifsUserIE(RedGifsBaseInfoExtractor): +class RedGifsUserIE(RedGifsBaseIE): IE_DESC = 'Redgifs user' _VALID_URL = r'https?://(?:www\.)?redgifs\.com/users/(?P[^/?#]+)(?:\?(?P[^#]+))?' 
_PAGE_SIZE = 80 diff --git a/yt_dlp/extractor/senategov.py b/yt_dlp/extractor/senategov.py index efcdb79d0c..dc275e58da 100644 --- a/yt_dlp/extractor/senategov.py +++ b/yt_dlp/extractor/senategov.py @@ -13,7 +13,7 @@ from ..utils.traversal import traverse_obj class SenateISVPIE(InfoExtractor): - _IE_NAME = 'senate.gov:isvp' + IE_NAME = 'senate.gov:isvp' _VALID_URL = r'https?://(?:www\.)?senate\.gov/isvp/?\?(?P.+)' _EMBED_REGEX = [r"]+src=['\"](?Phttps?://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]"] @@ -137,7 +137,7 @@ class SenateISVPIE(InfoExtractor): class SenateGovIE(InfoExtractor): - _IE_NAME = 'senate.gov' + IE_NAME = 'senate.gov' _SUBDOMAIN_RE = '|'.join(map(re.escape, ( 'agriculture', 'aging', 'appropriations', 'armed-services', 'banking', 'budget', 'commerce', 'energy', 'epw', 'finance', 'foreign', 'help', diff --git a/yt_dlp/extractor/taptap.py b/yt_dlp/extractor/taptap.py index e4c31da4e2..f5900194f1 100644 --- a/yt_dlp/extractor/taptap.py +++ b/yt_dlp/extractor/taptap.py @@ -191,12 +191,12 @@ class TapTapAppIE(TapTapBaseIE): }] -class TapTapIntlBase(TapTapBaseIE): +class TapTapIntlBaseIE(TapTapBaseIE): _X_UA = 'V=1&PN=WebAppIntl2&LANG=zh_TW&VN_CODE=115&VN=0.1.0&LOC=CN&PLT=PC&DS=Android&UID={uuid}&CURR=&DT=PC&OS=Windows&OSV=NT%208.0.0' _VIDEO_API = 'https://www.taptap.io/webapiv2/video-resource/v1/multi-get' -class TapTapAppIntlIE(TapTapIntlBase): +class TapTapAppIntlIE(TapTapIntlBaseIE): _VALID_URL = r'https?://www\.taptap\.io/app/(?P\d+)' _INFO_API = 'https://www.taptap.io/webapiv2/i/app/v5/detail' _DATA_PATH = 'app' @@ -227,7 +227,7 @@ class TapTapAppIntlIE(TapTapIntlBase): }] -class TapTapPostIntlIE(TapTapIntlBase): +class TapTapPostIntlIE(TapTapIntlBaseIE): _VALID_URL = r'https?://www\.taptap\.io/post/(?P\d+)' _INFO_API = 'https://www.taptap.io/webapiv2/creation/post/v1/detail' _INFO_QUERY_KEY = 'id_str' diff --git a/yt_dlp/extractor/wykop.py b/yt_dlp/extractor/wykop.py index 2ae0a2a5ed..08cad1fffd 100644 --- a/yt_dlp/extractor/wykop.py +++ 
b/yt_dlp/extractor/wykop.py @@ -11,7 +11,7 @@ from ..utils import ( ) -class WykopBaseExtractor(InfoExtractor): +class WykopBaseIE(InfoExtractor): def _get_token(self, force_refresh=False): if not force_refresh: maybe_cached = self.cache.load('wykop', 'bearer') @@ -72,7 +72,7 @@ class WykopBaseExtractor(InfoExtractor): } -class WykopDigIE(WykopBaseExtractor): +class WykopDigIE(WykopBaseIE): IE_NAME = 'wykop:dig' _VALID_URL = r'https?://(?:www\.)?wykop\.pl/link/(?P\d+)' @@ -128,7 +128,7 @@ class WykopDigIE(WykopBaseExtractor): } -class WykopDigCommentIE(WykopBaseExtractor): +class WykopDigCommentIE(WykopBaseIE): IE_NAME = 'wykop:dig:comment' _VALID_URL = r'https?://(?:www\.)?wykop\.pl/link/(?P\d+)/[^/]+/komentarz/(?P\d+)' @@ -177,7 +177,7 @@ class WykopDigCommentIE(WykopBaseExtractor): } -class WykopPostIE(WykopBaseExtractor): +class WykopPostIE(WykopBaseIE): IE_NAME = 'wykop:post' _VALID_URL = r'https?://(?:www\.)?wykop\.pl/wpis/(?P\d+)' @@ -228,7 +228,7 @@ class WykopPostIE(WykopBaseExtractor): } -class WykopPostCommentIE(WykopBaseExtractor): +class WykopPostCommentIE(WykopBaseIE): IE_NAME = 'wykop:post:comment' _VALID_URL = r'https?://(?:www\.)?wykop\.pl/wpis/(?P\d+)/[^/#]+#(?P\d+)' diff --git a/yt_dlp/extractor/youporn.py b/yt_dlp/extractor/youporn.py index 8eb77aa031..b3c4b76d78 100644 --- a/yt_dlp/extractor/youporn.py +++ b/yt_dlp/extractor/youporn.py @@ -227,7 +227,7 @@ class YouPornIE(InfoExtractor): return result -class YouPornListBase(InfoExtractor): +class YouPornListBaseIE(InfoExtractor): def _get_next_url(self, url, pl_id, html): return urljoin(url, self._search_regex( r''']*?\bhref\s*=\s*("|')(?P(?:(?!\1)[^>])+)\1''', @@ -284,7 +284,7 @@ class YouPornListBase(InfoExtractor): playlist_id=pl_id, playlist_title=title) -class YouPornCategoryIE(YouPornListBase): +class YouPornCategoryIE(YouPornListBaseIE): IE_DESC = 'YouPorn category, with sorting, filtering and pagination' _VALID_URL = r'''(?x) https?://(?:www\.)?youporn\.com/ @@ -319,7 +319,7 @@ class 
YouPornCategoryIE(YouPornListBase): }] -class YouPornChannelIE(YouPornListBase): +class YouPornChannelIE(YouPornListBaseIE): IE_DESC = 'YouPorn channel, with sorting and pagination' _VALID_URL = r'''(?x) https?://(?:www\.)?youporn\.com/ @@ -349,7 +349,7 @@ class YouPornChannelIE(YouPornListBase): return re.sub(r'_', ' ', title_slug).title() -class YouPornCollectionIE(YouPornListBase): +class YouPornCollectionIE(YouPornListBaseIE): IE_DESC = 'YouPorn collection (user playlist), with sorting and pagination' _VALID_URL = r'''(?x) https?://(?:www\.)?youporn\.com/ @@ -394,7 +394,7 @@ class YouPornCollectionIE(YouPornListBase): return playlist -class YouPornTagIE(YouPornListBase): +class YouPornTagIE(YouPornListBaseIE): IE_DESC = 'YouPorn tag (porntags), with sorting, filtering and pagination' _VALID_URL = r'''(?x) https?://(?:www\.)?youporn\.com/ @@ -442,7 +442,7 @@ class YouPornTagIE(YouPornListBase): return super()._real_extract(url) -class YouPornStarIE(YouPornListBase): +class YouPornStarIE(YouPornListBaseIE): IE_DESC = 'YouPorn Pornstar, with description, sorting and pagination' _VALID_URL = r'''(?x) https?://(?:www\.)?youporn\.com/ @@ -493,7 +493,7 @@ class YouPornStarIE(YouPornListBase): } -class YouPornVideosIE(YouPornListBase): +class YouPornVideosIE(YouPornListBaseIE): IE_DESC = 'YouPorn video (browse) playlists, with sorting, filtering and pagination' _VALID_URL = r'''(?x) https?://(?:www\.)?youporn\.com/ diff --git a/yt_dlp/extractor/youtube/_video.py b/yt_dlp/extractor/youtube/_video.py index b8cc72ab1d..b7203fd895 100644 --- a/yt_dlp/extractor/youtube/_video.py +++ b/yt_dlp/extractor/youtube/_video.py @@ -2199,7 +2199,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _extract_n_function_code(self, video_id, player_url): player_id = self._extract_player_info(player_url) - func_code = self.cache.load('youtube-nsig', player_id, min_ver='2025.03.24') + func_code = self.cache.load('youtube-nsig', player_id, min_ver='2025.03.25') jscode = func_code or 
self._load_player(video_id, player_url) jsi = JSInterpreter(jscode) From 336b33e72fe0105a33c40c8e5afdff720e17afdb Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 25 Mar 2025 00:07:18 +0000 Subject: [PATCH 11/44] Release 2025.03.25 Created by: bashonly :ci skip all --- CONTRIBUTORS | 1 + Changelog.md | 27 +++++++++++++++++++++++++++ supportedsites.md | 18 ++++++++++-------- yt_dlp/version.py | 6 +++--- 4 files changed, 41 insertions(+), 11 deletions(-) diff --git a/CONTRIBUTORS b/CONTRIBUTORS index baf3ffceea..ebcdb416b2 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -757,3 +757,4 @@ rysson somini thedenv vallovic +arabcoders diff --git a/Changelog.md b/Changelog.md index b5ad6c6263..e43581455e 100644 --- a/Changelog.md +++ b/Changelog.md @@ -4,6 +4,33 @@ # To create a release, dispatch the https://github.com/yt-dlp/yt-dlp/actions/workflows/release.yml workflow on master --> +### 2025.03.25 + +#### Core changes +- [Fix attribute error on failed VT init](https://github.com/yt-dlp/yt-dlp/commit/b872ffec50fd50f790a5a490e006a369a28a3df3) ([#12696](https://github.com/yt-dlp/yt-dlp/issues/12696)) by [Grub4K](https://github.com/Grub4K) +- **utils**: `js_to_json`: [Make function less fatal](https://github.com/yt-dlp/yt-dlp/commit/9491b44032b330e05bd5eaa546187005d1e8538e) ([#12715](https://github.com/yt-dlp/yt-dlp/issues/12715)) by [seproDev](https://github.com/seproDev) + +#### Extractor changes +- [Fix sorting of HLS audio formats by `GROUP-ID`](https://github.com/yt-dlp/yt-dlp/commit/86ab79e1a5182092321102adf6ca34195803b878) ([#12714](https://github.com/yt-dlp/yt-dlp/issues/12714)) by [bashonly](https://github.com/bashonly) +- **17live**: vod: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/3396eb50dcd245b49c0f4aecd6e80ec914095d16) ([#12723](https://github.com/yt-dlp/yt-dlp/issues/12723)) by [subrat-lima](https://github.com/subrat-lima) +- **9now.com.au**: [Fix 
extractor](https://github.com/yt-dlp/yt-dlp/commit/9d5e6de2e7a47226d1f72c713ad45c88ba01db68) ([#12702](https://github.com/yt-dlp/yt-dlp/issues/12702)) by [bashonly](https://github.com/bashonly) +- **chzzk**: video: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/e2dfccaf808b406d5bcb7dd04ae9ce420752dd6f) ([#12692](https://github.com/yt-dlp/yt-dlp/issues/12692)) by [bashonly](https://github.com/bashonly), [dirkf](https://github.com/dirkf) +- **deezer**: [Remove extractors](https://github.com/yt-dlp/yt-dlp/commit/be5af3f9e91747768c2b41157851bfbe14c663f7) ([#12704](https://github.com/yt-dlp/yt-dlp/issues/12704)) by [seproDev](https://github.com/seproDev) +- **generic**: [Fix MPD base URL parsing](https://github.com/yt-dlp/yt-dlp/commit/5086d4aed6aeb3908c62f49e2d8f74cc0cb05110) ([#12718](https://github.com/yt-dlp/yt-dlp/issues/12718)) by [fireattack](https://github.com/fireattack) +- **streaks**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/801afeac91f97dc0b58cd39cc7e8c50f619dc4e1) ([#12679](https://github.com/yt-dlp/yt-dlp/issues/12679)) by [doe1080](https://github.com/doe1080) +- **tver**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/66e0bab814e4a52ef3e12d81123ad992a29df50e) ([#12659](https://github.com/yt-dlp/yt-dlp/issues/12659)) by [arabcoders](https://github.com/arabcoders), [bashonly](https://github.com/bashonly) +- **viki**: [Remove extractors](https://github.com/yt-dlp/yt-dlp/commit/fe4f14b8369038e7c58f7de546d76de1ce3a91ce) ([#12703](https://github.com/yt-dlp/yt-dlp/issues/12703)) by [seproDev](https://github.com/seproDev) +- **vrsquare**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/b7fbb5a0a16a8e8d3e29c29e26ebed677d0d6ea3) ([#12515](https://github.com/yt-dlp/yt-dlp/issues/12515)) by [doe1080](https://github.com/doe1080) +- **youtube** + - [Fix PhantomJS nsig fallback](https://github.com/yt-dlp/yt-dlp/commit/4054a2b623bd1e277b49d2e9abc3d112a4b1c7be) ([#12728](https://github.com/yt-dlp/yt-dlp/issues/12728)) by 
[bashonly](https://github.com/bashonly) + - [Fix signature and nsig extraction for player `363db69b`](https://github.com/yt-dlp/yt-dlp/commit/b9c979461b244713bf42691a5bc02834e2ba4b2c) ([#12725](https://github.com/yt-dlp/yt-dlp/issues/12725)) by [bashonly](https://github.com/bashonly) + +#### Networking changes +- **Request Handler**: curl_cffi: [Support `curl_cffi` 0.10.x](https://github.com/yt-dlp/yt-dlp/commit/9bf23902ceb948b9685ce1dab575491571720fc6) ([#12670](https://github.com/yt-dlp/yt-dlp/issues/12670)) by [Grub4K](https://github.com/Grub4K) + +#### Misc. changes +- **cleanup**: Miscellaneous: [9dde546](https://github.com/yt-dlp/yt-dlp/commit/9dde546e7ee3e1515d88ee3af08b099351455dc0) by [seproDev](https://github.com/seproDev) + ### 2025.03.21 #### Core changes diff --git a/supportedsites.md b/supportedsites.md index d85325d491..4a0e7519a6 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -7,6 +7,7 @@ The only reliable way to check if a site is supported is to try it. - **17live** - **17live:clip** + - **17live:vod** - **1News**: 1news.co.nz article videos - **1tv**: Первый канал - **20min** @@ -200,7 +201,7 @@ The only reliable way to check if a site is supported is to try it. - **blogger.com** - **Bloomberg** - **Bluesky** - - **BokeCC** + - **BokeCC**: CC视频 - **BongaCams** - **Boosty** - **BostonGlobe** @@ -347,8 +348,6 @@ The only reliable way to check if a site is supported is to try it. - **daystar:clip** - **DBTV** - **DctpTv** - - **DeezerAlbum** - - **DeezerPlaylist** - **democracynow** - **DestinationAmerica** - **DetikEmbed** @@ -829,7 +828,7 @@ The only reliable way to check if a site is supported is to try it. - **MotherlessUploader** - **Motorsport**: motorsport.com (**Currently broken**) - **MovieFap** - - **Moviepilot** + - **moviepilot**: Moviepilot trailer - **MoviewPlay** - **Moviezine** - **MovingImage** @@ -1307,8 +1306,8 @@ The only reliable way to check if a site is supported is to try it. 
- **sejm** - **Sen** - **SenalColombiaLive**: (**Currently broken**) - - **SenateGov** - - **SenateISVP** + - **senate.gov** + - **senate.gov:isvp** - **SendtoNews**: (**Currently broken**) - **Servus** - **Sexu**: (**Currently broken**) @@ -1401,6 +1400,7 @@ The only reliable way to check if a site is supported is to try it. - **StoryFire** - **StoryFireSeries** - **StoryFireUser** + - **Streaks** - **Streamable** - **StreamCZ** - **StreetVoice** @@ -1643,8 +1643,6 @@ The only reliable way to check if a site is supported is to try it. - **viewlift** - **viewlift:embed** - **Viidea** - - **viki**: [*viki*](## "netrc machine") - - **viki:channel**: [*viki*](## "netrc machine") - **vimeo**: [*vimeo*](## "netrc machine") - **vimeo:album**: [*vimeo*](## "netrc machine") - **vimeo:channel**: [*vimeo*](## "netrc machine") @@ -1682,6 +1680,10 @@ The only reliable way to check if a site is supported is to try it. - **vpro**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl - **vqq:series** - **vqq:video** + - **vrsquare**: VR SQUARE + - **vrsquare:channel** + - **vrsquare:search** + - **vrsquare:section** - **VRT**: VRT NWS, Flanders News, Flandern Info and Sporza - **vrtmax**: [*vrtnu*](## "netrc machine") VRT MAX (formerly VRT NU) - **VTM**: (**Currently broken**) diff --git a/yt_dlp/version.py b/yt_dlp/version.py index c12cfc079f..a10719f5a1 100644 --- a/yt_dlp/version.py +++ b/yt_dlp/version.py @@ -1,8 +1,8 @@ # Autogenerated by devscripts/update-version.py -__version__ = '2025.03.21' +__version__ = '2025.03.25' -RELEASE_GIT_HEAD = 'f36e4b6e65cb8403791aae2f520697115cb88dec' +RELEASE_GIT_HEAD = '9dde546e7ee3e1515d88ee3af08b099351455dc0' VARIANT = None @@ -12,4 +12,4 @@ CHANNEL = 'stable' ORIGIN = 'yt-dlp/yt-dlp' -_pkg_version = '2025.03.21' +_pkg_version = '2025.03.25' From a550dfc904a02843a26369ae50dbb7c0febfb30e Mon Sep 17 00:00:00 2001 From: sepro Date: Wed, 26 Mar 2025 00:40:58 +0100 Subject: [PATCH 12/44] [ie/youtube] Fix signature and nsig extraction for player 
`4fcd6e4a` (#12748) Closes #12746 Authored by: seproDev --- test/test_youtube_signature.py | 9 +++++++++ yt_dlp/extractor/youtube/_video.py | 3 ++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 453caacd65..c79fdc9df8 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -88,6 +88,11 @@ _SIG_TESTS = [ '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', '0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpz2ICs6EVdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', ), + ( + 'https://www.youtube.com/s/player/4fcd6e4a/player_ias.vflset/en_US/base.js', + '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + 'wAOAOq0QJ8ARAIgXmPlOPSBkkUs1bYFYlJCfe29xx8q7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0', + ), ] _NSIG_TESTS = [ @@ -243,6 +248,10 @@ _NSIG_TESTS = [ 'https://www.youtube.com/s/player/363db69b/player_ias.vflset/en_US/base.js', 'eWYu5d5YeY_4LyEDc', 'XJQqf-N7Xra3gg', ), + ( + 'https://www.youtube.com/s/player/4fcd6e4a/player_ias.vflset/en_US/base.js', + 'o_L251jm8yhZkWtBW', 'lXoxI3XvToqn6A', + ), ] diff --git a/yt_dlp/extractor/youtube/_video.py b/yt_dlp/extractor/youtube/_video.py index b7203fd895..16e9a3eed5 100644 --- a/yt_dlp/extractor/youtube/_video.py +++ b/yt_dlp/extractor/youtube/_video.py @@ -2182,6 +2182,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?P (?P["\'])(?:(?!(?P=q2)).|\\.)+(?P=q2) \.split\((?P["\'])(?:(?!(?P=q3)).)+(?P=q3)\) + |\[\s*(?:(?P["\'])(?:(?!(?P=q4)).|\\.)*(?P=q4)\s*,?\s*)+\] ) )[;,] ''', jscode, 'global variable', group=('code', 'name', 'value'), default=(None, None, None)) @@ -2199,7 +2200,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _extract_n_function_code(self, video_id, player_url): player_id = self._extract_player_info(player_url) - func_code = self.cache.load('youtube-nsig', player_id, 
min_ver='2025.03.25') + func_code = self.cache.load('youtube-nsig', player_id, min_ver='2025.03.26') jscode = func_code or self._load_player(video_id, player_url) jsi = JSInterpreter(jscode) From ecee97b4fa90d51c48f9154c3a6d5a8ffe46cd5c Mon Sep 17 00:00:00 2001 From: sepro Date: Wed, 26 Mar 2025 00:47:45 +0100 Subject: [PATCH 13/44] [ie/youtube] Only cache nsig code on successful decoding (#12750) Authored by: seproDev, bashonly Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com> --- yt_dlp/extractor/youtube/_video.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/youtube/_video.py b/yt_dlp/extractor/youtube/_video.py index 16e9a3eed5..e349b36517 100644 --- a/yt_dlp/extractor/youtube/_video.py +++ b/yt_dlp/extractor/youtube/_video.py @@ -2087,6 +2087,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return ret return inner + def _load_nsig_code_from_cache(self, player_id): + cache_id = ('nsig code', player_id) + + if func_code := self._player_cache.get(cache_id): + return func_code + + func_code = self.cache.load('youtube-nsig', player_id, min_ver='2025.03.26') + if func_code: + self._player_cache[cache_id] = func_code + + return func_code + + def _store_nsig_code_to_cache(self, player_id, func_code): + cache_id = ('nsig code', player_id) + if cache_id not in self._player_cache: + self.cache.store('youtube-nsig', player_id, func_code) + self._player_cache[cache_id] = func_code + def _decrypt_signature(self, s, video_id, player_url): """Turn the encrypted s field into a working signature""" extract_sig = self._cached( @@ -2127,6 +2145,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_id=video_id, note='Executing signature code').strip() self.write_debug(f'Decrypted nsig {s} => {ret}') + # Only cache nsig func JS code to disk if successful, and only once + self._store_nsig_code_to_cache(player_id, func_code) return ret def _extract_n_function_name(self, jscode, player_url=None): @@ 
-2200,7 +2220,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _extract_n_function_code(self, video_id, player_url): player_id = self._extract_player_info(player_url) - func_code = self.cache.load('youtube-nsig', player_id, min_ver='2025.03.26') + func_code = self._load_nsig_code_from_cache(player_id) jscode = func_code or self._load_player(video_id, player_url) jsi = JSInterpreter(jscode) @@ -2212,7 +2232,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # XXX: Workaround for the global array variable and lack of `typeof` implementation func_code = self._fixup_n_function_code(*jsi.extract_function_code(func_name), jscode) - self.cache.store('youtube-nsig', player_id, func_code) return jsi, player_id, func_code def _extract_n_function_from_code(self, jsi, func_code): From 6eaa574c8217ea9b9c5177fece0714fa622da34c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 26 Mar 2025 00:04:51 +0000 Subject: [PATCH 14/44] Release 2025.03.26 Created by: bashonly :ci skip all --- Changelog.md | 7 +++++++ yt_dlp/version.py | 6 +++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/Changelog.md b/Changelog.md index e43581455e..8542f56fb8 100644 --- a/Changelog.md +++ b/Changelog.md @@ -4,6 +4,13 @@ # To create a release, dispatch the https://github.com/yt-dlp/yt-dlp/actions/workflows/release.yml workflow on master --> +### 2025.03.26 + +#### Extractor changes +- **youtube** + - [Fix signature and nsig extraction for player `4fcd6e4a`](https://github.com/yt-dlp/yt-dlp/commit/a550dfc904a02843a26369ae50dbb7c0febfb30e) ([#12748](https://github.com/yt-dlp/yt-dlp/issues/12748)) by [seproDev](https://github.com/seproDev) + - [Only cache nsig code on successful decoding](https://github.com/yt-dlp/yt-dlp/commit/ecee97b4fa90d51c48f9154c3a6d5a8ffe46cd5c) ([#12750](https://github.com/yt-dlp/yt-dlp/issues/12750)) by [bashonly](https://github.com/bashonly), [seproDev](https://github.com/seproDev) + ### 
2025.03.25 #### Core changes diff --git a/yt_dlp/version.py b/yt_dlp/version.py index a10719f5a1..50c6ed314d 100644 --- a/yt_dlp/version.py +++ b/yt_dlp/version.py @@ -1,8 +1,8 @@ # Autogenerated by devscripts/update-version.py -__version__ = '2025.03.25' +__version__ = '2025.03.26' -RELEASE_GIT_HEAD = '9dde546e7ee3e1515d88ee3af08b099351455dc0' +RELEASE_GIT_HEAD = 'ecee97b4fa90d51c48f9154c3a6d5a8ffe46cd5c' VARIANT = None @@ -12,4 +12,4 @@ CHANNEL = 'stable' ORIGIN = 'yt-dlp/yt-dlp' -_pkg_version = '2025.03.25' +_pkg_version = '2025.03.26' From a8b9ff3c2a0ae25735e580173becc78545b92572 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 27 Mar 2025 17:28:30 -0500 Subject: [PATCH 15/44] [jsinterp] Fix nested attributes and object extraction (#12760) Authored by: bashonly, seproDev Co-authored-by: sepro --- test/test_jsinterp.py | 6 ++++++ yt_dlp/jsinterp.py | 17 +++++++++++------ 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index d3076298cc..b14069ccc6 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -118,6 +118,7 @@ class TestJSInterpreter(unittest.TestCase): self._test('function f(){var x = 20; x = 30 + 1; return x;}', 31) self._test('function f(){var x = 20; x += 30 + 1; return x;}', 51) self._test('function f(){var x = 20; x -= 30 + 1; return x;}', -11) + self._test('function f(){var x = 2; var y = ["a", "b"]; y[x%y["length"]]="z"; return y}', ['z', 'b']) @unittest.skip('Not implemented') def test_comments(self): @@ -403,6 +404,8 @@ class TestJSInterpreter(unittest.TestCase): test_result = list('test') tests = [ 'function f(a, b){return a.split(b)}', + 'function f(a, b){return a["split"](b)}', + 'function f(a, b){let x = ["split"]; return a[x[0]](b)}', 'function f(a, b){return String.prototype.split.call(a, b)}', 'function f(a, b){return String.prototype.split.apply(a, [b])}', ] @@ -441,6 +444,9 @@ class TestJSInterpreter(unittest.TestCase): 
self._test('function f(){return "012345678".slice(-1, 1)}', '') self._test('function f(){return "012345678".slice(-3, -1)}', '67') + def test_splice(self): + self._test('function f(){var T = ["0", "1", "2"]; T["splice"](2, 1, "0")[0]; return T }', ['0', '1', '0']) + def test_js_number_to_string(self): for test, radix, expected in [ (0, None, '0'), diff --git a/yt_dlp/jsinterp.py b/yt_dlp/jsinterp.py index d46b78f64f..b59fb2c615 100644 --- a/yt_dlp/jsinterp.py +++ b/yt_dlp/jsinterp.py @@ -188,6 +188,7 @@ _COMP_OPERATORS = {'===', '!==', '==', '!=', '<=', '>=', '<', '>'} _NAME_RE = r'[a-zA-Z_$][\w$]*' _MATCHING_PARENS = dict(zip(*zip('()', '{}', '[]'))) _QUOTES = '\'"/' +_NESTED_BRACKETS = r'[^[\]]+(?:\[[^[\]]+(?:\[[^\]]+\])?\])?' class JS_Undefined: @@ -606,15 +607,18 @@ class JSInterpreter: m = re.match(fr'''(?x) (?P - (?P{_NAME_RE})(?:\[(?P[^\]]+?)\])?\s* + (?P{_NAME_RE})(?:\[(?P{_NESTED_BRACKETS})\])?\s* (?P{"|".join(map(re.escape, set(_OPERATORS) - _COMP_OPERATORS))})? =(?!=)(?P.*)$ )|(?P (?!if|return|true|false|null|undefined|NaN)(?P{_NAME_RE})$ + )|(?P + (?P{_NAME_RE})(?: + (?P\?)?\.(?P[^(]+)| + \[(?P{_NESTED_BRACKETS})\] + )\s* )|(?P (?P{_NAME_RE})\[(?P.+)\]$ - )|(?P - (?P{_NAME_RE})(?:(?P\?)?\.(?P[^(]+)|\[(?P[^\]]+)\])\s* )|(?P (?P{_NAME_RE})\((?P.*)\)$ )''', expr) @@ -707,7 +711,7 @@ class JSInterpreter: if obj is NO_DEFAULT: if variable not in self._objects: try: - self._objects[variable] = self.extract_object(variable) + self._objects[variable] = self.extract_object(variable, local_vars) except self.Exception: if not nullish: raise @@ -847,7 +851,7 @@ class JSInterpreter: raise self.Exception('Cannot return from an expression', expr) return ret - def extract_object(self, objname): + def extract_object(self, objname, *global_stack): _FUNC_NAME_RE = r'''(?:[a-zA-Z$0-9]+|"[a-zA-Z$0-9]+"|'[a-zA-Z$0-9]+')''' obj = {} obj_m = re.search( @@ -869,7 +873,8 @@ class JSInterpreter: for f in fields_m: argnames = f.group('args').split(',') name = 
remove_quotes(f.group('key')) - obj[name] = function_with_repr(self.build_function(argnames, f.group('code')), f'F<{name}>') + obj[name] = function_with_repr( + self.build_function(argnames, f.group('code'), *global_stack), f'F<{name}>') return obj From 48be862b32648bff5b3e553e40fca4dcc6e88b28 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 27 Mar 2025 17:31:01 -0500 Subject: [PATCH 16/44] [ie/youtube] Make signature and nsig extraction more robust (#12761) Authored by: bashonly, seproDev Co-authored-by: sepro --- test/test_youtube_signature.py | 78 ++++++++++++++++++++++++--- yt_dlp/extractor/youtube/_video.py | 87 ++++++++++++++++++++++-------- 2 files changed, 136 insertions(+), 29 deletions(-) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index c79fdc9df8..0f0885366e 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -88,11 +88,51 @@ _SIG_TESTS = [ '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', '0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpz2ICs6EVdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', ), + ( + 'https://www.youtube.com/s/player/363db69b/player_ias_tce.vflset/en_US/base.js', + '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + '0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpz2ICs6EVdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + ), ( 'https://www.youtube.com/s/player/4fcd6e4a/player_ias.vflset/en_US/base.js', '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', 'wAOAOq0QJ8ARAIgXmPlOPSBkkUs1bYFYlJCfe29xx8q7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0', ), + ( + 'https://www.youtube.com/s/player/4fcd6e4a/player_ias_tce.vflset/en_US/base.js', + 
'2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + 'wAOAOq0QJ8ARAIgXmPlOPSBkkUs1bYFYlJCfe29xx8q7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0', + ), + ( + 'https://www.youtube.com/s/player/20830619/player_ias.vflset/en_US/base.js', + '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + '7AOq0QJ8wRAIgXmPlOPSBkkAs1bYFYlJCfe29xx8jOv1pDL0Q2bdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0qaw', + ), + ( + 'https://www.youtube.com/s/player/20830619/player_ias_tce.vflset/en_US/base.js', + '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + '7AOq0QJ8wRAIgXmPlOPSBkkAs1bYFYlJCfe29xx8jOv1pDL0Q2bdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0qaw', + ), + ( + 'https://www.youtube.com/s/player/20830619/player-plasma-ias-phone-en_US.vflset/base.js', + '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + '7AOq0QJ8wRAIgXmPlOPSBkkAs1bYFYlJCfe29xx8jOv1pDL0Q2bdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0qaw', + ), + ( + 'https://www.youtube.com/s/player/20830619/player-plasma-ias-tablet-en_US.vflset/base.js', + '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + '7AOq0QJ8wRAIgXmPlOPSBkkAs1bYFYlJCfe29xx8jOv1pDL0Q2bdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0qaw', + ), + ( + 'https://www.youtube.com/s/player/8a8ac953/player_ias_tce.vflset/en_US/base.js', + '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + 'IAOAOq0QJ8wRAAgXmPlOPSBkkUs1bYFYlJCfe29xx8j7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_E2u-m37KtXJoOySqa0', + ), + ( + 'https://www.youtube.com/s/player/8a8ac953/tv-player-es6.vflset/tv-player-es6.js', + 
'2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + 'IAOAOq0QJ8wRAAgXmPlOPSBkkUs1bYFYlJCfe29xx8j7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_E2u-m37KtXJoOySqa0', + ), ] _NSIG_TESTS = [ @@ -252,6 +292,30 @@ _NSIG_TESTS = [ 'https://www.youtube.com/s/player/4fcd6e4a/player_ias.vflset/en_US/base.js', 'o_L251jm8yhZkWtBW', 'lXoxI3XvToqn6A', ), + ( + 'https://www.youtube.com/s/player/4fcd6e4a/player_ias_tce.vflset/en_US/base.js', + 'o_L251jm8yhZkWtBW', 'lXoxI3XvToqn6A', + ), + ( + 'https://www.youtube.com/s/player/20830619/tv-player-ias.vflset/tv-player-ias.js', + 'ir9-V6cdbCiyKxhr', '9YE85kNjZiS4', + ), + ( + 'https://www.youtube.com/s/player/20830619/player-plasma-ias-phone-en_US.vflset/base.js', + 'ir9-V6cdbCiyKxhr', '9YE85kNjZiS4', + ), + ( + 'https://www.youtube.com/s/player/20830619/player-plasma-ias-tablet-en_US.vflset/base.js', + 'ir9-V6cdbCiyKxhr', '9YE85kNjZiS4', + ), + ( + 'https://www.youtube.com/s/player/8a8ac953/player_ias_tce.vflset/en_US/base.js', + 'MiBYeXx_vRREbiCCmh', 'RtZYMVvmkE0JE', + ), + ( + 'https://www.youtube.com/s/player/8a8ac953/tv-player-es6.vflset/tv-player-es6.js', + 'MiBYeXx_vRREbiCCmh', 'RtZYMVvmkE0JE', + ), ] @@ -302,33 +366,33 @@ def t_factory(name, sig_func, url_pattern): test_id = re.sub(r'[/.-]', '_', m.group('id') or m.group('compat_id')) def test_func(self): - basename = f'player-{name}-{test_id}.js' + basename = f'player-{test_id}.js' fn = os.path.join(self.TESTDATA_DIR, basename) if not os.path.exists(fn): urllib.request.urlretrieve(url, fn) with open(fn, encoding='utf-8') as testf: jscode = testf.read() - self.assertEqual(sig_func(jscode, sig_input), expected_sig) + self.assertEqual(sig_func(jscode, sig_input, url), expected_sig) test_func.__name__ = f'test_{name}_js_{test_id}' setattr(TestSignature, test_func.__name__, test_func) return make_tfunc -def signature(jscode, sig_input): - func = YoutubeIE(FakeYDL())._parse_sig_js(jscode) +def signature(jscode, sig_input, 
player_url): + func = YoutubeIE(FakeYDL())._parse_sig_js(jscode, player_url) src_sig = ( str(string.printable[:sig_input]) if isinstance(sig_input, int) else sig_input) return func(src_sig) -def n_sig(jscode, sig_input): +def n_sig(jscode, sig_input, player_url): ie = YoutubeIE(FakeYDL()) - funcname = ie._extract_n_function_name(jscode) + funcname = ie._extract_n_function_name(jscode, player_url=player_url) jsi = JSInterpreter(jscode) - func = jsi.extract_function_from_code(*ie._fixup_n_function_code(*jsi.extract_function_code(funcname), jscode)) + func = jsi.extract_function_from_code(*ie._fixup_n_function_code(*jsi.extract_function_code(funcname), jscode, player_url)) return func([sig_input]) diff --git a/yt_dlp/extractor/youtube/_video.py b/yt_dlp/extractor/youtube/_video.py index e349b36517..d86bbaff86 100644 --- a/yt_dlp/extractor/youtube/_video.py +++ b/yt_dlp/extractor/youtube/_video.py @@ -34,6 +34,7 @@ from ...utils import ( clean_html, datetime_from_str, filesize_from_tbr, + filter_dict, float_or_none, format_field, get_first, @@ -1986,12 +1987,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): assert os.path.basename(func_id) == func_id self.write_debug(f'Extracting signature function {func_id}') - cache_spec, code = self.cache.load('youtube-sigfuncs', func_id), None + cache_spec, code = self.cache.load('youtube-sigfuncs', func_id, min_ver='2025.03.27'), None if not cache_spec: code = self._load_player(video_id, player_url) if code: - res = self._parse_sig_js(code) + res = self._parse_sig_js(code, player_url) test_string = ''.join(map(chr, range(len(example_sig)))) cache_spec = [ord(c) for c in res(test_string)] self.cache.store('youtube-sigfuncs', func_id, cache_spec) @@ -2039,7 +2040,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): f' return {expr_code}\n') self.to_screen('Extracted signature function:\n' + code) - def _parse_sig_js(self, jscode): + def _parse_sig_js(self, jscode, player_url): # Examples where `sig` is funcname: # 
sig=function(a){a=a.split(""); ... ;return a.join("")}; # ;c&&(c=sig(decodeURIComponent(c)),a.set(b,encodeURIComponent(c)));return a}; @@ -2063,12 +2064,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P[a-zA-Z0-9$]+)\('), jscode, 'Initial JS player signature function name', group='sig') + varname, global_list = self._interpret_player_js_global_var(jscode, player_url) jsi = JSInterpreter(jscode) - global_var_map = {} - _, varname, value = self._extract_player_js_global_var(jscode) - if varname: - global_var_map[varname] = jsi.interpret_expression(value, {}, allow_recursion=100) - initial_function = jsi.extract_function(funcname, global_var_map) + initial_function = jsi.extract_function(funcname, filter_dict({varname: global_list})) return lambda s: initial_function([s]) def _cached(self, func, *cache_id): @@ -2093,7 +2091,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if func_code := self._player_cache.get(cache_id): return func_code - func_code = self.cache.load('youtube-nsig', player_id, min_ver='2025.03.26') + func_code = self.cache.load('youtube-nsig', player_id, min_ver='2025.03.27') if func_code: self._player_cache[cache_id] = func_code @@ -2150,6 +2148,26 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return ret def _extract_n_function_name(self, jscode, player_url=None): + varname, global_list = self._interpret_player_js_global_var(jscode, player_url) + if debug_str := traverse_obj(global_list, (lambda _, v: v.endswith('_w8_'), any)): + funcname = self._search_regex( + r'''(?xs) + [;\n](?: + (?Pfunction\s+)| + (?:var\s+)? 
+ )(?P[a-zA-Z0-9_$]+)\s*(?(f)|=\s*function\s*) + \((?P[a-zA-Z0-9_$]+)\)\s*\{ + (?:(?!\}[;\n]).)+ + \}\s*catch\(\s*[a-zA-Z0-9_$]+\s*\)\s* + \{\s*return\s+%s\[%d\]\s*\+\s*(?P=argname)\s*\}\s*return\s+[^}]+\}[;\n] + ''' % (re.escape(varname), global_list.index(debug_str)), + jscode, 'nsig function name', group='funcname', default=None) + if funcname: + return funcname + self.write_debug(join_nonempty( + 'Initial search was unable to find nsig function name', + player_url and f' player = {player_url}', delim='\n'), only_once=True) + # Examples (with placeholders nfunc, narray, idx): # * .get("n"))&&(b=nfunc(b) # * .get("n"))&&(b=narray[idx](b) @@ -2179,7 +2197,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if not funcname: self.report_warning(join_nonempty( 'Falling back to generic n function search', - player_url and f' player = {player_url}', delim='\n')) + player_url and f' player = {player_url}', delim='\n'), only_once=True) return self._search_regex( r'''(?xs) ;\s*(?P[a-zA-Z0-9_$]+)\s*=\s*function\([a-zA-Z0-9_$]+\) @@ -2192,9 +2210,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): rf'var {re.escape(funcname)}\s*=\s*(\[.+?\])\s*[,;]', jscode, f'Initial JS player n function list ({funcname}.{idx})')))[int(idx)] - def _extract_player_js_global_var(self, jscode): + def _extract_player_js_global_var(self, jscode, player_url): """Returns tuple of strings: variable assignment code, variable name, variable value code""" - return self._search_regex( + extract_global_var = self._cached(self._search_regex, 'js global array', player_url) + varcode, varname, varvalue = extract_global_var( r'''(?x) (?P["\'])use\s+strict(?P=q1);\s* (?P @@ -2206,17 +2225,41 @@ class YoutubeIE(YoutubeBaseInfoExtractor): ) )[;,] ''', jscode, 'global variable', group=('code', 'name', 'value'), default=(None, None, None)) + if not varcode: + self.write_debug(join_nonempty( + 'No global array variable found in player JS', + player_url and f' player = {player_url}', delim='\n'), only_once=True) + 
return varcode, varname, varvalue - def _fixup_n_function_code(self, argnames, code, full_code): - global_var, varname, _ = self._extract_player_js_global_var(full_code) - if global_var: - self.write_debug(f'Prepending n function code with global array variable "{varname}"') - code = global_var + '; ' + code + def _interpret_player_js_global_var(self, jscode, player_url): + """Returns tuple of: variable name string, variable value list""" + _, varname, array_code = self._extract_player_js_global_var(jscode, player_url) + jsi = JSInterpreter(array_code) + interpret_global_var = self._cached(jsi.interpret_expression, 'js global list', player_url) + return varname, interpret_global_var(array_code, {}, allow_recursion=10) + + def _fixup_n_function_code(self, argnames, nsig_code, jscode, player_url): + varcode, varname, _ = self._extract_player_js_global_var(jscode, player_url) + if varcode and varname: + nsig_code = varcode + '; ' + nsig_code + _, global_list = self._interpret_player_js_global_var(jscode, player_url) else: - self.write_debug('No global array variable found in player JS') - return argnames, re.sub( - rf';\s*if\s*\(\s*typeof\s+[a-zA-Z0-9_$]+\s*===?\s*(?:(["\'])undefined\1|{varname}\[\d+\])\s*\)\s*return\s+{argnames[0]};', - ';', code) + varname = 'dlp_wins' + global_list = [] + + undefined_idx = global_list.index('undefined') if 'undefined' in global_list else r'\d+' + fixed_code = re.sub( + rf'''(?x) + ;\s*if\s*\(\s*typeof\s+[a-zA-Z0-9_$]+\s*===?\s*(?: + (["\'])undefined\1| + {re.escape(varname)}\[{undefined_idx}\] + )\s*\)\s*return\s+{re.escape(argnames[0])}; + ''', ';', nsig_code) + if fixed_code == nsig_code: + self.write_debug(join_nonempty( + 'No typeof statement found in nsig function code', + player_url and f' player = {player_url}', delim='\n'), only_once=True) + return argnames, fixed_code def _extract_n_function_code(self, video_id, player_url): player_id = self._extract_player_info(player_url) @@ -2230,7 +2273,7 @@ class 
YoutubeIE(YoutubeBaseInfoExtractor): func_name = self._extract_n_function_name(jscode, player_url=player_url) # XXX: Workaround for the global array variable and lack of `typeof` implementation - func_code = self._fixup_n_function_code(*jsi.extract_function_code(func_name), jscode) + func_code = self._fixup_n_function_code(*jsi.extract_function_code(func_name), jscode, player_url) return jsi, player_id, func_code From 3ddbebb3c69c9da88d8030d54d9bff9c8f49465c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 27 Mar 2025 23:45:56 +0000 Subject: [PATCH 17/44] Release 2025.03.27 Created by: bashonly :ci skip all --- Changelog.md | 8 ++++++++ yt_dlp/version.py | 6 +++--- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/Changelog.md b/Changelog.md index 8542f56fb8..9ceb94ddac 100644 --- a/Changelog.md +++ b/Changelog.md @@ -4,6 +4,14 @@ # To create a release, dispatch the https://github.com/yt-dlp/yt-dlp/actions/workflows/release.yml workflow on master --> +### 2025.03.27 + +#### Core changes +- **jsinterp**: [Fix nested attributes and object extraction](https://github.com/yt-dlp/yt-dlp/commit/a8b9ff3c2a0ae25735e580173becc78545b92572) ([#12760](https://github.com/yt-dlp/yt-dlp/issues/12760)) by [bashonly](https://github.com/bashonly), [seproDev](https://github.com/seproDev) + +#### Extractor changes +- **youtube**: [Make signature and nsig extraction more robust](https://github.com/yt-dlp/yt-dlp/commit/48be862b32648bff5b3e553e40fca4dcc6e88b28) ([#12761](https://github.com/yt-dlp/yt-dlp/issues/12761)) by [bashonly](https://github.com/bashonly), [seproDev](https://github.com/seproDev) + ### 2025.03.26 #### Extractor changes diff --git a/yt_dlp/version.py b/yt_dlp/version.py index 50c6ed314d..5893a099be 100644 --- a/yt_dlp/version.py +++ b/yt_dlp/version.py @@ -1,8 +1,8 @@ # Autogenerated by devscripts/update-version.py -__version__ = '2025.03.26' +__version__ = '2025.03.27' 
-RELEASE_GIT_HEAD = 'ecee97b4fa90d51c48f9154c3a6d5a8ffe46cd5c' +RELEASE_GIT_HEAD = '48be862b32648bff5b3e553e40fca4dcc6e88b28' VARIANT = None @@ -12,4 +12,4 @@ CHANNEL = 'stable' ORIGIN = 'yt-dlp/yt-dlp' -_pkg_version = '2025.03.26' +_pkg_version = '2025.03.27' From 6a6d97b2cbc78f818de05cc96edcdcfd52caa259 Mon Sep 17 00:00:00 2001 From: coletdjnz Date: Sat, 29 Mar 2025 11:13:09 +1300 Subject: [PATCH 18/44] [ie/youtube:tab] Fix playlist continuation extraction (#12777) Fixes https://github.com/yt-dlp/yt-dlp/issues/12759 Authored by: coletdjnz --- yt_dlp/extractor/youtube/_base.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/youtube/_base.py b/yt_dlp/extractor/youtube/_base.py index ac0604a9cf..8ca7d898e9 100644 --- a/yt_dlp/extractor/youtube/_base.py +++ b/yt_dlp/extractor/youtube/_base.py @@ -803,12 +803,14 @@ class YoutubeBaseInfoExtractor(InfoExtractor): @classmethod def _extract_continuation_ep_data(cls, continuation_ep: dict): - if isinstance(continuation_ep, dict): - continuation = try_get( - continuation_ep, lambda x: x['continuationCommand']['token'], str) + continuation_commands = traverse_obj( + continuation_ep, ('commandExecutorCommand', 'commands', ..., {dict})) + continuation_commands.append(continuation_ep) + for command in continuation_commands: + continuation = traverse_obj(command, ('continuationCommand', 'token', {str})) if not continuation: - return - ctp = continuation_ep.get('clickTrackingParams') + continue + ctp = command.get('clickTrackingParams') return cls._build_api_continuation_query(continuation, ctp) @classmethod From 22e34adbd741e1c7072015debd615dc3fb71c401 Mon Sep 17 00:00:00 2001 From: sepro Date: Mon, 31 Mar 2025 00:38:46 +0200 Subject: [PATCH 19/44] Add `--compat-options 2024` (#12789) Authored by: seproDev --- README.md | 3 ++- yt_dlp/options.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 1ee4223856..0767221e7a 100644 --- 
a/README.md +++ b/README.md @@ -2239,7 +2239,8 @@ For ease of use, a few more compat options are available: * `--compat-options youtube-dlc`: Same as `--compat-options all,-no-live-chat,-no-youtube-channel-redirect,-playlist-match-filter,-manifest-filesize-approx,-allow-unsafe-ext,-prefer-vp9-sort` * `--compat-options 2021`: Same as `--compat-options 2022,no-certifi,filename-sanitization,no-youtube-prefer-utc-upload-date` * `--compat-options 2022`: Same as `--compat-options 2023,playlist-match-filter,no-external-downloader-progress,prefer-legacy-http-handler,manifest-filesize-approx` -* `--compat-options 2023`: Same as `--compat-options prefer-vp9-sort`. Use this to enable all future compat options +* `--compat-options 2023`: Same as `--compat-options 2024,prefer-vp9-sort` +* `--compat-options 2024`: Currently does nothing. Use this to enable all future compat options The following compat options restore vulnerable behavior from before security patches: diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 91c2635a79..1742cbdfaf 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -500,7 +500,8 @@ def create_parser(): 'youtube-dlc': ['all', '-no-youtube-channel-redirect', '-no-live-chat', '-playlist-match-filter', '-manifest-filesize-approx', '-allow-unsafe-ext', '-prefer-vp9-sort'], '2021': ['2022', 'no-certifi', 'filename-sanitization'], '2022': ['2023', 'no-external-downloader-progress', 'playlist-match-filter', 'prefer-legacy-http-handler', 'manifest-filesize-approx'], - '2023': ['prefer-vp9-sort'], + '2023': ['2024', 'prefer-vp9-sort'], + '2024': [], }, }, help=( 'Options that can help keep compatibility with youtube-dl or youtube-dlc ' From 29560359120f28adaaac67c86fa8442eb72daa0d Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 30 Mar 2025 17:54:55 -0500 Subject: [PATCH 20/44] [ie/sbs] Fix subtitles extraction (#12785) Closes #12783 Authored by: bashonly --- yt_dlp/extractor/sbs.py | 11 ++++++++++- 1 
file changed, 10 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/sbs.py b/yt_dlp/extractor/sbs.py index 8d61e22fce..7edb5214ee 100644 --- a/yt_dlp/extractor/sbs.py +++ b/yt_dlp/extractor/sbs.py @@ -122,6 +122,15 @@ class SBSIE(InfoExtractor): if traverse_obj(media, ('partOfSeries', {dict})): media['epName'] = traverse_obj(media, ('title', {str})) + # Need to set different language for forced subs or else they have priority over full subs + fixed_subtitles = {} + for lang, subs in subtitles.items(): + for sub in subs: + fixed_lang = lang + if sub['url'].lower().endswith('_fe.vtt'): + fixed_lang += '-forced' + fixed_subtitles.setdefault(fixed_lang, []).append(sub) + return { 'id': video_id, **traverse_obj(media, { @@ -151,6 +160,6 @@ class SBSIE(InfoExtractor): }), }), 'formats': formats, - 'subtitles': subtitles, + 'subtitles': fixed_subtitles, 'uploader': 'SBSC', } From 9a1ec1d36e172d252714cef712a6d091e0a0c4f2 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 30 Mar 2025 18:02:59 -0500 Subject: [PATCH 21/44] [ie/generic] Validate response before checking m3u8 live status (#12784) Closes #12744 Authored by: bashonly --- yt_dlp/extractor/generic.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index c144069b3f..b0c7be4627 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -2214,10 +2214,21 @@ class GenericIE(InfoExtractor): if is_live is not None: info['live_status'] = 'not_live' if is_live == 'false' else 'is_live' return - headers = m3u8_format.get('http_headers') or info.get('http_headers') - duration = self._extract_m3u8_vod_duration( - m3u8_format['url'], info.get('id'), note='Checking m3u8 live status', - errnote='Failed to download m3u8 media playlist', headers=headers) + headers = m3u8_format.get('http_headers') or info.get('http_headers') or {} + display_id = info.get('id') + 
urlh = self._request_webpage( + m3u8_format['url'], display_id, 'Checking m3u8 live status', errnote=False, + headers={**headers, 'Accept-Encoding': 'identity'}, fatal=False) + if urlh is False: + return + first_bytes = urlh.read(512) + if not first_bytes.startswith(b'#EXTM3U'): + return + m3u8_doc = self._webpage_read_content( + urlh, urlh.url, display_id, prefix=first_bytes, fatal=False, errnote=False) + if not m3u8_doc: + return + duration = self._parse_m3u8_vod_duration(m3u8_doc, display_id) if not duration: info['live_status'] = 'is_live' info['duration'] = info.get('duration') or duration From f033d86b96b36f8c5289dd7c3304f42d4d9f6ff4 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 30 Mar 2025 18:28:14 -0500 Subject: [PATCH 22/44] [ie/mlbtv] Fix radio-only extraction (#12792) Authored by: bashonly --- yt_dlp/extractor/mlb.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/yt_dlp/extractor/mlb.py b/yt_dlp/extractor/mlb.py index 935bf85615..f7726e5b11 100644 --- a/yt_dlp/extractor/mlb.py +++ b/yt_dlp/extractor/mlb.py @@ -449,9 +449,7 @@ mutation initPlaybackSession( if not (m3u8_url and token): errors = '; '.join(traverse_obj(response, ('errors', ..., 'message', {str}))) - if 'not entitled' in errors: - raise ExtractorError(errors, expected=True) - elif errors: # Only warn when 'blacked out' since radio formats are available + if errors: # Only warn when 'blacked out' or 'not entitled'; radio formats may be available self.report_warning(f'API returned errors for {format_id}: {errors}') else: self.report_warning(f'No formats available for {format_id} broadcast; skipping') From 5fc521cbd0ce7b2410d0935369558838728e205d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miroslav=20Bend=C3=ADk?= Date: Mon, 31 Mar 2025 21:04:52 +0200 Subject: [PATCH 23/44] [ie/stvr] Rename extractor from RTVS to STVR (#12788) Authored by: mireq --- yt_dlp/extractor/rtvs.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 
deletions(-) diff --git a/yt_dlp/extractor/rtvs.py b/yt_dlp/extractor/rtvs.py index 927da57787..fcbc88a9f9 100644 --- a/yt_dlp/extractor/rtvs.py +++ b/yt_dlp/extractor/rtvs.py @@ -9,7 +9,9 @@ from ..utils import ( class RTVSIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?rtvs\.sk/(?:radio|televizia)/archiv(?:/\d+)?/(?P\d+)/?(?:[#?]|$)' + IE_NAME = 'stvr' + IE_DESC = 'Slovak Television and Radio (formerly RTVS)' + _VALID_URL = r'https?://(?:www\.)?(?:rtvs|stvr)\.sk/(?:radio|televizia)/archiv(?:/\d+)?/(?P\d+)/?(?:[#?]|$)' _TESTS = [{ # radio archive 'url': 'http://www.rtvs.sk/radio/archiv/11224/414872', @@ -19,7 +21,7 @@ class RTVSIE(InfoExtractor): 'ext': 'mp3', 'title': 'Ostrov pokladov 1 časť.mp3', 'duration': 2854, - 'thumbnail': 'https://www.rtvs.sk/media/a501/image/file/2/0000/b1R8.rtvs.jpg', + 'thumbnail': 'https://www.stvr.sk/media/a501/image/file/2/0000/rtvs-00009383.png', 'display_id': '135331', }, }, { @@ -30,7 +32,7 @@ class RTVSIE(InfoExtractor): 'ext': 'mp4', 'title': 'Amaro Džives - Náš deň', 'description': 'Galavečer pri príležitosti Medzinárodného dňa Rómov.', - 'thumbnail': 'https://www.rtvs.sk/media/a501/image/file/2/0031/L7Qm.amaro_dzives_png.jpg', + 'thumbnail': 'https://www.stvr.sk/media/a501/image/file/2/0031/L7Qm.amaro_dzives_png.jpg', 'timestamp': 1428555900, 'upload_date': '20150409', 'duration': 4986, @@ -47,8 +49,11 @@ class RTVSIE(InfoExtractor): 'display_id': '307655', 'duration': 831, 'upload_date': '20211111', - 'thumbnail': 'https://www.rtvs.sk/media/a501/image/file/2/0916/robin.jpg', + 'thumbnail': 'https://www.stvr.sk/media/a501/image/file/2/0916/robin.jpg', }, + }, { + 'url': 'https://www.stvr.sk/radio/archiv/11224/414872', + 'only_matching': True, }] def _real_extract(self, url): From bb321cfdc3fd4400598ddb12a15862bc2ac8fc10 Mon Sep 17 00:00:00 2001 From: Muhammad Labeeb <72980976+mlabeeb03@users.noreply.github.com> Date: Tue, 1 Apr 2025 00:06:33 +0500 Subject: [PATCH 24/44] [ie/francaisfacile] Add extractor (#12787) 
Authored by: mlabeeb03 --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/francaisfacile.py | 87 ++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+) create mode 100644 yt_dlp/extractor/francaisfacile.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index a44601b14d..679a0a6a61 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -683,6 +683,7 @@ from .foxnews import ( ) from .foxsports import FoxSportsIE from .fptplay import FptplayIE +from .francaisfacile import FrancaisFacileIE from .franceinter import FranceInterIE from .francetv import ( FranceTVIE, diff --git a/yt_dlp/extractor/francaisfacile.py b/yt_dlp/extractor/francaisfacile.py new file mode 100644 index 0000000000..d3208c2828 --- /dev/null +++ b/yt_dlp/extractor/francaisfacile.py @@ -0,0 +1,87 @@ +import urllib.parse + +from .common import InfoExtractor +from ..networking.exceptions import HTTPError +from ..utils import ( + ExtractorError, + float_or_none, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class FrancaisFacileIE(InfoExtractor): + _VALID_URL = r'https?://francaisfacile\.rfi\.fr/[a-z]{2}/(?:actualit%C3%A9|podcasts/[^/#?]+)/(?P[^/#?]+)' + _TESTS = [{ + 'url': 'https://francaisfacile.rfi.fr/fr/actualit%C3%A9/20250305-r%C3%A9concilier-les-jeunes-avec-la-lecture-gr%C3%A2ce-aux-r%C3%A9seaux-sociaux', + 'md5': '4f33674cb205744345cc835991100afa', + 'info_dict': { + 'id': 'WBMZ58952-FLE-FR-20250305', + 'display_id': '20250305-réconcilier-les-jeunes-avec-la-lecture-grâce-aux-réseaux-sociaux', + 'title': 'Réconcilier les jeunes avec la lecture grâce aux réseaux sociaux', + 'url': 'https://aod-fle.akamaized.net/fle/sounds/fr/2025/03/05/6b6af52a-f9ba-11ef-a1f8-005056a97652.mp3', + 'ext': 'mp3', + 'description': 'md5:b903c63d8585bd59e8cc4d5f80c4272d', + 'duration': 103.15, + 'timestamp': 1741177984, + 'upload_date': '20250305', + }, + }, { + 'url': 
'https://francaisfacile.rfi.fr/fr/actualit%C3%A9/20250307-argentine-le-sac-d-un-alpiniste-retrouv%C3%A9-40-ans-apr%C3%A8s-sa-mort', + 'md5': 'b8c3a63652d4ae8e8092dda5700c1cd9', + 'info_dict': { + 'id': 'WBMZ59102-FLE-FR-20250307', + 'display_id': '20250307-argentine-le-sac-d-un-alpiniste-retrouvé-40-ans-après-sa-mort', + 'title': 'Argentine: le sac d\'un alpiniste retrouvé 40 ans après sa mort', + 'url': 'https://aod-fle.akamaized.net/fle/sounds/fr/2025/03/07/8edf4082-fb46-11ef-8a37-005056bf762b.mp3', + 'ext': 'mp3', + 'description': 'md5:7fd088fbdf4a943bb68cf82462160dca', + 'duration': 117.74, + 'timestamp': 1741352789, + 'upload_date': '20250307', + }, + }, { + 'url': 'https://francaisfacile.rfi.fr/fr/podcasts/un-mot-une-histoire/20250317-le-mot-de-david-foenkinos-peut-%C3%AAtre', + 'md5': 'db83c2cc2589b4c24571c6b6cf14f5f1', + 'info_dict': { + 'id': 'WBMZ59441-FLE-FR-20250317', + 'display_id': '20250317-le-mot-de-david-foenkinos-peut-être', + 'title': 'Le mot de David Foenkinos: «peut-être» - Un mot, une histoire', + 'url': 'https://aod-fle.akamaized.net/fle/sounds/fr/2025/03/17/4ca6cbbe-0315-11f0-a85b-005056a97652.mp3', + 'ext': 'mp3', + 'description': 'md5:3fe35fae035803df696bfa7af2496e49', + 'duration': 198.96, + 'timestamp': 1742210897, + 'upload_date': '20250317', + }, + }] + + def _real_extract(self, url): + display_id = urllib.parse.unquote(self._match_id(url)) + + try: # yt-dlp's default user-agents are too old and blocked by the site + webpage = self._download_webpage(url, display_id, headers={ + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:136.0) Gecko/20100101 Firefox/136.0', + }) + except ExtractorError as e: + if not isinstance(e.cause, HTTPError) or e.cause.status != 403: + raise + # Retry with impersonation if hardcoded UA is insufficient + webpage = self._download_webpage(url, display_id, impersonate=True) + + data = self._search_json( + r']+\bdata-media-id=[^>]+\btype="application/json"[^>]*>', + webpage, 'audio data', display_id) + + return { 
+ 'id': data['mediaId'], + 'display_id': display_id, + 'vcodec': 'none', + 'title': self._html_extract_title(webpage), + **self._search_json_ld(webpage, display_id, fatal=False), + **traverse_obj(data, { + 'title': ('title', {str}), + 'url': ('sources', ..., 'url', {url_or_none}, any), + 'duration': ('sources', ..., 'duration', {float_or_none}, any), + }), + } From d63696f23a341ee36a3237ccb5d5e14b34c2c579 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Mon, 31 Mar 2025 14:21:44 -0500 Subject: [PATCH 25/44] [ie/MicrosoftLearnEpisode] Extract more formats (#12799) Closes #12798 Authored by: bashonly --- yt_dlp/extractor/microsoftembed.py | 44 +++++++++++++++++++++++++++--- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/microsoftembed.py b/yt_dlp/extractor/microsoftembed.py index 2575d6c5e4..9d58fa0a60 100644 --- a/yt_dlp/extractor/microsoftembed.py +++ b/yt_dlp/extractor/microsoftembed.py @@ -4,6 +4,7 @@ from .common import InfoExtractor from ..utils import ( int_or_none, parse_iso8601, + parse_resolution, traverse_obj, unified_timestamp, url_basename, @@ -83,8 +84,8 @@ class MicrosoftMediusBaseIE(InfoExtractor): subtitles.setdefault(sub.pop('tag', 'und'), []).append(sub) return subtitles - def _extract_ism(self, ism_url, video_id): - formats = self._extract_ism_formats(ism_url, video_id) + def _extract_ism(self, ism_url, video_id, fatal=True): + formats = self._extract_ism_formats(ism_url, video_id, fatal=fatal) for fmt in formats: if fmt['language'] != 'eng' and 'English' not in fmt['format_id']: fmt['language_preference'] = -10 @@ -218,9 +219,21 @@ class MicrosoftLearnEpisodeIE(MicrosoftMediusBaseIE): 'description': 'md5:7bbbfb593d21c2cf2babc3715ade6b88', 'timestamp': 1676339547, 'upload_date': '20230214', - 'thumbnail': r're:https://learn\.microsoft\.com/video/media/.*\.png', + 'thumbnail': r're:https://learn\.microsoft\.com/video/media/.+\.png', 'subtitles': 'count:14', }, + }, { + 'url': 
'https://learn.microsoft.com/en-gb/shows/on-demand-instructor-led-training-series/az-900-module-1', + 'info_dict': { + 'id': '4fe10f7c-d83c-463b-ac0e-c30a8195e01b', + 'ext': 'mp4', + 'title': 'AZ-900 Cloud fundamentals (1 of 6)', + 'description': 'md5:3c2212ce865e9142f402c766441bd5c9', + 'thumbnail': r're:https://.+/.+\.jpg', + 'timestamp': 1706605184, + 'upload_date': '20240130', + }, + 'params': {'format': 'bv[protocol=https]'}, }] def _real_extract(self, url): @@ -230,9 +243,32 @@ class MicrosoftLearnEpisodeIE(MicrosoftMediusBaseIE): entry_id = self._html_search_meta('entryId', webpage, 'entryId', fatal=True) video_info = self._download_json( f'https://learn.microsoft.com/api/video/public/v1/entries/{entry_id}', video_id) + + formats = [] + if ism_url := traverse_obj(video_info, ('publicVideo', 'adaptiveVideoUrl', {url_or_none})): + formats.extend(self._extract_ism(ism_url, video_id, fatal=False)) + if hls_url := traverse_obj(video_info, ('publicVideo', 'adaptiveVideoHLSUrl', {url_or_none})): + formats.extend(self._extract_m3u8_formats(hls_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + if mpd_url := traverse_obj(video_info, ('publicVideo', 'adaptiveVideoDashUrl', {url_or_none})): + formats.extend(self._extract_mpd_formats(mpd_url, video_id, mpd_id='dash', fatal=False)) + for key in ('low', 'medium', 'high'): + if video_url := traverse_obj(video_info, ('publicVideo', f'{key}QualityVideoUrl', {url_or_none})): + formats.append({ + 'url': video_url, + 'format_id': f'video-http-{key}', + 'acodec': 'none', + **parse_resolution(video_url), + }) + if audio_url := traverse_obj(video_info, ('publicVideo', 'audioUrl', {url_or_none})): + formats.append({ + 'url': audio_url, + 'format_id': 'audio-http', + 'vcodec': 'none', + }) + return { 'id': entry_id, - 'formats': self._extract_ism(video_info['publicVideo']['adaptiveVideoUrl'], video_id), + 'formats': formats, 'subtitles': self._sub_to_dict(traverse_obj(video_info, ( 'publicVideo', 'captions', lambda _, v: 
url_or_none(v['url']), { 'tag': ('language', {str}), From e465b078ead75472fcb7b86f6ccaf2b5d3bc4c21 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Mon, 31 Mar 2025 14:25:10 -0500 Subject: [PATCH 26/44] [ie/on24] Support `mainEvent` URLs (#12800) Closes #12782 Authored by: bashonly --- yt_dlp/extractor/on24.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/yt_dlp/extractor/on24.py b/yt_dlp/extractor/on24.py index 05218e9de1..1dfc25a7da 100644 --- a/yt_dlp/extractor/on24.py +++ b/yt_dlp/extractor/on24.py @@ -11,12 +11,15 @@ class On24IE(InfoExtractor): IE_NAME = 'on24' IE_DESC = 'ON24' - _VALID_URL = r'''(?x) - https?://event\.on24\.com/(?: - wcc/r/(?P\d{7})/(?P[0-9A-F]{32})| - eventRegistration/(?:console/EventConsoleApollo|EventLobbyServlet\?target=lobby30) - \.jsp\?(?:[^/#?]*&)?eventid=(?P\d{7})[^/#?]*&key=(?P[0-9A-F]{32}) - )''' + _ID_RE = r'(?P\d{7})' + _KEY_RE = r'(?P[0-9A-F]{32})' + _URL_BASE_RE = r'https?://event\.on24\.com' + _URL_QUERY_RE = rf'(?:[^#]*&)?eventid={_ID_RE}&(?:[^#]+&)?key={_KEY_RE}' + _VALID_URL = [ + rf'{_URL_BASE_RE}/wcc/r/{_ID_RE}/{_KEY_RE}', + rf'{_URL_BASE_RE}/eventRegistration/console/(?:EventConsoleApollo\.jsp|apollox/mainEvent/?)\?{_URL_QUERY_RE}', + rf'{_URL_BASE_RE}/eventRegistration/EventLobbyServlet/?\?{_URL_QUERY_RE}', + ] _TESTS = [{ 'url': 'https://event.on24.com/eventRegistration/console/EventConsoleApollo.jsp?uimode=nextgeneration&eventid=2197467&sessionid=1&key=5DF57BE53237F36A43B478DD36277A84&contenttype=A&eventuserid=305999&playerwidth=1000&playerheight=650&caller=previewLobby&text_language_id=en&format=fhaudio&newConsole=false', @@ -34,12 +37,16 @@ class On24IE(InfoExtractor): }, { 'url': 
'https://event.on24.com/eventRegistration/console/EventConsoleApollo.jsp?&eventid=2639291&sessionid=1&username=&partnerref=&format=fhvideo1&mobile=&flashsupportedmobiledevice=&helpcenter=&key=82829018E813065A122363877975752E&newConsole=true&nxChe=true&newTabCon=true&text_language_id=en&playerwidth=748&playerheight=526&eventuserid=338788762&contenttype=A&mediametricsessionid=384764716&mediametricid=3558192&usercd=369267058&mode=launch', 'only_matching': True, + }, { + 'url': 'https://event.on24.com/eventRegistration/EventLobbyServlet?target=reg20.jsp&eventid=3543176&key=BC0F6B968B67C34B50D461D40FDB3E18&groupId=3143628', + 'only_matching': True, + }, { + 'url': 'https://event.on24.com/eventRegistration/console/apollox/mainEvent?&eventid=4843671&sessionid=1&username=&partnerref=&format=fhvideo1&mobile=&flashsupportedmobiledevice=&helpcenter=&key=4EAC9B5C564CC98FF29E619B06A2F743&newConsole=true&nxChe=true&newTabCon=true&consoleEarEventConsole=false&consoleEarCloudApi=false&text_language_id=en&playerwidth=748&playerheight=526&referrer=https%3A%2F%2Fevent.on24.com%2Finterface%2Fregistration%2Fautoreg%2Findex.html%3Fsessionid%3D1%26eventid%3D4843671%26key%3D4EAC9B5C564CC98FF29E619B06A2F743%26email%3D000a3e42-7952-4dd6-8f8a-34c38ea3cf02%2540platform%26firstname%3Ds%26lastname%3Ds%26deletecookie%3Dtrue%26event_email%3DN%26marketing_email%3DN%26std1%3D0642572014177%26std2%3D0642572014179%26std3%3D550165f7-a44e-4725-9fe6-716f89908c2b%26std4%3D0&eventuserid=745776448&contenttype=A&mediametricsessionid=640613707&mediametricid=6810717&usercd=745776448&mode=launch', + 'only_matching': True, }] def _real_extract(self, url): - mobj = self._match_valid_url(url) - event_id = mobj.group('id_1') or mobj.group('id_2') - event_key = mobj.group('key_1') or mobj.group('key_2') + event_id, event_key = self._match_valid_url(url).group('id', 'key') event_data = self._download_json( 'https://event.on24.com/apic/utilApp/EventConsoleCachedServlet', From 07f04005e40ebdb368920c511e36e98af0077ed3 
Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Mon, 31 Mar 2025 14:45:48 -0500 Subject: [PATCH 27/44] [ie/youtube] Add `player_js_variant` extractor-arg (#12767) - Always distinguish between different JS variants' code/functions - Change naming scheme for nsig and sigfuncs in disk cache Authored by: bashonly --- README.md | 1 + yt_dlp/extractor/youtube/_video.py | 74 ++++++++++++++++++++++-------- 2 files changed, 56 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 0767221e7a..c6cd147a73 100644 --- a/README.md +++ b/README.md @@ -1782,6 +1782,7 @@ The following extractors use this feature: * `data_sync_id`: Overrides the account Data Sync ID used in Innertube API requests. This may be needed if you are using an account with `youtube:player_skip=webpage,configs` or `youtubetab:skip=webpage` * `visitor_data`: Overrides the Visitor Data used in Innertube API requests. This should be used with `player_skip=webpage,configs` and without cookies. Note: this may have adverse effects if used improperly. If a session from a browser is wanted, you should pass cookies instead (which contain the Visitor ID) * `po_token`: Proof of Origin (PO) Token(s) to use. Comma seperated list of PO Tokens in the format `CLIENT.CONTEXT+PO_TOKEN`, e.g. `youtube:po_token=web.gvs+XXX,web.player=XXX,web_safari.gvs+YYY`. Context can be either `gvs` (Google Video Server URLs) or `player` (Innertube player request) +* `player_js_variant`: The player javascript variant to use for signature and nsig deciphering. The known variants are: `main`, `tce`, `tv`, `tv_es6`, `phone`, `tablet`. Only `main` is recommended as a possible workaround; the others are for debugging purposes. The default is to use what is prescribed by the site, and can be selected with `actual` #### youtubetab (YouTube playlists, channels, feeds, etc.) 
* `skip`: One or more of `webpage` (skip initial webpage download), `authcheck` (allow the download of playlists requiring authentication when no initial webpage is downloaded. This may cause unwanted behavior, see [#1122](https://github.com/yt-dlp/yt-dlp/pull/1122) for more details) diff --git a/yt_dlp/extractor/youtube/_video.py b/yt_dlp/extractor/youtube/_video.py index d86bbaff86..469fbdc293 100644 --- a/yt_dlp/extractor/youtube/_video.py +++ b/yt_dlp/extractor/youtube/_video.py @@ -1761,6 +1761,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }, ] + _PLAYER_JS_VARIANT_MAP = { + 'main': 'player_ias.vflset/en_US/base.js', + 'tce': 'player_ias_tce.vflset/en_US/base.js', + 'tv': 'tv-player-ias.vflset/tv-player-ias.js', + 'tv_es6': 'tv-player-es6.vflset/tv-player-es6.js', + 'phone': 'player-plasma-ias-phone-en_US.vflset/base.js', + 'tablet': 'player-plasma-ias-tablet-en_US.vflset/base.js', + } + _INVERSE_PLAYER_JS_VARIANT_MAP = {value: key for key, value in _PLAYER_JS_VARIANT_MAP.items()} + @classmethod def suitable(cls, url): from yt_dlp.utils import parse_qs @@ -1940,6 +1950,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor): get_all=False, expected_type=str) if not player_url: return + + requested_js_variant = self._configuration_arg('player_js_variant', [''])[0] or 'actual' + if requested_js_variant in self._PLAYER_JS_VARIANT_MAP: + player_id = self._extract_player_info(player_url) + original_url = player_url + player_url = f'/s/player/{player_id}/{self._PLAYER_JS_VARIANT_MAP[requested_js_variant]}' + if original_url != player_url: + self.write_debug( + f'Forcing "{requested_js_variant}" player JS variant for player {player_id}\n' + f' original url = {original_url}', only_once=True) + elif requested_js_variant != 'actual': + self.report_warning( + f'Invalid player JS variant name "{requested_js_variant}" requested. 
' + f'Valid choices are: {", ".join(self._PLAYER_JS_VARIANT_MAP)}', only_once=True) + return urljoin('https://www.youtube.com', player_url) def _download_player_url(self, video_id, fatal=False): @@ -1954,6 +1979,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if player_version: return f'https://www.youtube.com/s/player/{player_version}/player_ias.vflset/en_US/base.js' + def _player_js_cache_key(self, player_url): + player_id = self._extract_player_info(player_url) + player_path = remove_start(urllib.parse.urlparse(player_url).path, f'/s/player/{player_id}/') + variant = self._INVERSE_PLAYER_JS_VARIANT_MAP.get(player_path) + if not variant: + self.write_debug( + f'Unable to determine player JS variant\n' + f' player = {player_url}', only_once=True) + variant = re.sub(r'[^a-zA-Z0-9]', '_', remove_end(player_path, '.js')) + return join_nonempty(player_id, variant) + def _signature_cache_id(self, example_sig): """ Return a string representation of a signature """ return '.'.join(str(len(part)) for part in example_sig.split('.')) @@ -1969,25 +2005,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return id_m.group('id') def _load_player(self, video_id, player_url, fatal=True): - player_id = self._extract_player_info(player_url) - if player_id not in self._code_cache: + player_js_key = self._player_js_cache_key(player_url) + if player_js_key not in self._code_cache: code = self._download_webpage( player_url, video_id, fatal=fatal, - note='Downloading player ' + player_id, - errnote=f'Download of {player_url} failed') + note=f'Downloading player {player_js_key}', + errnote=f'Download of {player_js_key} failed') if code: - self._code_cache[player_id] = code - return self._code_cache.get(player_id) + self._code_cache[player_js_key] = code + return self._code_cache.get(player_js_key) def _extract_signature_function(self, video_id, player_url, example_sig): - player_id = self._extract_player_info(player_url) - # Read from filesystem cache - func_id = 
f'js_{player_id}_{self._signature_cache_id(example_sig)}' + func_id = join_nonempty( + self._player_js_cache_key(player_url), self._signature_cache_id(example_sig)) assert os.path.basename(func_id) == func_id self.write_debug(f'Extracting signature function {func_id}') - cache_spec, code = self.cache.load('youtube-sigfuncs', func_id, min_ver='2025.03.27'), None + cache_spec, code = self.cache.load('youtube-sigfuncs', func_id, min_ver='2025.03.31'), None if not cache_spec: code = self._load_player(video_id, player_url) @@ -2085,22 +2120,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return ret return inner - def _load_nsig_code_from_cache(self, player_id): - cache_id = ('nsig code', player_id) + def _load_nsig_code_from_cache(self, player_url): + cache_id = ('youtube-nsig', self._player_js_cache_key(player_url)) if func_code := self._player_cache.get(cache_id): return func_code - func_code = self.cache.load('youtube-nsig', player_id, min_ver='2025.03.27') + func_code = self.cache.load(*cache_id, min_ver='2025.03.31') if func_code: self._player_cache[cache_id] = func_code return func_code - def _store_nsig_code_to_cache(self, player_id, func_code): - cache_id = ('nsig code', player_id) + def _store_nsig_code_to_cache(self, player_url, func_code): + cache_id = ('youtube-nsig', self._player_js_cache_key(player_url)) if cache_id not in self._player_cache: - self.cache.store('youtube-nsig', player_id, func_code) + self.cache.store(*cache_id, func_code) self._player_cache[cache_id] = func_code def _decrypt_signature(self, s, video_id, player_url): @@ -2144,7 +2179,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self.write_debug(f'Decrypted nsig {s} => {ret}') # Only cache nsig func JS code to disk if successful, and only once - self._store_nsig_code_to_cache(player_id, func_code) + self._store_nsig_code_to_cache(player_url, func_code) return ret def _extract_n_function_name(self, jscode, player_url=None): @@ -2263,7 +2298,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 
def _extract_n_function_code(self, video_id, player_url): player_id = self._extract_player_info(player_url) - func_code = self._load_nsig_code_from_cache(player_id) + func_code = self._load_nsig_code_from_cache(player_url) jscode = func_code or self._load_player(video_id, player_url) jsi = JSInterpreter(jscode) @@ -3226,7 +3261,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if player_url: self.report_warning( f'nsig extraction failed: Some formats may be missing\n' - f' n = {query["n"][0]} ; player = {player_url}', + f' n = {query["n"][0]} ; player = {player_url}\n' + f' {bug_reports_message(before="")}', video_id=video_id, only_once=True) self.write_debug(e, only_once=True) else: From 61046c31612b30c749cbdae934b7fe26abe659d7 Mon Sep 17 00:00:00 2001 From: DmitryScaletta Date: Tue, 1 Apr 2025 00:21:14 +0300 Subject: [PATCH 28/44] [ie/twitch:clips] Extract portrait formats (#12763) Authored by: DmitryScaletta --- yt_dlp/extractor/twitch.py | 171 ++++++++++++++++++++----------------- 1 file changed, 94 insertions(+), 77 deletions(-) diff --git a/yt_dlp/extractor/twitch.py b/yt_dlp/extractor/twitch.py index 44b19ad135..a36de3c017 100644 --- a/yt_dlp/extractor/twitch.py +++ b/yt_dlp/extractor/twitch.py @@ -14,19 +14,20 @@ from ..utils import ( dict_get, float_or_none, int_or_none, + join_nonempty, make_archive_id, parse_duration, parse_iso8601, parse_qs, qualities, str_or_none, - traverse_obj, try_get, unified_timestamp, update_url_query, url_or_none, urljoin, ) +from ..utils.traversal import traverse_obj, value class TwitchBaseIE(InfoExtractor): @@ -42,10 +43,10 @@ class TwitchBaseIE(InfoExtractor): 'CollectionSideBar': '27111f1b382effad0b6def325caef1909c733fe6a4fbabf54f8d491ef2cf2f14', 'FilterableVideoTower_Videos': 'a937f1d22e269e39a03b509f65a7490f9fc247d7f83d6ac1421523e3b68042cb', 'ClipsCards__User': 'b73ad2bfaecfd30a9e6c28fada15bd97032c83ec77a0440766a56fe0bd632777', + 'ShareClipRenderStatus': 'f130048a462a0ac86bb54d653c968c514e9ab9ca94db52368c1179e97b0f16eb', 
'ChannelCollectionsContent': '447aec6a0cc1e8d0a8d7732d47eb0762c336a2294fdb009e9c9d854e49d484b9', 'StreamMetadata': 'a647c2a13599e5991e175155f798ca7f1ecddde73f7f341f39009c14dbf59962', 'ComscoreStreamingQuery': 'e1edae8122517d013405f237ffcc124515dc6ded82480a88daef69c83b53ac01', - 'VideoAccessToken_Clip': '36b89d2507fce29e5ca551df756d27c1cfe079e2609642b4390aa4c35796eb11', 'VideoPreviewOverlay': '3006e77e51b128d838fa4e835723ca4dc9a05c5efd4466c1085215c6e437e65c', 'VideoMetadata': '49b5b8f268cdeb259d75b58dcb0c1a748e3b575003448a2333dc5cdafd49adad', 'VideoPlayer_ChapterSelectButtonVideo': '8d2793384aac3773beab5e59bd5d6f585aedb923d292800119e03d40cd0f9b41', @@ -1083,16 +1084,44 @@ class TwitchClipsIE(TwitchBaseIE): 'url': 'https://clips.twitch.tv/FaintLightGullWholeWheat', 'md5': '761769e1eafce0ffebfb4089cb3847cd', 'info_dict': { - 'id': '42850523', + 'id': '396245304', 'display_id': 'FaintLightGullWholeWheat', 'ext': 'mp4', 'title': 'EA Play 2016 Live from the Novo Theatre', + 'duration': 32, + 'view_count': int, 'thumbnail': r're:^https?://.*\.jpg', 'timestamp': 1465767393, 'upload_date': '20160612', - 'creator': 'EA', - 'uploader': 'stereotype_', - 'uploader_id': '43566419', + 'creators': ['EA'], + 'channel': 'EA', + 'channel_id': '25163635', + 'channel_is_verified': False, + 'channel_follower_count': int, + 'uploader': 'EA', + 'uploader_id': '25163635', + }, + }, { + 'url': 'https://www.twitch.tv/xqc/clip/CulturedAmazingKuduDatSheffy-TiZ_-ixAGYR3y2Uy', + 'md5': 'e90fe616b36e722a8cfa562547c543f0', + 'info_dict': { + 'id': '3207364882', + 'display_id': 'CulturedAmazingKuduDatSheffy-TiZ_-ixAGYR3y2Uy', + 'ext': 'mp4', + 'title': 'A day in the life of xQc', + 'duration': 60, + 'view_count': int, + 'thumbnail': r're:^https?://.*\.jpg', + 'timestamp': 1742869615, + 'upload_date': '20250325', + 'creators': ['xQc'], + 'channel': 'xQc', + 'channel_id': '71092938', + 'channel_is_verified': True, + 'channel_follower_count': int, + 'uploader': 'xQc', + 'uploader_id': '71092938', + 
'categories': ['Just Chatting'], }, }, { # multiple formats @@ -1116,16 +1145,14 @@ class TwitchClipsIE(TwitchBaseIE): }] def _real_extract(self, url): - video_id = self._match_id(url) + slug = self._match_id(url) clip = self._download_gql( - video_id, [{ - 'operationName': 'VideoAccessToken_Clip', - 'variables': { - 'slug': video_id, - }, + slug, [{ + 'operationName': 'ShareClipRenderStatus', + 'variables': {'slug': slug}, }], - 'Downloading clip access token GraphQL')[0]['data']['clip'] + 'Downloading clip GraphQL')[0]['data']['clip'] if not clip: raise ExtractorError( @@ -1135,81 +1162,71 @@ class TwitchClipsIE(TwitchBaseIE): 'sig': clip['playbackAccessToken']['signature'], 'token': clip['playbackAccessToken']['value'], } - - data = self._download_base_gql( - video_id, { - 'query': '''{ - clip(slug: "%s") { - broadcaster { - displayName - } - createdAt - curator { - displayName - id - } - durationSeconds - id - tiny: thumbnailURL(width: 86, height: 45) - small: thumbnailURL(width: 260, height: 147) - medium: thumbnailURL(width: 480, height: 272) - title - videoQualities { - frameRate - quality - sourceURL - } - viewCount - } -}''' % video_id}, 'Downloading clip GraphQL', fatal=False) # noqa: UP031 - - if data: - clip = try_get(data, lambda x: x['data']['clip'], dict) or clip + asset_default = traverse_obj(clip, ('assets', 0, {dict})) or {} + asset_portrait = traverse_obj(clip, ('assets', 1, {dict})) or {} formats = [] - for option in clip.get('videoQualities', []): - if not isinstance(option, dict): - continue - source = url_or_none(option.get('sourceURL')) - if not source: - continue + default_aspect_ratio = float_or_none(asset_default.get('aspectRatio')) + formats.extend(traverse_obj(asset_default, ('videoQualities', lambda _, v: url_or_none(v['sourceURL']), { + 'url': ('sourceURL', {update_url_query(query=access_query)}), + 'format_id': ('quality', {str}), + 'height': ('quality', {int_or_none}), + 'fps': ('frameRate', {float_or_none}), + 'aspect_ratio': 
{value(default_aspect_ratio)}, + }))) + portrait_aspect_ratio = float_or_none(asset_portrait.get('aspectRatio')) + for source in traverse_obj(asset_portrait, ('videoQualities', lambda _, v: url_or_none(v['sourceURL']))): formats.append({ - 'url': update_url_query(source, access_query), - 'format_id': option.get('quality'), - 'height': int_or_none(option.get('quality')), - 'fps': int_or_none(option.get('frameRate')), + 'url': update_url_query(source['sourceURL'], access_query), + 'format_id': join_nonempty('portrait', source.get('quality')), + 'height': int_or_none(source.get('quality')), + 'fps': float_or_none(source.get('frameRate')), + 'aspect_ratio': portrait_aspect_ratio, + 'quality': -2, }) thumbnails = [] - for thumbnail_id in ('tiny', 'small', 'medium'): - thumbnail_url = clip.get(thumbnail_id) - if not thumbnail_url: - continue - thumb = { - 'id': thumbnail_id, - 'url': thumbnail_url, - } - mobj = re.search(r'-(\d+)x(\d+)\.', thumbnail_url) - if mobj: - thumb.update({ - 'height': int(mobj.group(2)), - 'width': int(mobj.group(1)), - }) - thumbnails.append(thumb) + thumb_asset_default_url = url_or_none(asset_default.get('thumbnailURL')) + if thumb_asset_default_url: + thumbnails.append({ + 'id': 'default', + 'url': thumb_asset_default_url, + 'preference': 0, + }) + if thumb_asset_portrait_url := url_or_none(asset_portrait.get('thumbnailURL')): + thumbnails.append({ + 'id': 'portrait', + 'url': thumb_asset_portrait_url, + 'preference': -1, + }) + thumb_default_url = url_or_none(clip.get('thumbnailURL')) + if thumb_default_url and thumb_default_url != thumb_asset_default_url: + thumbnails.append({ + 'id': 'small', + 'url': thumb_default_url, + 'preference': -2, + }) old_id = self._search_regex(r'%7C(\d+)(?:-\d+)?.mp4', formats[-1]['url'], 'old id', default=None) return { - 'id': clip.get('id') or video_id, + 'id': clip.get('id') or slug, '_old_archive_ids': [make_archive_id(self, old_id)] if old_id else None, - 'display_id': video_id, - 'title': 
clip.get('title'), + 'display_id': slug, 'formats': formats, - 'duration': int_or_none(clip.get('durationSeconds')), - 'view_count': int_or_none(clip.get('viewCount')), - 'timestamp': unified_timestamp(clip.get('createdAt')), 'thumbnails': thumbnails, - 'creator': try_get(clip, lambda x: x['broadcaster']['displayName'], str), - 'uploader': try_get(clip, lambda x: x['curator']['displayName'], str), - 'uploader_id': try_get(clip, lambda x: x['curator']['id'], str), + **traverse_obj(clip, { + 'title': ('title', {str}), + 'duration': ('durationSeconds', {int_or_none}), + 'view_count': ('viewCount', {int_or_none}), + 'timestamp': ('createdAt', {parse_iso8601}), + 'creators': ('broadcaster', 'displayName', {str}, filter, all), + 'channel': ('broadcaster', 'displayName', {str}), + 'channel_id': ('broadcaster', 'id', {str}), + 'channel_follower_count': ('broadcaster', 'followers', 'totalCount', {int_or_none}), + 'channel_is_verified': ('broadcaster', 'isPartner', {bool}), + 'uploader': ('broadcaster', 'displayName', {str}), + 'uploader_id': ('broadcaster', 'id', {str}), + 'categories': ('game', 'displayName', {str}, filter, all, filter), + }), } From 5e457af57fae9645b1b8fa0ed689229c8fb9656b Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Mon, 31 Mar 2025 16:38:21 -0500 Subject: [PATCH 29/44] [cleanup] Misc (#12802) Authored by: bashonly --- README.md | 4 ++-- yt_dlp/extractor/youtube/_video.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index c6cd147a73..c0329f5394 100644 --- a/README.md +++ b/README.md @@ -2219,7 +2219,7 @@ Some of yt-dlp's default options are different from that of youtube-dl and youtu * Live chats (if available) are considered as subtitles. Use `--sub-langs all,-live_chat` to download all subtitles except live chat. 
You can also use `--compat-options no-live-chat` to prevent any live chat/danmaku from downloading * YouTube channel URLs download all uploads of the channel. To download only the videos in a specific tab, pass the tab's URL. If the channel does not show the requested tab, an error will be raised. Also, `/live` URLs raise an error if there are no live videos instead of silently downloading the entire channel. You may use `--compat-options no-youtube-channel-redirect` to revert all these redirections * Unavailable videos are also listed for YouTube playlists. Use `--compat-options no-youtube-unavailable-videos` to remove this -* The upload dates extracted from YouTube are in UTC [when available](https://github.com/yt-dlp/yt-dlp/blob/89e4d86171c7b7c997c77d4714542e0383bf0db0/yt_dlp/extractor/youtube.py#L3898-L3900). Use `--compat-options no-youtube-prefer-utc-upload-date` to prefer the non-UTC upload date. +* The upload dates extracted from YouTube are in UTC. * If `ffmpeg` is used as the downloader, the downloading and merging of formats happen in a single step when possible. Use `--compat-options no-direct-merge` to revert this * Thumbnail embedding in `mp4` is done with mutagen if possible. Use `--compat-options embed-thumbnail-atomicparsley` to force the use of AtomicParsley instead * Some internal metadata such as filenames are removed by default from the infojson. 
Use `--no-clean-infojson` or `--compat-options no-clean-infojson` to revert this @@ -2238,7 +2238,7 @@ For ease of use, a few more compat options are available: * `--compat-options all`: Use all compat options (**Do NOT use this!**) * `--compat-options youtube-dl`: Same as `--compat-options all,-multistreams,-playlist-match-filter,-manifest-filesize-approx,-allow-unsafe-ext,-prefer-vp9-sort` * `--compat-options youtube-dlc`: Same as `--compat-options all,-no-live-chat,-no-youtube-channel-redirect,-playlist-match-filter,-manifest-filesize-approx,-allow-unsafe-ext,-prefer-vp9-sort` -* `--compat-options 2021`: Same as `--compat-options 2022,no-certifi,filename-sanitization,no-youtube-prefer-utc-upload-date` +* `--compat-options 2021`: Same as `--compat-options 2022,no-certifi,filename-sanitization` * `--compat-options 2022`: Same as `--compat-options 2023,playlist-match-filter,no-external-downloader-progress,prefer-legacy-http-handler,manifest-filesize-approx` * `--compat-options 2023`: Same as `--compat-options 2024,prefer-vp9-sort` * `--compat-options 2024`: Currently does nothing. Use this to enable all future compat options diff --git a/yt_dlp/extractor/youtube/_video.py b/yt_dlp/extractor/youtube/_video.py index 469fbdc293..074a2a0d8d 100644 --- a/yt_dlp/extractor/youtube/_video.py +++ b/yt_dlp/extractor/youtube/_video.py @@ -1769,7 +1769,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'phone': 'player-plasma-ias-phone-en_US.vflset/base.js', 'tablet': 'player-plasma-ias-tablet-en_US.vflset/base.js', } - _INVERSE_PLAYER_JS_VARIANT_MAP = {value: key for key, value in _PLAYER_JS_VARIANT_MAP.items()} + _INVERSE_PLAYER_JS_VARIANT_MAP = {v: k for k, v in _PLAYER_JS_VARIANT_MAP.items()} @classmethod def suitable(cls, url): @@ -3280,7 +3280,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): is_damaged = try_call(lambda: format_duration < duration // 2) if is_damaged: self.report_warning( - f'{video_id}: Some formats are possibly damaged. 
They will be deprioritized', only_once=True) + 'Some formats are possibly damaged. They will be deprioritized', video_id, only_once=True) po_token = fmt.get(STREAMING_DATA_INITIAL_PO_TOKEN) From 349f36606fa7fb658216334a73ac7825c13503c2 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 31 Mar 2025 21:54:27 +0000 Subject: [PATCH 30/44] Release 2025.03.31 Created by: bashonly :ci skip all --- CONTRIBUTORS | 2 ++ Changelog.md | 21 +++++++++++++++++++++ supportedsites.md | 3 ++- yt_dlp/version.py | 6 +++--- 4 files changed, 28 insertions(+), 4 deletions(-) diff --git a/CONTRIBUTORS b/CONTRIBUTORS index ebcdb416b2..f22625e3f1 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -758,3 +758,5 @@ somini thedenv vallovic arabcoders +mireq +mlabeeb03 diff --git a/Changelog.md b/Changelog.md index 9ceb94ddac..7aec627714 100644 --- a/Changelog.md +++ b/Changelog.md @@ -4,6 +4,27 @@ # To create a release, dispatch the https://github.com/yt-dlp/yt-dlp/actions/workflows/release.yml workflow on master --> +### 2025.03.31 + +#### Core changes +- [Add `--compat-options 2024`](https://github.com/yt-dlp/yt-dlp/commit/22e34adbd741e1c7072015debd615dc3fb71c401) ([#12789](https://github.com/yt-dlp/yt-dlp/issues/12789)) by [seproDev](https://github.com/seproDev) + +#### Extractor changes +- **francaisfacile**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/bb321cfdc3fd4400598ddb12a15862bc2ac8fc10) ([#12787](https://github.com/yt-dlp/yt-dlp/issues/12787)) by [mlabeeb03](https://github.com/mlabeeb03) +- **generic**: [Validate response before checking m3u8 live status](https://github.com/yt-dlp/yt-dlp/commit/9a1ec1d36e172d252714cef712a6d091e0a0c4f2) ([#12784](https://github.com/yt-dlp/yt-dlp/issues/12784)) by [bashonly](https://github.com/bashonly) +- **microsoftlearnepisode**: [Extract more formats](https://github.com/yt-dlp/yt-dlp/commit/d63696f23a341ee36a3237ccb5d5e14b34c2c579) 
([#12799](https://github.com/yt-dlp/yt-dlp/issues/12799)) by [bashonly](https://github.com/bashonly) +- **mlbtv**: [Fix radio-only extraction](https://github.com/yt-dlp/yt-dlp/commit/f033d86b96b36f8c5289dd7c3304f42d4d9f6ff4) ([#12792](https://github.com/yt-dlp/yt-dlp/issues/12792)) by [bashonly](https://github.com/bashonly) +- **on24**: [Support `mainEvent` URLs](https://github.com/yt-dlp/yt-dlp/commit/e465b078ead75472fcb7b86f6ccaf2b5d3bc4c21) ([#12800](https://github.com/yt-dlp/yt-dlp/issues/12800)) by [bashonly](https://github.com/bashonly) +- **sbs**: [Fix subtitles extraction](https://github.com/yt-dlp/yt-dlp/commit/29560359120f28adaaac67c86fa8442eb72daa0d) ([#12785](https://github.com/yt-dlp/yt-dlp/issues/12785)) by [bashonly](https://github.com/bashonly) +- **stvr**: [Rename extractor from RTVS to STVR](https://github.com/yt-dlp/yt-dlp/commit/5fc521cbd0ce7b2410d0935369558838728e205d) ([#12788](https://github.com/yt-dlp/yt-dlp/issues/12788)) by [mireq](https://github.com/mireq) +- **twitch**: clips: [Extract portrait formats](https://github.com/yt-dlp/yt-dlp/commit/61046c31612b30c749cbdae934b7fe26abe659d7) ([#12763](https://github.com/yt-dlp/yt-dlp/issues/12763)) by [DmitryScaletta](https://github.com/DmitryScaletta) +- **youtube** + - [Add `player_js_variant` extractor-arg](https://github.com/yt-dlp/yt-dlp/commit/07f04005e40ebdb368920c511e36e98af0077ed3) ([#12767](https://github.com/yt-dlp/yt-dlp/issues/12767)) by [bashonly](https://github.com/bashonly) + - tab: [Fix playlist continuation extraction](https://github.com/yt-dlp/yt-dlp/commit/6a6d97b2cbc78f818de05cc96edcdcfd52caa259) ([#12777](https://github.com/yt-dlp/yt-dlp/issues/12777)) by [coletdjnz](https://github.com/coletdjnz) + +#### Misc. 
changes +- **cleanup**: Miscellaneous: [5e457af](https://github.com/yt-dlp/yt-dlp/commit/5e457af57fae9645b1b8fa0ed689229c8fb9656b) by [bashonly](https://github.com/bashonly) + ### 2025.03.27 #### Core changes diff --git a/supportedsites.md b/supportedsites.md index 4a0e7519a6..44c6ef5a10 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -472,6 +472,7 @@ The only reliable way to check if a site is supported is to try it. - **FoxNewsVideo** - **FoxSports** - **fptplay**: fptplay.vn + - **FrancaisFacile** - **FranceCulture** - **FranceInter** - **francetv** @@ -1251,7 +1252,6 @@ The only reliable way to check if a site is supported is to try it. - **rtve.es:infantil**: RTVE infantil - **rtve.es:live**: RTVE.es live streams - **rtve.es:television** - - **RTVS** - **rtvslo.si** - **rtvslo.si:show** - **RudoVideo** @@ -1407,6 +1407,7 @@ The only reliable way to check if a site is supported is to try it. - **StretchInternet** - **Stripchat** - **stv:player** + - **stvr**: Slovak Television and Radio (formerly RTVS) - **Subsplash** - **subsplash:playlist** - **Substack** diff --git a/yt_dlp/version.py b/yt_dlp/version.py index 5893a099be..1479cd9607 100644 --- a/yt_dlp/version.py +++ b/yt_dlp/version.py @@ -1,8 +1,8 @@ # Autogenerated by devscripts/update-version.py -__version__ = '2025.03.27' +__version__ = '2025.03.31' -RELEASE_GIT_HEAD = '48be862b32648bff5b3e553e40fca4dcc6e88b28' +RELEASE_GIT_HEAD = '5e457af57fae9645b1b8fa0ed689229c8fb9656b' VARIANT = None @@ -12,4 +12,4 @@ CHANNEL = 'stable' ORIGIN = 'yt-dlp/yt-dlp' -_pkg_version = '2025.03.27' +_pkg_version = '2025.03.31' From 5361a7c6e2933c919716e0cb1e3116c28c40419f Mon Sep 17 00:00:00 2001 From: sepro Date: Thu, 3 Apr 2025 19:55:36 +0200 Subject: [PATCH 31/44] [ie/vk] Fix chapters extraction (#12821) Fix 05c8023a27dd37c49163c0498bf98e3e3c1cb4b9 Authored by: seproDev --- yt_dlp/extractor/vk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py 
index faf3e60b0b..ef67c210bc 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -544,7 +544,7 @@ class VKIE(VKBaseIE): 'uploader_id': (('author_id', 'authorId'), {str_or_none}, any), 'duration': ('duration', {int_or_none}), 'chapters': ('time_codes', lambda _, v: isinstance(v['time'], int), { - 'title': ('text', {str}), + 'title': ('text', {unescapeHTML}), 'start_time': 'time', }), }), From e1847535e28788414a25546a45bebcada2f34558 Mon Sep 17 00:00:00 2001 From: CasperMcFadden95 <145611964+CasperMcFadden95@users.noreply.github.com> Date: Thu, 3 Apr 2025 19:02:24 +0000 Subject: [PATCH 32/44] [ie/RoyaLive] Add extractor (#12817) Authored by: CasperMcFadden95 --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/roya.py | 43 +++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) create mode 100644 yt_dlp/extractor/roya.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 679a0a6a61..9fc8913654 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1739,6 +1739,7 @@ from .roosterteeth import ( RoosterTeethSeriesIE, ) from .rottentomatoes import RottenTomatoesIE +from .roya import RoyaLiveIE from .rozhlas import ( MujRozhlasIE, RozhlasIE, diff --git a/yt_dlp/extractor/roya.py b/yt_dlp/extractor/roya.py new file mode 100644 index 0000000000..e9fe304eeb --- /dev/null +++ b/yt_dlp/extractor/roya.py @@ -0,0 +1,43 @@ +from .common import InfoExtractor +from ..utils.traversal import traverse_obj + + +class RoyaLiveIE(InfoExtractor): + _VALID_URL = r'https?://roya\.tv/live-stream/(?P\d+)' + _TESTS = [{ + 'url': 'https://roya.tv/live-stream/1', + 'info_dict': { + 'id': '1', + 'title': r're:Roya TV \d{4}-\d{2}-\d{2} \d{2}:\d{2}', + 'ext': 'mp4', + 'live_status': 'is_live', + }, + }, { + 'url': 'https://roya.tv/live-stream/21', + 'info_dict': { + 'id': '21', + 'title': r're:Roya News \d{4}-\d{2}-\d{2} \d{2}:\d{2}', + 'ext': 'mp4', + 'live_status': 'is_live', + }, + }, { + 'url': 
'https://roya.tv/live-stream/10000', + 'only_matching': True, + }] + + def _real_extract(self, url): + media_id = self._match_id(url) + + stream_url = self._download_json( + f'https://ticket.roya-tv.com/api/v5/fastchannel/{media_id}', media_id)['data']['secured_url'] + + title = traverse_obj( + self._download_json('https://backend.roya.tv/api/v01/channels/schedule-pagination', media_id, fatal=False), + ('data', 0, 'channel', lambda _, v: str(v['id']) == media_id, 'title', {str}, any)) + + return { + 'id': media_id, + 'formats': self._extract_m3u8_formats(stream_url, media_id, 'mp4', m3u8_id='hls', live=True), + 'title': title, + 'is_live': True, + } From 4ebf41309d04a6e196944f1c0f5f0154cff0055a Mon Sep 17 00:00:00 2001 From: sepro Date: Sat, 5 Apr 2025 19:49:51 +0200 Subject: [PATCH 33/44] [ie/CrowdBunker] Make format extraction non-fatal (#12836) Authored by: seproDev --- yt_dlp/extractor/crowdbunker.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/yt_dlp/extractor/crowdbunker.py b/yt_dlp/extractor/crowdbunker.py index bf814570fe..ca0323431c 100644 --- a/yt_dlp/extractor/crowdbunker.py +++ b/yt_dlp/extractor/crowdbunker.py @@ -5,7 +5,9 @@ from ..utils import ( int_or_none, try_get, unified_strdate, + url_or_none, ) +from ..utils.traversal import traverse_obj class CrowdBunkerIE(InfoExtractor): @@ -44,16 +46,15 @@ class CrowdBunkerIE(InfoExtractor): 'url': sub_url, }) - mpd_url = try_get(video_json, lambda x: x['dashManifest']['url']) - if mpd_url: - fmts, subs = self._extract_mpd_formats_and_subtitles(mpd_url, video_id) + if mpd_url := traverse_obj(video_json, ('dashManifest', 'url', {url_or_none})): + fmts, subs = self._extract_mpd_formats_and_subtitles(mpd_url, video_id, mpd_id='dash', fatal=False) formats.extend(fmts) - subtitles = self._merge_subtitles(subtitles, subs) - m3u8_url = try_get(video_json, lambda x: x['hlsManifest']['url']) - if m3u8_url: - fmts, subs = self._extract_m3u8_formats_and_subtitles(mpd_url, 
video_id) + self._merge_subtitles(subs, target=subtitles) + + if m3u8_url := traverse_obj(video_json, ('hlsManifest', 'url', {url_or_none})): + fmts, subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, m3u8_id='hls', fatal=False) formats.extend(fmts) - subtitles = self._merge_subtitles(subtitles, subs) + self._merge_subtitles(subs, target=subtitles) thumbnails = [{ 'url': image['url'], From 58d0c83457b93b3c9a81eb6bc5a4c65f25e949df Mon Sep 17 00:00:00 2001 From: sepro Date: Sat, 5 Apr 2025 20:29:57 +0200 Subject: [PATCH 34/44] [ie/rumble] Improve format extraction (#12838) Closes #12837 Authored by: seproDev --- yt_dlp/extractor/rumble.py | 61 ++++++++++++++++++++++++-------------- 1 file changed, 39 insertions(+), 22 deletions(-) diff --git a/yt_dlp/extractor/rumble.py b/yt_dlp/extractor/rumble.py index 74c7e4f176..757d6994ca 100644 --- a/yt_dlp/extractor/rumble.py +++ b/yt_dlp/extractor/rumble.py @@ -7,7 +7,6 @@ from ..utils import ( ExtractorError, UnsupportedError, clean_html, - determine_ext, extract_attributes, format_field, get_element_by_class, @@ -36,7 +35,7 @@ class RumbleEmbedIE(InfoExtractor): 'upload_date': '20191020', 'channel_url': 'https://rumble.com/c/WMAR', 'channel': 'WMAR', - 'thumbnail': 'https://sp.rmbl.ws/s8/1/5/M/z/1/5Mz1a.qR4e-small-WMAR-2-News-Latest-Headline.jpg', + 'thumbnail': r're:https://.+\.jpg', 'duration': 234, 'uploader': 'WMAR', 'live_status': 'not_live', @@ -52,7 +51,7 @@ class RumbleEmbedIE(InfoExtractor): 'upload_date': '20220217', 'channel_url': 'https://rumble.com/c/CyberTechNews', 'channel': 'CTNews', - 'thumbnail': 'https://sp.rmbl.ws/s8/6/7/i/9/h/7i9hd.OvCc.jpg', + 'thumbnail': r're:https://.+\.jpg', 'duration': 901, 'uploader': 'CTNews', 'live_status': 'not_live', @@ -114,6 +113,22 @@ class RumbleEmbedIE(InfoExtractor): 'live_status': 'was_live', }, 'params': {'skip_download': True}, + }, { + 'url': 'https://rumble.com/embed/v6pezdb', + 'info_dict': { + 'id': 'v6pezdb', + 'ext': 'mp4', + 'title': '"Es war 
einmal ein Mädchen" – Ein filmisches Zeitzeugnis aus Leningrad 1944', + 'uploader': 'RT DE', + 'channel': 'RT DE', + 'channel_url': 'https://rumble.com/c/RTDE', + 'duration': 309, + 'thumbnail': 'https://1a-1791.com/video/fww1/dc/s8/1/n/z/2/y/nz2yy.qR4e-small-Es-war-einmal-ein-Mdchen-Ei.jpg', + 'timestamp': 1743703500, + 'upload_date': '20250403', + 'live_status': 'not_live', + }, + 'params': {'skip_download': True}, }, { 'url': 'https://rumble.com/embed/ufe9n.v5pv5f', 'only_matching': True, @@ -168,40 +183,42 @@ class RumbleEmbedIE(InfoExtractor): live_status = None formats = [] - for ext, ext_info in (video.get('ua') or {}).items(): - if isinstance(ext_info, dict): - for height, video_info in ext_info.items(): + for format_type, format_info in (video.get('ua') or {}).items(): + if isinstance(format_info, dict): + for height, video_info in format_info.items(): if not traverse_obj(video_info, ('meta', 'h', {int_or_none})): video_info.setdefault('meta', {})['h'] = height - ext_info = ext_info.values() + format_info = format_info.values() - for video_info in ext_info: + for video_info in format_info: meta = video_info.get('meta') or {} if not video_info.get('url'): continue - if ext == 'hls': + # With default query params returns m3u8 variants which are duplicates, without returns tar files + if format_type == 'tar': + continue + if format_type == 'hls': if meta.get('live') is True and video.get('live') == 1: live_status = 'post_live' formats.extend(self._extract_m3u8_formats( video_info['url'], video_id, ext='mp4', m3u8_id='hls', fatal=False, live=live_status == 'is_live')) continue - timeline = ext == 'timeline' - if timeline: - ext = determine_ext(video_info['url']) + is_timeline = format_type == 'timeline' + is_audio = format_type == 'audio' formats.append({ - 'ext': ext, - 'acodec': 'none' if timeline else None, + 'acodec': 'none' if is_timeline else None, + 'vcodec': 'none' if is_audio else None, 'url': video_info['url'], - 'format_id': join_nonempty(ext, 
format_field(meta, 'h', '%sp')), - 'format_note': 'Timeline' if timeline else None, - 'fps': None if timeline else video.get('fps'), + 'format_id': join_nonempty(format_type, format_field(meta, 'h', '%sp')), + 'format_note': 'Timeline' if is_timeline else None, + 'fps': None if is_timeline or is_audio else video.get('fps'), **traverse_obj(meta, { - 'tbr': 'bitrate', - 'filesize': 'size', - 'width': 'w', - 'height': 'h', - }, expected_type=lambda x: int(x) or None), + 'tbr': ('bitrate', {int_or_none}), + 'filesize': ('size', {int_or_none}), + 'width': ('w', {int_or_none}), + 'height': ('h', {int_or_none}), + }), }) subtitles = { From 425017531fbc3369becb5a44013e26f26efabf45 Mon Sep 17 00:00:00 2001 From: Ben Faerber Date: Sat, 5 Apr 2025 14:09:53 -0600 Subject: [PATCH 35/44] [ie/parti] Add extractors (#12769) Closes #11434 Authored by: benfaerber --- yt_dlp/extractor/_extractors.py | 4 ++ yt_dlp/extractor/parti.py | 101 ++++++++++++++++++++++++++++++++ 2 files changed, 105 insertions(+) create mode 100644 yt_dlp/extractor/parti.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 9fc8913654..a206a38e42 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1493,6 +1493,10 @@ from .paramountplus import ( ) from .parler import ParlerIE from .parlview import ParlviewIE +from .parti import ( + PartiLivestreamIE, + PartiVideoIE, +) from .patreon import ( PatreonCampaignIE, PatreonIE, diff --git a/yt_dlp/extractor/parti.py b/yt_dlp/extractor/parti.py new file mode 100644 index 0000000000..acadefc4e4 --- /dev/null +++ b/yt_dlp/extractor/parti.py @@ -0,0 +1,101 @@ +from .common import InfoExtractor +from ..utils import UserNotLive, int_or_none, parse_iso8601, url_or_none, urljoin +from ..utils.traversal import traverse_obj + + +class PartiBaseIE(InfoExtractor): + def _call_api(self, path, video_id, note=None): + return self._download_json( + f'https://api-backend.parti.com/parti_v2/profile/{path}', video_id, 
note) + + +class PartiVideoIE(PartiBaseIE): + IE_NAME = 'parti:video' + _VALID_URL = r'https?://(?:www\.)?parti\.com/video/(?P\d+)' + _TESTS = [{ + 'url': 'https://parti.com/video/66284', + 'info_dict': { + 'id': '66284', + 'ext': 'mp4', + 'title': 'NOW LIVE ', + 'upload_date': '20250327', + 'categories': ['Gaming'], + 'thumbnail': 'https://assets.parti.com/351424_eb9e5250-2821-484a-9c5f-ca99aa666c87.png', + 'channel': 'ItZTMGG', + 'timestamp': 1743044379, + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + data = self._call_api(f'get_livestream_channel_info/recent/{video_id}', video_id) + + return { + 'id': video_id, + 'formats': self._extract_m3u8_formats( + urljoin('https://watch.parti.com', data['livestream_recording']), video_id, 'mp4'), + **traverse_obj(data, { + 'title': ('event_title', {str}), + 'channel': ('user_name', {str}), + 'thumbnail': ('event_file', {url_or_none}), + 'categories': ('category_name', {str}, filter, all), + 'timestamp': ('event_start_ts', {int_or_none}), + }), + } + + +class PartiLivestreamIE(PartiBaseIE): + IE_NAME = 'parti:livestream' + _VALID_URL = r'https?://(?:www\.)?parti\.com/creator/(?P[\w]+)/(?P[\w/-]+)' + _TESTS = [{ + 'url': 'https://parti.com/creator/parti/Capt_Robs_Adventures', + 'info_dict': { + 'id': 'Capt_Robs_Adventures', + 'ext': 'mp4', + 'title': r"re:I'm Live on Parti \d{4}-\d{2}-\d{2} \d{2}:\d{2}", + 'view_count': int, + 'thumbnail': r're:https://assets\.parti\.com/.+\.png', + 'timestamp': 1743879776, + 'upload_date': '20250405', + 'live_status': 'is_live', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://parti.com/creator/discord/sazboxgaming/0', + 'only_matching': True, + }] + + def _real_extract(self, url): + service, creator_slug = self._match_valid_url(url).group('service', 'id') + + encoded_creator_slug = creator_slug.replace('/', '%23') + creator_id = self._call_api( + 
f'get_user_by_social_media/{service}/{encoded_creator_slug}', + creator_slug, note='Fetching user ID') + + data = self._call_api( + f'get_livestream_channel_info/{creator_id}', creator_id, + note='Fetching user profile feed')['channel_info'] + + if not traverse_obj(data, ('channel', 'is_live', {bool})): + raise UserNotLive(video_id=creator_id) + + channel_info = data['channel'] + + return { + 'id': creator_slug, + 'formats': self._extract_m3u8_formats( + channel_info['playback_url'], creator_slug, live=True, query={ + 'token': channel_info['playback_auth_token'], + 'player_version': '1.17.0', + }), + 'is_live': True, + **traverse_obj(data, { + 'title': ('livestream_event_info', 'event_name', {str}), + 'description': ('livestream_event_info', 'event_description', {str}), + 'thumbnail': ('livestream_event_info', 'livestream_preview_file', {url_or_none}), + 'timestamp': ('stream', 'start_time', {parse_iso8601}), + 'view_count': ('stream', 'viewer_count', {int_or_none}), + }), + } From 91832111a12d87499294a0f430829b8c2254c339 Mon Sep 17 00:00:00 2001 From: LN Liberda Date: Sun, 6 Apr 2025 17:05:43 +0200 Subject: [PATCH 36/44] [ie/TokFMPodcast] Fix formats extraction (#12842) Authored by: selfisekai --- yt_dlp/extractor/agora.py | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/yt_dlp/extractor/agora.py b/yt_dlp/extractor/agora.py index 9835584254..e040db6010 100644 --- a/yt_dlp/extractor/agora.py +++ b/yt_dlp/extractor/agora.py @@ -146,7 +146,7 @@ class TokFMPodcastIE(InfoExtractor): 'url': 'https://audycje.tokfm.pl/podcast/91275,-Systemowy-rasizm-Czy-zamieszki-w-USA-po-morderstwie-w-Minneapolis-doprowadza-do-zmian-w-sluzbach-panstwowych', 'info_dict': { 'id': '91275', - 'ext': 'aac', + 'ext': 'mp3', 'title': 'md5:a9b15488009065556900169fb8061cce', 'episode': 'md5:a9b15488009065556900169fb8061cce', 'series': 'Analizy', @@ -164,23 +164,20 @@ class TokFMPodcastIE(InfoExtractor): raise ExtractorError('No such podcast', 
expected=True) metadata = metadata[0] - formats = [] - for ext in ('aac', 'mp3'): - url_data = self._download_json( - f'https://api.podcast.radioagora.pl/api4/getSongUrl?podcast_id={media_id}&device_id={uuid.uuid4()}&ppre=false&audio={ext}', - media_id, f'Downloading podcast {ext} URL') - # prevents inserting the mp3 (default) multiple times - if 'link_ssl' in url_data and f'.{ext}' in url_data['link_ssl']: - formats.append({ - 'url': url_data['link_ssl'], - 'ext': ext, - 'vcodec': 'none', - 'acodec': ext, - }) + mp3_url = self._download_json( + 'https://api.podcast.radioagora.pl/api4/getSongUrl', + media_id, 'Downloading podcast mp3 URL', query={ + 'podcast_id': media_id, + 'device_id': str(uuid.uuid4()), + 'ppre': 'false', + 'audio': 'mp3', + })['link_ssl'] return { 'id': media_id, - 'formats': formats, + 'url': mp3_url, + 'vcodec': 'none', + 'ext': 'mp3', 'title': metadata.get('podcast_name'), 'series': metadata.get('series_name'), 'episode': metadata.get('podcast_name'), From a3f2b54c2535d862de6efa9cfaa6ca9a2b2f7dd6 Mon Sep 17 00:00:00 2001 From: sepro Date: Sun, 6 Apr 2025 17:41:48 +0200 Subject: [PATCH 37/44] [ie/dzen.ru] Rework extractors (#12852) Closes #5523, Closes #10818, Closes #11385, Closes #11470 Authored by: seproDev --- yt_dlp/extractor/yandexvideo.py | 149 ++++++++++++++++++-------------- 1 file changed, 86 insertions(+), 63 deletions(-) diff --git a/yt_dlp/extractor/yandexvideo.py b/yt_dlp/extractor/yandexvideo.py index cdd32c5e4e..09e7c98785 100644 --- a/yt_dlp/extractor/yandexvideo.py +++ b/yt_dlp/extractor/yandexvideo.py @@ -2,15 +2,17 @@ import itertools from .common import InfoExtractor from ..utils import ( + bug_reports_message, determine_ext, - extract_attributes, int_or_none, lowercase_escape, parse_qs, - traverse_obj, + qualities, try_get, + update_url_query, url_or_none, ) +from ..utils.traversal import traverse_obj class YandexVideoIE(InfoExtractor): @@ -186,7 +188,22 @@ class YandexVideoPreviewIE(InfoExtractor): return 
self.url_result(data_json['video']['url']) -class ZenYandexIE(InfoExtractor): +class ZenYandexBaseIE(InfoExtractor): + def _fetch_ssr_data(self, url, video_id): + webpage = self._download_webpage(url, video_id) + redirect = self._search_json( + r'(?:var|let|const)\s+it\s*=', webpage, 'redirect', video_id, default={}).get('retpath') + if redirect: + video_id = self._match_id(redirect) + webpage = self._download_webpage(redirect, video_id, note='Redirecting') + return video_id, self._search_json( + r'(?:var|let|const)\s+_params\s*=\s*\(', webpage, 'metadata', video_id, + contains_pattern=r'{["\']ssrData.+}')['ssrData'] + + +class ZenYandexIE(ZenYandexBaseIE): + IE_NAME = 'dzen.ru' + IE_DESC = 'Дзен (dzen) formerly Яндекс.Дзен (Yandex Zen)' _VALID_URL = r'https?://(zen\.yandex|dzen)\.ru(?:/video)?/(media|watch)/(?:(?:id/[^/]+/|[^/]+/)(?:[a-z0-9-]+)-)?(?P[a-z0-9-]+)' _TESTS = [{ 'url': 'https://zen.yandex.ru/media/id/606fd806cc13cb3c58c05cf5/vot-eto-focus-dedy-morozy-na-gidrociklah-60c7c443da18892ebfe85ed7', @@ -216,6 +233,7 @@ class ZenYandexIE(InfoExtractor): 'timestamp': 1573465585, }, 'params': {'skip_download': 'm3u8'}, + 'skip': 'The page does not exist', }, { 'url': 'https://zen.yandex.ru/video/watch/6002240ff8b1af50bb2da5e3', 'info_dict': { @@ -227,6 +245,9 @@ class ZenYandexIE(InfoExtractor): 'uploader': 'TechInsider', 'timestamp': 1611378221, 'upload_date': '20210123', + 'view_count': int, + 'duration': 243, + 'tags': ['опыт', 'эксперимент', 'огонь'], }, 'params': {'skip_download': 'm3u8'}, }, { @@ -240,6 +261,9 @@ class ZenYandexIE(InfoExtractor): 'uploader': 'TechInsider', 'upload_date': '20210123', 'timestamp': 1611378221, + 'view_count': int, + 'duration': 243, + 'tags': ['опыт', 'эксперимент', 'огонь'], }, 'params': {'skip_download': 'm3u8'}, }, { @@ -252,44 +276,56 @@ class ZenYandexIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - redirect = self._search_json(r'var 
it\s*=', webpage, 'redirect', id, default={}).get('retpath') - if redirect: - video_id = self._match_id(redirect) - webpage = self._download_webpage(redirect, video_id, note='Redirecting') - data_json = self._search_json( - r'("data"\s*:|data\s*=)', webpage, 'metadata', video_id, contains_pattern=r'{["\']_*serverState_*video.+}') - serverstate = self._search_regex(r'(_+serverState_+video-site_[^_]+_+)', webpage, 'server state') - uploader = self._search_regex(r'(]+>)', - webpage, 'uploader', default='') - uploader_name = extract_attributes(uploader).get('aria-label') - item_id = traverse_obj(data_json, (serverstate, 'videoViewer', 'openedItemId', {str})) - video_json = traverse_obj(data_json, (serverstate, 'videoViewer', 'items', item_id, {dict})) or {} + video_id, ssr_data = self._fetch_ssr_data(url, video_id) + video_data = ssr_data['videoMetaResponse'] formats, subtitles = [], {} - for s_url in traverse_obj(video_json, ('video', 'streams', ..., {url_or_none})): + quality = qualities(('4', '0', '1', '2', '3', '5', '6', '7')) + # Deduplicate stream URLs. 
The "dzen_dash" query parameter is present in some URLs but can be omitted + stream_urls = set(traverse_obj(video_data, ( + 'video', ('id', ('streams', ...), ('mp4Streams', ..., 'url'), ('oneVideoStreams', ..., 'url')), + {url_or_none}, {update_url_query(query={'dzen_dash': []})}))) + for s_url in stream_urls: ext = determine_ext(s_url) - if ext == 'mpd': - fmts, subs = self._extract_mpd_formats_and_subtitles(s_url, video_id, mpd_id='dash') - elif ext == 'm3u8': - fmts, subs = self._extract_m3u8_formats_and_subtitles(s_url, video_id, 'mp4') + content_type = traverse_obj(parse_qs(s_url), ('ct', 0)) + if ext == 'mpd' or content_type == '6': + fmts, subs = self._extract_mpd_formats_and_subtitles(s_url, video_id, mpd_id='dash', fatal=False) + elif ext == 'm3u8' or content_type == '8': + fmts, subs = self._extract_m3u8_formats_and_subtitles(s_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + elif content_type == '0': + format_type = traverse_obj(parse_qs(s_url), ('type', 0)) + formats.append({ + 'url': s_url, + 'format_id': format_type, + 'ext': 'mp4', + 'quality': quality(format_type), + }) + continue + else: + self.report_warning(f'Unsupported stream URL: {s_url}{bug_reports_message()}') + continue formats.extend(fmts) - subtitles = self._merge_subtitles(subtitles, subs) + self._merge_subtitles(subs, target=subtitles) + return { 'id': video_id, - 'title': video_json.get('title') or self._og_search_title(webpage), 'formats': formats, 'subtitles': subtitles, - 'duration': int_or_none(video_json.get('duration')), - 'view_count': int_or_none(video_json.get('views')), - 'timestamp': int_or_none(video_json.get('publicationDate')), - 'uploader': uploader_name or data_json.get('authorName') or try_get(data_json, lambda x: x['publisher']['name']), - 'description': video_json.get('description') or self._og_search_description(webpage), - 'thumbnail': self._og_search_thumbnail(webpage) or try_get(data_json, lambda x: x['og']['imageUrl']), + **traverse_obj(video_data, { + 
'title': ('title', {str}), + 'description': ('description', {str}), + 'thumbnail': ('image', {url_or_none}), + 'duration': ('video', 'duration', {int_or_none}), + 'view_count': ('video', 'views', {int_or_none}), + 'timestamp': ('publicationDate', {int_or_none}), + 'tags': ('tags', ..., {str}), + 'uploader': ('source', 'title', {str}), + }), } -class ZenYandexChannelIE(InfoExtractor): +class ZenYandexChannelIE(ZenYandexBaseIE): + IE_NAME = 'dzen.ru:channel' _VALID_URL = r'https?://(zen\.yandex|dzen)\.ru/(?!media|video)(?:id/)?(?P[a-z0-9-_]+)' _TESTS = [{ 'url': 'https://zen.yandex.ru/tok_media', @@ -323,8 +359,8 @@ class ZenYandexChannelIE(InfoExtractor): 'url': 'https://zen.yandex.ru/jony_me', 'info_dict': { 'id': 'jony_me', - 'description': 'md5:ce0a5cad2752ab58701b5497835b2cc5', - 'title': 'JONY ', + 'description': 'md5:7c30d11dc005faba8826feae99da3113', + 'title': 'JONY', }, 'playlist_count': 18, }, { @@ -333,9 +369,8 @@ class ZenYandexChannelIE(InfoExtractor): 'url': 'https://zen.yandex.ru/tatyanareva', 'info_dict': { 'id': 'tatyanareva', - 'description': 'md5:40a1e51f174369ec3ba9d657734ac31f', + 'description': 'md5:92e56fa730a932ca2483ba5c2186ad96', 'title': 'Татьяна Рева', - 'entries': 'maxcount:200', }, 'playlist_mincount': 46, }, { @@ -348,43 +383,31 @@ class ZenYandexChannelIE(InfoExtractor): 'playlist_mincount': 657, }] - def _entries(self, item_id, server_state_json, server_settings_json): - items = (traverse_obj(server_state_json, ('feed', 'items', ...)) - or traverse_obj(server_settings_json, ('exportData', 'items', ...))) - - more = (traverse_obj(server_state_json, ('links', 'more')) - or traverse_obj(server_settings_json, ('exportData', 'more', 'link'))) - + def _entries(self, feed_data, channel_id): next_page_id = None for page in itertools.count(1): - for item in items or []: - if item.get('type') != 'gif': - continue - video_id = traverse_obj(item, 'publication_id', 'publicationId') or '' - yield self.url_result(item['link'], ZenYandexIE, 
video_id.split(':')[-1]) + for item in traverse_obj(feed_data, ( + (None, ('items', lambda _, v: v['tab'] in ('shorts', 'longs'))), + 'items', lambda _, v: url_or_none(v['link']), + )): + yield self.url_result(item['link'], ZenYandexIE, item.get('id'), title=item.get('title')) + more = traverse_obj(feed_data, ('more', 'link', {url_or_none})) current_page_id = next_page_id next_page_id = traverse_obj(parse_qs(more), ('next_page_id', -1)) - if not all((more, items, next_page_id, next_page_id != current_page_id)): + if not all((more, next_page_id, next_page_id != current_page_id)): break - data = self._download_json(more, item_id, note=f'Downloading Page {page}') - items, more = data.get('items'), traverse_obj(data, ('more', 'link')) + feed_data = self._download_json(more, channel_id, note=f'Downloading Page {page}') def _real_extract(self, url): - item_id = self._match_id(url) - webpage = self._download_webpage(url, item_id) - redirect = self._search_json( - r'var it\s*=', webpage, 'redirect', item_id, default={}).get('retpath') - if redirect: - item_id = self._match_id(redirect) - webpage = self._download_webpage(redirect, item_id, note='Redirecting') - data = self._search_json( - r'("data"\s*:|data\s*=)', webpage, 'channel data', item_id, contains_pattern=r'{\"__serverState__.+}') - server_state_json = traverse_obj(data, lambda k, _: k.startswith('__serverState__'), get_all=False) - server_settings_json = traverse_obj(data, lambda k, _: k.startswith('__serverSettings__'), get_all=False) + channel_id = self._match_id(url) + channel_id, ssr_data = self._fetch_ssr_data(url, channel_id) + channel_data = ssr_data['exportResponse'] return self.playlist_result( - self._entries(item_id, server_state_json, server_settings_json), - item_id, traverse_obj(server_state_json, ('channel', 'source', 'title')), - traverse_obj(server_state_json, ('channel', 'source', 'description'))) + self._entries(channel_data['feedData'], channel_id), + channel_id, **traverse_obj(channel_data, 
('channel', 'source', { + 'title': ('title', {str}), + 'description': ('description', {str}), + }))) From db6d1f145ad583e0220637726029f8f2fa6200a0 Mon Sep 17 00:00:00 2001 From: WouterGordts Date: Sun, 6 Apr 2025 19:51:08 +0200 Subject: [PATCH 38/44] [ie/mixcloud] Refactor extractor (#12830) Authored by: WouterGordts, seproDev Co-authored-by: sepro --- yt_dlp/extractor/mixcloud.py | 73 ++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 40 deletions(-) diff --git a/yt_dlp/extractor/mixcloud.py b/yt_dlp/extractor/mixcloud.py index 19b7fd4e70..852670fcba 100644 --- a/yt_dlp/extractor/mixcloud.py +++ b/yt_dlp/extractor/mixcloud.py @@ -10,7 +10,9 @@ from ..utils import ( parse_iso8601, strip_or_none, try_get, + url_or_none, ) +from ..utils.traversal import traverse_obj class MixcloudBaseIE(InfoExtractor): @@ -37,7 +39,7 @@ class MixcloudIE(MixcloudBaseIE): 'ext': 'm4a', 'title': 'Cryptkeeper', 'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.', - 'uploader': 'Daniel Holbach', + 'uploader': 'dholbach', 'uploader_id': 'dholbach', 'thumbnail': r're:https?://.*\.jpg', 'view_count': int, @@ -46,10 +48,11 @@ class MixcloudIE(MixcloudBaseIE): 'uploader_url': 'https://www.mixcloud.com/dholbach/', 'artist': 'Submorphics & Chino , Telekinesis, Porter Robinson, Enei, Breakage ft Jess Mills', 'duration': 3723, - 'tags': [], + 'tags': ['liquid drum and bass', 'drum and bass'], 'comment_count': int, 'repost_count': int, 'like_count': int, + 'artists': list, }, 'params': {'skip_download': 'm3u8'}, }, { @@ -67,7 +70,7 @@ class MixcloudIE(MixcloudBaseIE): 'upload_date': '20150203', 'uploader_url': 'https://www.mixcloud.com/gillespeterson/', 'duration': 2992, - 'tags': [], + 'tags': ['jazz', 'soul', 'world music', 'funk'], 'comment_count': int, 'repost_count': int, 'like_count': int, @@ -149,8 +152,6 @@ class MixcloudIE(MixcloudBaseIE): elif reason: raise ExtractorError('Track is 
restricted', expected=True) - title = cloudcast['name'] - stream_info = cloudcast['streamInfo'] formats = [] @@ -182,47 +183,39 @@ class MixcloudIE(MixcloudBaseIE): self.raise_login_required(metadata_available=True) comments = [] - for edge in (try_get(cloudcast, lambda x: x['comments']['edges']) or []): - node = edge.get('node') or {} + for node in traverse_obj(cloudcast, ('comments', 'edges', ..., 'node', {dict})): text = strip_or_none(node.get('comment')) if not text: continue - user = node.get('user') or {} comments.append({ - 'author': user.get('displayName'), - 'author_id': user.get('username'), 'text': text, - 'timestamp': parse_iso8601(node.get('created')), + **traverse_obj(node, { + 'author': ('user', 'displayName', {str}), + 'author_id': ('user', 'username', {str}), + 'timestamp': ('created', {parse_iso8601}), + }), }) - tags = [] - for t in cloudcast.get('tags'): - tag = try_get(t, lambda x: x['tag']['name'], str) - if not tag: - tags.append(tag) - - get_count = lambda x: int_or_none(try_get(cloudcast, lambda y: y[x]['totalCount'])) - - owner = cloudcast.get('owner') or {} - return { 'id': track_id, - 'title': title, 'formats': formats, - 'description': cloudcast.get('description'), - 'thumbnail': try_get(cloudcast, lambda x: x['picture']['url'], str), - 'uploader': owner.get('displayName'), - 'timestamp': parse_iso8601(cloudcast.get('publishDate')), - 'uploader_id': owner.get('username'), - 'uploader_url': owner.get('url'), - 'duration': int_or_none(cloudcast.get('audioLength')), - 'view_count': int_or_none(cloudcast.get('plays')), - 'like_count': get_count('favorites'), - 'repost_count': get_count('reposts'), - 'comment_count': get_count('comments'), 'comments': comments, - 'tags': tags, - 'artist': ', '.join(cloudcast.get('featuringArtistList') or []) or None, + **traverse_obj(cloudcast, { + 'title': ('name', {str}), + 'description': ('description', {str}), + 'thumbnail': ('picture', 'url', {url_or_none}), + 'timestamp': ('publishDate', 
{parse_iso8601}), + 'duration': ('audioLength', {int_or_none}), + 'uploader': ('owner', 'displayName', {str}), + 'uploader_id': ('owner', 'username', {str}), + 'uploader_url': ('owner', 'url', {url_or_none}), + 'view_count': ('plays', {int_or_none}), + 'like_count': ('favorites', 'totalCount', {int_or_none}), + 'repost_count': ('reposts', 'totalCount', {int_or_none}), + 'comment_count': ('comments', 'totalCount', {int_or_none}), + 'tags': ('tags', ..., 'tag', 'name', {str}, filter, all, filter), + 'artists': ('featuringArtistList', ..., {str}, filter, all, filter), + }), } @@ -295,7 +288,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE): 'url': 'http://www.mixcloud.com/dholbach/', 'info_dict': { 'id': 'dholbach_uploads', - 'title': 'Daniel Holbach (uploads)', + 'title': 'dholbach (uploads)', 'description': 'md5:a3f468a60ac8c3e1f8616380fc469b2b', }, 'playlist_mincount': 36, @@ -303,7 +296,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE): 'url': 'http://www.mixcloud.com/dholbach/uploads/', 'info_dict': { 'id': 'dholbach_uploads', - 'title': 'Daniel Holbach (uploads)', + 'title': 'dholbach (uploads)', 'description': 'md5:a3f468a60ac8c3e1f8616380fc469b2b', }, 'playlist_mincount': 36, @@ -311,7 +304,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE): 'url': 'http://www.mixcloud.com/dholbach/favorites/', 'info_dict': { 'id': 'dholbach_favorites', - 'title': 'Daniel Holbach (favorites)', + 'title': 'dholbach (favorites)', 'description': 'md5:a3f468a60ac8c3e1f8616380fc469b2b', }, # 'params': { @@ -337,7 +330,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE): 'title': 'First Ear (stream)', 'description': 'we maraud for ears', }, - 'playlist_mincount': 269, + 'playlist_mincount': 267, }] _TITLE_KEY = 'displayName' @@ -361,7 +354,7 @@ class MixcloudPlaylistIE(MixcloudPlaylistBaseIE): 'id': 'maxvibes_jazzcat-on-ness-radio', 'title': 'Ness Radio sessions', }, - 'playlist_mincount': 59, + 'playlist_mincount': 58, }] _TITLE_KEY = 'name' _DESCRIPTION_KEY = 'description' From 
45f01de00e1bc076b7f676a669736326178647b1 Mon Sep 17 00:00:00 2001 From: sepro Date: Sun, 6 Apr 2025 20:31:00 +0200 Subject: [PATCH 39/44] [utils] `_yield_json_ld`: Make function less fatal (#12855) Authored by: seproDev --- yt_dlp/extractor/common.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 4c1bc4cf47..d5607296df 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1570,6 +1570,8 @@ class InfoExtractor: """Yield all json ld objects in the html""" if default is not NO_DEFAULT: fatal = False + if not fatal and not isinstance(html, str): + return for mobj in re.finditer(JSON_LD_RE, html): json_ld_item = self._parse_json( mobj.group('json_ld'), video_id, fatal=fatal, From a473e592337edb8ca40cde52c1fcaee261c54df9 Mon Sep 17 00:00:00 2001 From: doe1080 <98906116+doe1080@users.noreply.github.com> Date: Mon, 7 Apr 2025 03:46:08 +0900 Subject: [PATCH 40/44] [utils] `url_or_none`: Support WebSocket URLs (#12848) Authored by: doe1080 --- test/test_utils.py | 2 ++ yt_dlp/utils/_utils.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index e60ceed8fd..aedb565ec1 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -659,6 +659,8 @@ class TestUtil(unittest.TestCase): self.assertEqual(url_or_none('mms://foo.de'), 'mms://foo.de') self.assertEqual(url_or_none('rtspu://foo.de'), 'rtspu://foo.de') self.assertEqual(url_or_none('ftps://foo.de'), 'ftps://foo.de') + self.assertEqual(url_or_none('ws://foo.de'), 'ws://foo.de') + self.assertEqual(url_or_none('wss://foo.de'), 'wss://foo.de') def test_parse_age_limit(self): self.assertEqual(parse_age_limit(None), None) diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 24525560ef..99d7250876 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -2044,7 +2044,7 @@ def url_or_none(url): if not url or not isinstance(url, str): return None url = 
url.strip()
-    return url if re.match(r'(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
+    return url if re.match(r'(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?|wss?):)?//', url) else None
 
 
 def strftime_or_none(timestamp, date_format='%Y%m%d', default=None):

From 7faa18b83dcfc74a1a1e2034e6b0369c495ca645 Mon Sep 17 00:00:00 2001
From: "J.Luis"
Date: Sun, 6 Apr 2025 20:48:07 +0200
Subject: [PATCH 41/44] [ie/ivoox] Add extractor (#12768)

Authored by: NeonMan, seproDev

Co-authored-by: sepro <sepro@sepr0.com>
---
 yt_dlp/extractor/_extractors.py |  1 +
 yt_dlp/extractor/ivoox.py       | 78 +++++++++++++++++++++++++++++++++
 2 files changed, 79 insertions(+)
 create mode 100644 yt_dlp/extractor/ivoox.py

diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py
index a206a38e42..5ae7a8a972 100644
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -903,6 +903,7 @@ from .ivi import (
     IviIE,
 )
 from .ivideon import IvideonIE
+from .ivoox import IvooxIE
 from .iwara import (
     IwaraIE,
     IwaraPlaylistIE,
diff --git a/yt_dlp/extractor/ivoox.py b/yt_dlp/extractor/ivoox.py
new file mode 100644
index 0000000000..36e02493a5
--- /dev/null
+++ b/yt_dlp/extractor/ivoox.py
@@ -0,0 +1,78 @@
+from .common import InfoExtractor
+from ..utils import int_or_none, parse_iso8601, url_or_none, urljoin
+from ..utils.traversal import traverse_obj
+
+
+class IvooxIE(InfoExtractor):
+    _VALID_URL = (
+        r'https?://(?:www\.)?ivoox\.com/(?:\w{2}/)?[^/?#]+_rf_(?P<id>[0-9]+)_1\.html',
+        r'https?://go\.ivoox\.com/rf/(?P<id>[0-9]+)',
+    )
+    _TESTS = [{
+        'url': 'https://www.ivoox.com/dex-08x30-rostros-del-mal-los-asesinos-en-audios-mp3_rf_143594959_1.html',
+        'md5': '993f712de5b7d552459fc66aa3726885',
+        'info_dict': {
+            'id': '143594959',
+            'ext': 'mp3',
+            'timestamp': 1742731200,
+            'channel': 'DIAS EXTRAÑOS con Santiago Camacho',
+            'title': 'DEx 08x30 Rostros del mal: Los asesinos en serie que aterrorizaron España',
+            'description': 
'md5:eae8b4b9740d0216d3871390b056bb08', + 'uploader': 'Santiago Camacho', + 'thumbnail': 'https://static-1.ivoox.com/audios/c/d/5/2/cd52f46783fe735000c33a803dce2554_XXL.jpg', + 'upload_date': '20250323', + 'episode': 'DEx 08x30 Rostros del mal: Los asesinos en serie que aterrorizaron España', + 'duration': 11837, + 'tags': ['españa', 'asesinos en serie', 'arropiero', 'historia criminal', 'mataviejas'], + }, + }, { + 'url': 'https://go.ivoox.com/rf/143594959', + 'only_matching': True, + }, { + 'url': 'https://www.ivoox.com/en/campodelgas-28-03-2025-audios-mp3_rf_144036942_1.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + media_id = self._match_id(url) + webpage = self._download_webpage(url, media_id, fatal=False) + + data = self._search_nuxt_data( + webpage, media_id, fatal=False, traverse=('data', 0, 'data', 'audio')) + + direct_download = self._download_json( + f'https://vcore-web.ivoox.com/v1/public/audios/{media_id}/download-url', media_id, fatal=False, + note='Fetching direct download link', headers={'Referer': url}) + + download_paths = { + *traverse_obj(direct_download, ('data', 'downloadUrl', {str}, filter, all)), + *traverse_obj(data, (('downloadUrl', 'mediaUrl'), {str}, filter)), + } + + formats = [] + for path in download_paths: + formats.append({ + 'url': urljoin('https://ivoox.com', path), + 'http_headers': {'Referer': url}, + }) + + return { + 'id': media_id, + 'formats': formats, + 'uploader': self._html_search_regex(r'data-prm-author="([^"]+)"', webpage, 'author', default=None), + 'timestamp': parse_iso8601( + self._html_search_regex(r'data-prm-pubdate="([^"]+)"', webpage, 'timestamp', default=None)), + 'channel': self._html_search_regex(r'data-prm-podname="([^"]+)"', webpage, 'channel', default=None), + 'title': self._html_search_regex(r'data-prm-title="([^"]+)"', webpage, 'title', default=None), + 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'description': self._og_search_description(webpage, 
default=None),
+            **self._search_json_ld(webpage, media_id, default={}),
+            **traverse_obj(data, {
+                'title': ('title', {str}),
+                'description': ('description', {str}),
+                'thumbnail': ('image', {url_or_none}),
+                'timestamp': ('uploadDate', {parse_iso8601(delimiter=' ')}),
+                'duration': ('duration', {int_or_none}),
+                'tags': ('tags', ..., 'name', {str}),
+            }),
+        }

From 3c1c75ecb8ab352f422b59af46fff2be992e4115 Mon Sep 17 00:00:00 2001
From: Frank Aurich <1100101+automatic@gmail.com>
Date: Sun, 6 Apr 2025 21:04:24 +0200
Subject: [PATCH 42/44] [ie/kika] Add playlist extractor (#12832)

Closes #3658

Authored by: 1100101
---
 yt_dlp/extractor/_extractors.py |  5 +++-
 yt_dlp/extractor/kika.py        | 42 +++++++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py
index 5ae7a8a972..379e64548c 100644
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -961,7 +961,10 @@ from .kick import (
 )
 from .kicker import KickerIE
 from .kickstarter import KickStarterIE
-from .kika import KikaIE
+from .kika import (
+    KikaIE,
+    KikaPlaylistIE,
+)
 from .kinja import KinjaEmbedIE
 from .kinopoisk import KinoPoiskIE
 from .kommunetv import KommunetvIE
diff --git a/yt_dlp/extractor/kika.py b/yt_dlp/extractor/kika.py
index 69f4a3ce03..e277564524 100644
--- a/yt_dlp/extractor/kika.py
+++ b/yt_dlp/extractor/kika.py
@@ -1,3 +1,5 @@
+import itertools
+
 from .common import InfoExtractor
 from ..utils import (
     determine_ext,
@@ -124,3 +126,43 @@ class KikaIE(InfoExtractor):
             'vbr': ('bitrateVideo', {int_or_none}, {lambda x: None if x == -1 else x}),
         }),
     }
+
+
+class KikaPlaylistIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?kika\.de/[\w-]+/(?P<id>[a-z-]+\d+)'
+
+    _TESTS = [{
+        'url': 'https://www.kika.de/logo/logo-die-welt-und-ich-562',
+        'info_dict': {
+            'id': 'logo-die-welt-und-ich-562',
+            'title': 'logo!',
+            'description': 'md5:7b9d7f65561b82fa512f2cfb553c397d',
+        },
+        
'playlist_count': 100, + }] + + def _entries(self, playlist_url, playlist_id): + for page in itertools.count(1): + data = self._download_json(playlist_url, playlist_id, note=f'Downloading page {page}') + for item in traverse_obj(data, ('content', lambda _, v: url_or_none(v['api']['url']))): + yield self.url_result( + item['api']['url'], ie=KikaIE, + **traverse_obj(item, { + 'id': ('id', {str}), + 'title': ('title', {str}), + 'duration': ('duration', {int_or_none}), + 'timestamp': ('date', {parse_iso8601}), + })) + + playlist_url = traverse_obj(data, ('links', 'next', {url_or_none})) + if not playlist_url: + break + + def _real_extract(self, url): + playlist_id = self._match_id(url) + brand_data = self._download_json( + f'https://www.kika.de/_next-api/proxy/v1/brands/{playlist_id}', playlist_id) + + return self.playlist_result( + self._entries(brand_data['videoSubchannel']['videosPageUrl'], playlist_id), + playlist_id, title=brand_data.get('title'), description=brand_data.get('description')) From 1d45e30537bf83e069184a440703e4c43b2e0198 Mon Sep 17 00:00:00 2001 From: Snack Date: Mon, 7 Apr 2025 08:24:58 +0900 Subject: [PATCH 43/44] [ie/niconico:live] Fix extractor (#12809) Closes #12365 Authored by: Snack-X --- yt_dlp/downloader/niconico.py | 1 + yt_dlp/extractor/niconico.py | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/yt_dlp/downloader/niconico.py b/yt_dlp/downloader/niconico.py index 462c6e2d63..310144e7d2 100644 --- a/yt_dlp/downloader/niconico.py +++ b/yt_dlp/downloader/niconico.py @@ -85,6 +85,7 @@ class NiconicoLiveFD(FileDownloader): 'quality': live_quality, 'protocol': 'hls+fmp4', 'latency': live_latency, + 'accessRightMethod': 'single_cookie', 'chasePlay': False, }, 'room': { diff --git a/yt_dlp/extractor/niconico.py b/yt_dlp/extractor/niconico.py index 62dd0ab9cf..5e66aebeba 100644 --- a/yt_dlp/extractor/niconico.py +++ b/yt_dlp/extractor/niconico.py @@ -27,6 +27,7 @@ from ..utils import ( traverse_obj, try_get, unescapeHTML, + 
unified_timestamp, update_url_query, url_basename, url_or_none, @@ -985,6 +986,7 @@ class NiconicoLiveIE(InfoExtractor): 'quality': 'abr', 'protocol': 'hls+fmp4', 'latency': latency, + 'accessRightMethod': 'single_cookie', 'chasePlay': False, }, 'room': { @@ -1005,6 +1007,7 @@ class NiconicoLiveIE(InfoExtractor): if data.get('type') == 'stream': m3u8_url = data['data']['uri'] qualities = data['data']['availableQualities'] + cookies = data['data']['cookies'] break elif data.get('type') == 'disconnect': self.write_debug(recv) @@ -1043,6 +1046,11 @@ class NiconicoLiveIE(InfoExtractor): **res, }) + for cookie in cookies: + self._set_cookie( + cookie['domain'], cookie['name'], cookie['value'], + expire_time=unified_timestamp(cookie['expires']), path=cookie['path'], secure=cookie['secure']) + formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True) for fmt, q in zip(formats, reversed(qualities[1:])): fmt.update({ From 74e90dd9b8f9c1a5c48a2515126654f4d398d687 Mon Sep 17 00:00:00 2001 From: Subrat Lima <74418100+subrat-lima@users.noreply.github.com> Date: Mon, 7 Apr 2025 04:56:44 +0530 Subject: [PATCH 44/44] [ie/LRTRadio] Add extractor (#12801) Closes #12745 Authored by: subrat-lima --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/lrt.py | 44 ++++++++++++++++++++++++++++++++- 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 379e64548c..f7e3f25c3b 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1065,6 +1065,7 @@ from .loom import ( from .lovehomeporn import LoveHomePornIE from .lrt import ( LRTVODIE, + LRTRadioIE, LRTStreamIE, ) from .lsm import ( diff --git a/yt_dlp/extractor/lrt.py b/yt_dlp/extractor/lrt.py index 1a0b6da230..e50194f88b 100644 --- a/yt_dlp/extractor/lrt.py +++ b/yt_dlp/extractor/lrt.py @@ -2,8 +2,11 @@ from .common import InfoExtractor from ..utils import ( clean_html, merge_dicts, + str_or_none, 
traverse_obj,
+    unified_timestamp,
     url_or_none,
+    urljoin,
 )
 
 
@@ -80,7 +83,7 @@ class LRTVODIE(LRTBaseIE):
     }]
 
     def _real_extract(self, url):
-        path, video_id = self._match_valid_url(url).groups()
+        path, video_id = self._match_valid_url(url).group('path', 'id')
         webpage = self._download_webpage(url, video_id)
 
         media_url = self._extract_js_var(webpage, 'main_url', path)
@@ -106,3 +109,42 @@ class LRTVODIE(LRTBaseIE):
         }
 
         return merge_dicts(clean_info, jw_data, json_ld_data)
+
+
+class LRTRadioIE(LRTBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?lrt\.lt/radioteka/irasas/(?P<id>\d+)/(?P<path>[^?#/]+)'
+    _TESTS = [{
+        # m3u8 download
+        'url': 'https://www.lrt.lt/radioteka/irasas/2000359728/nemarios-eiles-apie-pragarus-ir-skaistyklas-su-aiste-kiltinaviciute',
+        'info_dict': {
+            'id': '2000359728',
+            'ext': 'm4a',
+            'title': 'Nemarios eilės: apie pragarus ir skaistyklas su Aiste Kiltinavičiūte',
+            'description': 'md5:5eee9a0e86a55bf547bd67596204625d',
+            'timestamp': 1726143120,
+            'upload_date': '20240912',
+            'tags': 'count:5',
+            'thumbnail': r're:https?://.+/.+\.jpe?g',
+            'categories': ['Daiktiniai įrodymai'],
+        },
+    }, {
+        'url': 'https://www.lrt.lt/radioteka/irasas/2000304654/vakaras-su-knyga-svetlana-aleksijevic-cernobylio-malda-v-dalis?season=%2Fmediateka%2Faudio%2Fvakaras-su-knyga%2F2023',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id, path = self._match_valid_url(url).group('id', 'path')
+        media = self._download_json(
+            'https://www.lrt.lt/radioteka/api/media', video_id,
+            query={'url': f'/mediateka/irasas/{video_id}/{path}'})
+
+        return traverse_obj(media, {
+            'id': ('id', {int}, {str_or_none}),
+            'title': ('title', {str}),
+            'tags': ('tags', ..., 'name', {str}),
+            'categories': ('playlist_item', 'category', {str}, filter, all, filter),
+            'description': ('content', {clean_html}, {str}),
+            'timestamp': ('date', {lambda x: x.replace('.', '/')}, {unified_timestamp}),
+            'thumbnail': ('playlist_item', 'image', {urljoin('https://www.lrt.lt')}),
+            'formats': 
('playlist_item', 'file', {lambda x: self._extract_m3u8_formats(x, video_id)}), + })