diff --git a/README.md b/README.md index c0329f5394..0cc2cd7b2c 100644 --- a/README.md +++ b/README.md @@ -1770,7 +1770,7 @@ The following extractors use this feature: * `lang`: Prefer translated metadata (`title`, `description` etc) of this language code (case-sensitive). By default, the video primary language metadata is preferred, with a fallback to `en` translated. See [youtube.py](https://github.com/yt-dlp/yt-dlp/blob/c26f9b991a0681fd3ea548d535919cec1fbbd430/yt_dlp/extractor/youtube.py#L381-L390) for list of supported content language codes * `skip`: One or more of `hls`, `dash` or `translated_subs` to skip extraction of the m3u8 manifests, dash manifests and [auto-translated subtitles](https://github.com/yt-dlp/yt-dlp/issues/4090#issuecomment-1158102032) respectively * `player_client`: Clients to extract video data from. The currently available clients are `web`, `web_safari`, `web_embedded`, `web_music`, `web_creator`, `mweb`, `ios`, `android`, `android_vr`, `tv` and `tv_embedded`. By default, `tv,ios,web` is used, or `tv,web` is used when authenticating with cookies. The `web_music` client is added for `music.youtube.com` URLs when logged-in cookies are used. The `tv_embedded` and `web_creator` clients are added for age-restricted videos if account age-verification is required. Some clients, such as `web` and `web_music`, require a `po_token` for their formats to be downloadable. Some clients, such as `web_creator`, will only work with authentication. Not all clients support authentication via cookies. You can use `default` for the default clients, or you can use `all` for all clients (not recommended). You can prefix a client with `-` to exclude it, e.g. `youtube:player_client=default,-ios` -* `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause some issues. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) for more details +* `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player), `initial_data` (skip initial data/next ep request). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause issues such as missing formats or metadata. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) and [#12826](https://github.com/yt-dlp/yt-dlp/issues/12826) for more details * `player_params`: YouTube player parameters to use for player requests. Will overwrite any default ones set by yt-dlp. * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side) * `max_comments`: Limit the amount of comments to gather. Comma-separated list of integers representing `max-comments,max-parents,max-replies,max-replies-per-thread`. Default is `all,all,all,all` diff --git a/test/helper.py b/test/helper.py index 4169af799f..e4cb478e28 100644 --- a/test/helper.py +++ b/test/helper.py @@ -136,7 +136,7 @@ def _iter_differences(got, expected, field): return if op == 'startswith': - if not val.startswith(got): + if not got.startswith(val): yield field, f'should start with {val!r}, got {got!r}' return diff --git a/test/test_networking.py b/test/test_networking.py index 3ab60fe836..2f441fced2 100644 --- a/test/test_networking.py +++ b/test/test_networking.py @@ -39,6 +39,7 @@ from yt_dlp.cookies import YoutubeDLCookieJar from yt_dlp.dependencies import brotli, curl_cffi, requests, urllib3 from yt_dlp.networking import ( HEADRequest, + PATCHRequest, PUTRequest, Request, RequestDirector, @@ -1856,6 +1857,7 @@ class TestRequest: def test_request_helpers(self): assert HEADRequest('http://example.com').method == 'HEAD' + assert PATCHRequest('http://example.com').method == 'PATCH' assert PUTRequest('http://example.com').method == 'PUT' def test_headers(self): diff --git a/test/test_utils.py b/test/test_utils.py index e60ceed8fd..aedb565ec1 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -659,6 +659,8 @@ class TestUtil(unittest.TestCase): self.assertEqual(url_or_none('mms://foo.de'), 'mms://foo.de') self.assertEqual(url_or_none('rtspu://foo.de'), 'rtspu://foo.de') self.assertEqual(url_or_none('ftps://foo.de'), 'ftps://foo.de') + self.assertEqual(url_or_none('ws://foo.de'), 'ws://foo.de') + self.assertEqual(url_or_none('wss://foo.de'), 'wss://foo.de') def test_parse_age_limit(self): self.assertEqual(parse_age_limit(None), None) diff --git a/yt_dlp/downloader/niconico.py b/yt_dlp/downloader/niconico.py index 462c6e2d63..310144e7d2 100644 --- a/yt_dlp/downloader/niconico.py +++ b/yt_dlp/downloader/niconico.py @@ -85,6 +85,7 @@ class NiconicoLiveFD(FileDownloader): 'quality': live_quality, 'protocol': 'hls+fmp4', 'latency': live_latency, + 'accessRightMethod': 'single_cookie', 'chasePlay': False, }, 'room': { diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 9fc8913654..047af92820 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -903,6 +903,7 @@ from .ivi import ( IviIE, ) from .ivideon import IvideonIE +from .ivoox import IvooxIE from .iwara import ( IwaraIE, IwaraPlaylistIE, @@ -960,7 +961,10 @@ from .kick import ( ) from .kicker import KickerIE from .kickstarter import KickStarterIE -from .kika import KikaIE +from .kika import ( + KikaIE, + KikaPlaylistIE, +) from .kinja import KinjaEmbedIE from .kinopoisk import KinoPoiskIE from .kommunetv import KommunetvIE @@ -1061,6 +1065,7 @@ from .loom import ( from .lovehomeporn import LoveHomePornIE from .lrt import ( LRTVODIE, + LRTRadioIE, LRTStreamIE, ) from .lsm import ( @@ -1493,6 +1498,10 @@ from .paramountplus import ( ) from .parler import ParlerIE from .parlview import ParlviewIE +from .parti import ( + PartiLivestreamIE, + PartiVideoIE, +) from .patreon import ( PatreonCampaignIE, PatreonIE, @@ -1774,7 +1783,6 @@ from .rtvcplay import ( from .rtve import ( RTVEALaCartaIE, RTVEAudioIE, - RTVEInfantilIE, RTVELiveIE, RTVETelevisionIE, ) @@ -2228,7 +2236,10 @@ from .tvplay import ( TVPlayIE, ) from .tvplayer import TVPlayerIE -from .tvw import TvwIE +from .tvw import ( + TvwIE, + TvwTvChannelsIE, +) from .tweakers import TweakersIE from .twentymin import TwentyMinutenIE from .twentythreevideo import TwentyThreeVideoIE diff --git a/yt_dlp/extractor/abematv.py b/yt_dlp/extractor/abematv.py index 8c7131b10a..8f2fc4c80a 100644 --- a/yt_dlp/extractor/abematv.py +++ b/yt_dlp/extractor/abematv.py @@ -21,6 +21,7 @@ from ..utils import ( int_or_none, time_seconds, traverse_obj, + update_url, update_url_query, ) @@ -417,6 +418,10 @@ class AbemaTVIE(AbemaTVBaseIE): 'is_live': is_live, 'availability': availability, }) + + if thumbnail := update_url(self._og_search_thumbnail(webpage, default=''), query=None): + info['thumbnails'] = [{'url': thumbnail}] + return info diff --git a/yt_dlp/extractor/agora.py b/yt_dlp/extractor/agora.py index 9835584254..e040db6010 100644 --- a/yt_dlp/extractor/agora.py +++ b/yt_dlp/extractor/agora.py @@ -146,7 +146,7 @@ class TokFMPodcastIE(InfoExtractor): 'url': 'https://audycje.tokfm.pl/podcast/91275,-Systemowy-rasizm-Czy-zamieszki-w-USA-po-morderstwie-w-Minneapolis-doprowadza-do-zmian-w-sluzbach-panstwowych', 'info_dict': { 'id': '91275', - 'ext': 'aac', + 'ext': 'mp3', 'title': 'md5:a9b15488009065556900169fb8061cce', 'episode': 'md5:a9b15488009065556900169fb8061cce', 'series': 'Analizy', @@ -164,23 +164,20 @@ class TokFMPodcastIE(InfoExtractor): raise ExtractorError('No such podcast', expected=True) metadata = metadata[0] - formats = [] - for ext in ('aac', 'mp3'): - url_data = self._download_json( - f'https://api.podcast.radioagora.pl/api4/getSongUrl?podcast_id={media_id}&device_id={uuid.uuid4()}&ppre=false&audio={ext}', - media_id, f'Downloading podcast {ext} URL') - # prevents inserting the mp3 (default) multiple times - if 'link_ssl' in url_data and f'.{ext}' in url_data['link_ssl']: - formats.append({ - 'url': url_data['link_ssl'], - 'ext': ext, - 'vcodec': 'none', - 'acodec': ext, - }) + mp3_url = self._download_json( + 'https://api.podcast.radioagora.pl/api4/getSongUrl', + media_id, 'Downloading podcast mp3 URL', query={ + 'podcast_id': media_id, + 'device_id': str(uuid.uuid4()), + 'ppre': 'false', + 'audio': 'mp3', + })['link_ssl'] return { 'id': media_id, - 'formats': formats, + 'url': mp3_url, + 'vcodec': 'none', + 'ext': 'mp3', 'title': metadata.get('podcast_name'), 'series': metadata.get('series_name'), 'episode': metadata.get('podcast_name'), diff --git a/yt_dlp/extractor/atresplayer.py b/yt_dlp/extractor/atresplayer.py index 0fe95bec5c..1258a5704d 100644 --- a/yt_dlp/extractor/atresplayer.py +++ b/yt_dlp/extractor/atresplayer.py @@ -1,64 +1,105 @@ +import urllib.parse + from .common import InfoExtractor from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, int_or_none, + parse_age_limit, + url_or_none, urlencode_postdata, ) +from ..utils.traversal import traverse_obj class AtresPlayerIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?atresplayer\.com/[^/]+/[^/]+/[^/]+/[^/]+/(?P.+?)_(?P[0-9a-f]{24})' + _VALID_URL = r'https?://(?:www\.)?atresplayer\.com/(?:[^/?#]+/){4}(?P.+?)_(?P[0-9a-f]{24})' _NETRC_MACHINE = 'atresplayer' - _TESTS = [ - { - 'url': 'https://www.atresplayer.com/antena3/series/pequenas-coincidencias/temporada-1/capitulo-7-asuntos-pendientes_5d4aa2c57ed1a88fc715a615/', - 'info_dict': { - 'id': '5d4aa2c57ed1a88fc715a615', - 'ext': 'mp4', - 'title': 'Capítulo 7: Asuntos pendientes', - 'description': 'md5:7634cdcb4d50d5381bedf93efb537fbc', - 'duration': 3413, - }, - 'skip': 'This video is only available for registered users', + _TESTS = [{ + 'url': 'https://www.atresplayer.com/lasexta/programas/el-objetivo/clips/mbappe-describe-como-entrenador-a-carlo-ancelotti-sabe-cuando-tiene-que-ser-padre-jefe-amigo-entrenador_67f2dfb2fb6ab0e4c7203849/', + 'info_dict': { + 'ext': 'mp4', + 'id': '67f2dfb2fb6ab0e4c7203849', + 'display_id': 'md5:c203f8d4e425ed115ba56a1c6e4b3e6c', + 'title': 'Mbappé describe como entrenador a Carlo Ancelotti: "Sabe cuándo tiene que ser padre, jefe, amigo, entrenador..."', + 'channel': 'laSexta', + 'duration': 31, + 'thumbnail': 'https://imagenes.atresplayer.com/atp/clipping/cmsimages02/2025/04/06/B02DBE1E-D59B-4683-8404-1A9595D15269/1920x1080.jpg', + 'tags': ['Entrevista informativa', 'Actualidad', 'Debate informativo', 'Política', 'Economía', 'Sociedad', 'Cara a cara', 'Análisis', 'Más periodismo'], + 'series': 'El Objetivo', + 'season': 'Temporada 12', + 'timestamp': 1743970079, + 'upload_date': '20250406', }, - { - 'url': 'https://www.atresplayer.com/lasexta/programas/el-club-de-la-comedia/temporada-4/capitulo-10-especial-solidario-nochebuena_5ad08edf986b2855ed47adc4/', - 'only_matching': True, + }, { + 'url': 'https://www.atresplayer.com/antena3/programas/el-hormiguero/clips/revive-la-entrevista-completa-a-miguel-bose-en-el-hormiguero_67f836baa4a5b0e4147ca59a/', + 'info_dict': { + 'ext': 'mp4', + 'id': '67f836baa4a5b0e4147ca59a', + 'display_id': 'revive-la-entrevista-completa-a-miguel-bose-en-el-hormiguero', + 'title': 'Revive la entrevista completa a Miguel Bosé en El Hormiguero', + 'description': 'md5:c6d2b591408d45a7bc2986dfb938eb72', + 'channel': 'Antena 3', + 'duration': 2556, + 'thumbnail': 'https://imagenes.atresplayer.com/atp/clipping/cmsimages02/2025/04/10/9076395F-F1FD-48BE-9F18-540DBA10EBAD/1920x1080.jpg', + 'tags': ['Entrevista', 'Variedades', 'Humor', 'Entretenimiento', 'Te sigo', 'Buen rollo', 'Cara a cara'], + 'series': 'El Hormiguero ', + 'season': 'Temporada 14', + 'timestamp': 1744320111, + 'upload_date': '20250410', }, - { - 'url': 'https://www.atresplayer.com/antena3/series/el-secreto-de-puente-viejo/el-chico-de-los-tres-lunares/capitulo-977-29-12-14_5ad51046986b2886722ccdea/', - 'only_matching': True, + }, { + 'url': 'https://www.atresplayer.com/flooxer/series/biara-proyecto-lazarus/temporada-1/capitulo-3-supervivientes_67a6038b64ceca00070f4f69/', + 'info_dict': { + 'ext': 'mp4', + 'id': '67a6038b64ceca00070f4f69', + 'display_id': 'capitulo-3-supervivientes', + 'title': 'Capítulo 3: Supervivientes', + 'description': 'md5:65b231f20302f776c2b0dd24594599a1', + 'channel': 'Flooxer', + 'duration': 1196, + 'thumbnail': 'https://imagenes.atresplayer.com/atp/clipping/cmsimages01/2025/02/14/17CF90D3-FE67-40C5-A941-7825B3E13992/1920x1080.jpg', + 'tags': ['Juvenil', 'Terror', 'Piel de gallina', 'Te sigo', 'Un break', 'Del tirón'], + 'series': 'BIARA: Proyecto Lázarus', + 'season': 'Temporada 1', + 'season_number': 1, + 'episode': 'Episode 3', + 'episode_number': 3, + 'timestamp': 1743095191, + 'upload_date': '20250327', }, - ] + }, { + 'url': 'https://www.atresplayer.com/lasexta/programas/el-club-de-la-comedia/temporada-4/capitulo-10-especial-solidario-nochebuena_5ad08edf986b2855ed47adc4/', + 'only_matching': True, + }, { + 'url': 'https://www.atresplayer.com/antena3/series/el-secreto-de-puente-viejo/el-chico-de-los-tres-lunares/capitulo-977-29-12-14_5ad51046986b2886722ccdea/', + 'only_matching': True, + }] _API_BASE = 'https://api.atresplayer.com/' def _perform_login(self, username, password): - self._request_webpage( - self._API_BASE + 'login', None, 'Downloading login page') - try: - target_url = self._download_json( - 'https://account.atresmedia.com/api/login', None, - 'Logging in', headers={ - 'Content-Type': 'application/x-www-form-urlencoded', - }, data=urlencode_postdata({ + self._download_webpage( + 'https://account.atresplayer.com/auth/v1/login', None, + 'Logging in', 'Failed to log in', data=urlencode_postdata({ 'username': username, 'password': password, - }))['targetUrl'] + })) except ExtractorError as e: if isinstance(e.cause, HTTPError) and e.cause.status == 400: raise ExtractorError('Invalid username and/or password', expected=True) raise - self._request_webpage(target_url, None, 'Following Target URL') - def _real_extract(self, url): display_id, video_id = self._match_valid_url(url).groups() + metadata_url = self._download_json( + self._API_BASE + 'client/v1/url', video_id, 'Downloading API endpoint data', + query={'href': urllib.parse.urlparse(url).path})['href'] + metadata = self._download_json(metadata_url, video_id) + try: - episode = self._download_json( - self._API_BASE + 'client/v1/player/episode/' + video_id, video_id) + video_data = self._download_json(metadata['urlVideo'], video_id, 'Downloading video data') except ExtractorError as e: if isinstance(e.cause, HTTPError) and e.cause.status == 403: error = self._parse_json(e.cause.response.read(), None) @@ -67,37 +108,45 @@ class AtresPlayerIE(InfoExtractor): raise ExtractorError(error['error_description'], expected=True) raise - title = episode['titulo'] - formats = [] subtitles = {} - for source in episode.get('sources', []): - src = source.get('src') - if not src: - continue + for source in traverse_obj(video_data, ('sources', lambda _, v: url_or_none(v['src']))): + src_url = source['src'] src_type = source.get('type') - if src_type == 'application/vnd.apple.mpegurl': - formats, subtitles = self._extract_m3u8_formats( - src, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False) - elif src_type == 'application/dash+xml': - formats, subtitles = self._extract_mpd_formats( - src, video_id, mpd_id='dash', fatal=False) - - heartbeat = episode.get('heartbeat') or {} - omniture = episode.get('omniture') or {} - get_meta = lambda x: heartbeat.get(x) or omniture.get(x) + if src_type in ('application/vnd.apple.mpegurl', 'application/hls+legacy', 'application/hls+hevc'): + fmts, subs = self._extract_m3u8_formats_and_subtitles( + src_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + elif src_type in ('application/dash+xml', 'application/dash+hevc'): + fmts, subs = self._extract_mpd_formats_and_subtitles( + src_url, video_id, mpd_id='dash', fatal=False) + else: + continue + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) return { 'display_id': display_id, 'id': video_id, - 'title': title, - 'description': episode.get('descripcion'), - 'thumbnail': episode.get('imgPoster'), - 'duration': int_or_none(episode.get('duration')), 'formats': formats, - 'channel': get_meta('channel'), - 'season': get_meta('season'), - 'episode_number': int_or_none(get_meta('episodeNumber')), 'subtitles': subtitles, + **traverse_obj(video_data, { + 'title': ('titulo', {str}), + 'description': ('descripcion', {str}), + 'duration': ('duration', {int_or_none}), + 'thumbnail': ('imgPoster', {url_or_none}, {lambda v: f'{v}1920x1080.jpg'}), + 'age_limit': ('ageRating', {parse_age_limit}), + }), + **traverse_obj(metadata, { + 'title': ('title', {str}), + 'description': ('description', {str}), + 'duration': ('duration', {int_or_none}), + 'tags': ('tags', ..., 'title', {str}), + 'age_limit': ('ageRating', {parse_age_limit}), + 'series': ('format', 'title', {str}), + 'season': ('currentSeason', 'title', {str}), + 'season_number': ('currentSeason', 'seasonNumber', {int_or_none}), + 'episode_number': ('numberOfEpisode', {int_or_none}), + 'timestamp': ('publicationDate', {int_or_none(scale=1000)}), + 'channel': ('channel', 'title', {str}), + }), } diff --git a/yt_dlp/extractor/cda.py b/yt_dlp/extractor/cda.py index 96f25c22a8..027b37d448 100644 --- a/yt_dlp/extractor/cda.py +++ b/yt_dlp/extractor/cda.py @@ -13,16 +13,17 @@ from ..compat import compat_ord from ..utils import ( ExtractorError, OnDemandPagedList, + determine_ext, float_or_none, int_or_none, merge_dicts, multipart_encode, parse_duration, - traverse_obj, try_call, - try_get, + url_or_none, urljoin, ) +from ..utils.traversal import traverse_obj class CDAIE(InfoExtractor): @@ -290,34 +291,47 @@ class CDAIE(InfoExtractor): if not video or 'file' not in video: self.report_warning(f'Unable to extract {version} version information') return - if video['file'].startswith('uggc'): - video['file'] = codecs.decode(video['file'], 'rot_13') - if video['file'].endswith('adc.mp4'): - video['file'] = video['file'].replace('adc.mp4', '.mp4') - elif not video['file'].startswith('http'): - video['file'] = decrypt_file(video['file']) video_quality = video.get('quality') qualities = video.get('qualities', {}) video_quality = next((k for k, v in qualities.items() if v == video_quality), video_quality) - info_dict['formats'].append({ - 'url': video['file'], - 'format_id': video_quality, - 'height': int_or_none(video_quality[:-1]), - }) + if video.get('file'): + if video['file'].startswith('uggc'): + video['file'] = codecs.decode(video['file'], 'rot_13') + if video['file'].endswith('adc.mp4'): + video['file'] = video['file'].replace('adc.mp4', '.mp4') + elif not video['file'].startswith('http'): + video['file'] = decrypt_file(video['file']) + info_dict['formats'].append({ + 'url': video['file'], + 'format_id': video_quality, + 'height': int_or_none(video_quality[:-1]), + }) for quality, cda_quality in qualities.items(): if quality == video_quality: continue data = {'jsonrpc': '2.0', 'method': 'videoGetLink', 'id': 2, 'params': [video_id, cda_quality, video.get('ts'), video.get('hash2'), {}]} data = json.dumps(data).encode() - video_url = self._download_json( + response = self._download_json( f'https://www.cda.pl/video/{video_id}', video_id, headers={ 'Content-Type': 'application/json', 'X-Requested-With': 'XMLHttpRequest', }, data=data, note=f'Fetching {quality} url', errnote=f'Failed to fetch {quality} url', fatal=False) - if try_get(video_url, lambda x: x['result']['status']) == 'ok': - video_url = try_get(video_url, lambda x: x['result']['resp']) + if ( + traverse_obj(response, ('result', 'status')) != 'ok' + or not traverse_obj(response, ('result', 'resp', {url_or_none})) + ): + continue + video_url = response['result']['resp'] + ext = determine_ext(video_url) + if ext == 'mpd': + info_dict['formats'].extend(self._extract_mpd_formats( + video_url, video_id, mpd_id='dash', fatal=False)) + elif ext == 'm3u8': + info_dict['formats'].extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + else: info_dict['formats'].append({ 'url': video_url, 'format_id': quality, @@ -353,7 +367,7 @@ class CDAIE(InfoExtractor): class CDAFolderIE(InfoExtractor): _MAX_PAGE_SIZE = 36 - _VALID_URL = r'https?://(?:www\.)?cda\.pl/(?P\w+)/folder/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?cda\.pl/(?P[\w-]+)/folder/(?P\d+)' _TESTS = [ { 'url': 'https://www.cda.pl/domino264/folder/31188385', @@ -378,6 +392,9 @@ class CDAFolderIE(InfoExtractor): 'title': 'TESTY KOSMETYKÓW', }, 'playlist_mincount': 139, + }, { + 'url': 'https://www.cda.pl/FILMY-SERIALE-ANIME-KRESKOWKI-BAJKI/folder/18493422', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 4c1bc4cf47..d5607296df 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1570,6 +1570,8 @@ class InfoExtractor: """Yield all json ld objects in the html""" if default is not NO_DEFAULT: fatal = False + if not fatal and not isinstance(html, str): + return for mobj in re.finditer(JSON_LD_RE, html): json_ld_item = self._parse_json( mobj.group('json_ld'), video_id, fatal=fatal, diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index b0c7be4627..721d04e317 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -16,7 +16,6 @@ from ..utils import ( MEDIA_EXTENSIONS, ExtractorError, UnsupportedError, - base_url, determine_ext, determine_protocol, dict_get, @@ -38,6 +37,7 @@ from ..utils import ( unescapeHTML, unified_timestamp, unsmuggle_url, + update_url, update_url_query, url_or_none, urlhandle_detect_ext, @@ -2538,12 +2538,13 @@ class GenericIE(InfoExtractor): return self.playlist_result( self._parse_xspf( doc, video_id, xspf_url=url, - xspf_base_url=full_response.url), + xspf_base_url=new_url), video_id) elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag): info_dict['formats'], info_dict['subtitles'] = self._parse_mpd_formats_and_subtitles( doc, - mpd_base_url=base_url(full_response.url), + # Do not use yt_dlp.utils.base_url here since it will raise on file:// URLs + mpd_base_url=update_url(new_url, query=None, fragment=None).rpartition('/')[0], mpd_url=url) info_dict['live_status'] = 'is_live' if doc.get('type') == 'dynamic' else None self._extra_manifest_info(info_dict, url) diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py index b7581d77e2..2d923cf540 100644 --- a/yt_dlp/extractor/getcourseru.py +++ b/yt_dlp/extractor/getcourseru.py @@ -8,7 +8,7 @@ from ..utils.traversal import traverse_obj class GetCourseRuPlayerIE(InfoExtractor): - _VALID_URL = r'https?://player02\.getcourse\.ru/sign-player/?\?(?:[^#]+&)?json=[^#&]+' + _VALID_URL = r'https?://(?:player02\.getcourse\.ru|cf-api-2\.vhcdn\.com)/sign-player/?\?(?:[^#]+&)?json=[^#&]+' _EMBED_REGEX = [rf']+\bsrc=[\'"](?P{_VALID_URL}[^\'"]*)'] _TESTS = [{ 'url': 'http://player02.getcourse.ru/sign-player/?json=eyJ2aWRlb19oYXNoIjoiMTkwYmRmOTNmMWIyOTczNTMwOTg1M2E3YTE5ZTI0YjMiLCJ1c2VyX2lkIjozNTk1MjUxODMsInN1Yl9sb2dpbl91c2VyX2lkIjpudWxsLCJsZXNzb25faWQiOm51bGwsImlwIjoiNDYuMTQyLjE4Mi4yNDciLCJnY19ob3N0IjoiYWNhZGVteW1lbC5vbmxpbmUiLCJ0aW1lIjoxNzA1NDQ5NjQyLCJwYXlsb2FkIjoidV8zNTk1MjUxODMiLCJ1aV9sYW5ndWFnZSI6InJ1IiwiaXNfaGF2ZV9jdXN0b21fc3R5bGUiOnRydWV9&s=354ad2c993d95d5ac629e3133d6cefea&vh-static-feature=zigzag', @@ -20,6 +20,16 @@ class GetCourseRuPlayerIE(InfoExtractor): 'duration': 1693, }, 'skip': 'JWT expired', + }, { + 'url': 'https://cf-api-2.vhcdn.com/sign-player/?json=example', + 'info_dict': { + 'id': '435735291', + 'title': '8afd7c489952108e00f019590f3711f3', + 'ext': 'mp4', + 'thumbnail': 'https://preview-htz.vhcdn.com/preview/8afd7c489952108e00f019590f3711f3/preview.jpg?version=1682170973&host=vh-72', + 'duration': 777, + }, + 'skip': 'JWT expired', }] def _real_extract(self, url): @@ -168,7 +178,7 @@ class GetCourseRuIE(InfoExtractor): playlist_id = self._search_regex( r'window\.(?:lessonId|gcsObjectId)\s*=\s*(\d+)', webpage, 'playlist id', default=display_id) - title = self._og_search_title(webpage) or self._html_extract_title(webpage) + title = self._og_search_title(webpage, default=None) or self._html_extract_title(webpage) return self.playlist_from_matches( re.findall(GetCourseRuPlayerIE._EMBED_REGEX[0], webpage), diff --git a/yt_dlp/extractor/ivoox.py b/yt_dlp/extractor/ivoox.py new file mode 100644 index 0000000000..36e02493a5 --- /dev/null +++ b/yt_dlp/extractor/ivoox.py @@ -0,0 +1,78 @@ +from .common import InfoExtractor +from ..utils import int_or_none, parse_iso8601, url_or_none, urljoin +from ..utils.traversal import traverse_obj + + +class IvooxIE(InfoExtractor): + _VALID_URL = ( + r'https?://(?:www\.)?ivoox\.com/(?:\w{2}/)?[^/?#]+_rf_(?P[0-9]+)_1\.html', + r'https?://go\.ivoox\.com/rf/(?P[0-9]+)', + ) + _TESTS = [{ + 'url': 'https://www.ivoox.com/dex-08x30-rostros-del-mal-los-asesinos-en-audios-mp3_rf_143594959_1.html', + 'md5': '993f712de5b7d552459fc66aa3726885', + 'info_dict': { + 'id': '143594959', + 'ext': 'mp3', + 'timestamp': 1742731200, + 'channel': 'DIAS EXTRAÑOS con Santiago Camacho', + 'title': 'DEx 08x30 Rostros del mal: Los asesinos en serie que aterrorizaron España', + 'description': 'md5:eae8b4b9740d0216d3871390b056bb08', + 'uploader': 'Santiago Camacho', + 'thumbnail': 'https://static-1.ivoox.com/audios/c/d/5/2/cd52f46783fe735000c33a803dce2554_XXL.jpg', + 'upload_date': '20250323', + 'episode': 'DEx 08x30 Rostros del mal: Los asesinos en serie que aterrorizaron España', + 'duration': 11837, + 'tags': ['españa', 'asesinos en serie', 'arropiero', 'historia criminal', 'mataviejas'], + }, + }, { + 'url': 'https://go.ivoox.com/rf/143594959', + 'only_matching': True, + }, { + 'url': 'https://www.ivoox.com/en/campodelgas-28-03-2025-audios-mp3_rf_144036942_1.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + media_id = self._match_id(url) + webpage = self._download_webpage(url, media_id, fatal=False) + + data = self._search_nuxt_data( + webpage, media_id, fatal=False, traverse=('data', 0, 'data', 'audio')) + + direct_download = self._download_json( + f'https://vcore-web.ivoox.com/v1/public/audios/{media_id}/download-url', media_id, fatal=False, + note='Fetching direct download link', headers={'Referer': url}) + + download_paths = { + *traverse_obj(direct_download, ('data', 'downloadUrl', {str}, filter, all)), + *traverse_obj(data, (('downloadUrl', 'mediaUrl'), {str}, filter)), + } + + formats = [] + for path in download_paths: + formats.append({ + 'url': urljoin('https://ivoox.com', path), + 'http_headers': {'Referer': url}, + }) + + return { + 'id': media_id, + 'formats': formats, + 'uploader': self._html_search_regex(r'data-prm-author="([^"]+)"', webpage, 'author', default=None), + 'timestamp': parse_iso8601( + self._html_search_regex(r'data-prm-pubdate="([^"]+)"', webpage, 'timestamp', default=None)), + 'channel': self._html_search_regex(r'data-prm-podname="([^"]+)"', webpage, 'channel', default=None), + 'title': self._html_search_regex(r'data-prm-title="([^"]+)"', webpage, 'title', default=None), + 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'description': self._og_search_description(webpage, default=None), + **self._search_json_ld(webpage, media_id, default={}), + **traverse_obj(data, { + 'title': ('title', {str}), + 'description': ('description', {str}), + 'thumbnail': ('image', {url_or_none}), + 'timestamp': ('uploadDate', {parse_iso8601(delimiter=' ')}), + 'duration': ('duration', {int_or_none}), + 'tags': ('tags', ..., 'name', {str}), + }), + } diff --git a/yt_dlp/extractor/kika.py b/yt_dlp/extractor/kika.py index 69f4a3ce03..e277564524 100644 --- a/yt_dlp/extractor/kika.py +++ b/yt_dlp/extractor/kika.py @@ -1,3 +1,5 @@ +import itertools + from .common import InfoExtractor from ..utils import ( determine_ext, @@ -124,3 +126,43 @@ class KikaIE(InfoExtractor): 'vbr': ('bitrateVideo', {int_or_none}, {lambda x: None if x == -1 else x}), }), } + + +class KikaPlaylistIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?kika\.de/[\w-]+/(?P[a-z-]+\d+)' + + _TESTS = [{ + 'url': 'https://www.kika.de/logo/logo-die-welt-und-ich-562', + 'info_dict': { + 'id': 'logo-die-welt-und-ich-562', + 'title': 'logo!', + 'description': 'md5:7b9d7f65561b82fa512f2cfb553c397d', + }, + 'playlist_count': 100, + }] + + def _entries(self, playlist_url, playlist_id): + for page in itertools.count(1): + data = self._download_json(playlist_url, playlist_id, note=f'Downloading page {page}') + for item in traverse_obj(data, ('content', lambda _, v: url_or_none(v['api']['url']))): + yield self.url_result( + item['api']['url'], ie=KikaIE, + **traverse_obj(item, { + 'id': ('id', {str}), + 'title': ('title', {str}), + 'duration': ('duration', {int_or_none}), + 'timestamp': ('date', {parse_iso8601}), + })) + + playlist_url = traverse_obj(data, ('links', 'next', {url_or_none})) + if not playlist_url: + break + + def _real_extract(self, url): + playlist_id = self._match_id(url) + brand_data = self._download_json( + f'https://www.kika.de/_next-api/proxy/v1/brands/{playlist_id}', playlist_id) + + return self.playlist_result( + self._entries(brand_data['videoSubchannel']['videosPageUrl'], playlist_id), + playlist_id, title=brand_data.get('title'), description=brand_data.get('description')) diff --git a/yt_dlp/extractor/linkedin.py b/yt_dlp/extractor/linkedin.py index c8c8ae52ad..d25f0fe6a0 100644 --- a/yt_dlp/extractor/linkedin.py +++ b/yt_dlp/extractor/linkedin.py @@ -82,7 +82,10 @@ class LinkedInLearningBaseIE(LinkedInBaseIE): class LinkedInIE(LinkedInBaseIE): - _VALID_URL = r'https?://(?:www\.)?linkedin\.com/posts/[^/?#]+-(?P\d+)-\w{4}/?(?:[?#]|$)' + _VALID_URL = [ + r'https?://(?:www\.)?linkedin\.com/posts/[^/?#]+-(?P\d+)-\w{4}/?(?:[?#]|$)', + r'https?://(?:www\.)?linkedin\.com/feed/update/urn:li:activity:(?P\d+)', + ] _TESTS = [{ 'url': 'https://www.linkedin.com/posts/mishalkhawaja_sendinblueviews-toronto-digitalmarketing-ugcPost-6850898786781339649-mM20', 'info_dict': { @@ -106,6 +109,9 @@ class LinkedInIE(LinkedInBaseIE): 'like_count': int, 'subtitles': 'mincount:1', }, + }, { + 'url': 'https://www.linkedin.com/feed/update/urn:li:activity:7016901149999955968/?utm_source=share&utm_medium=member_desktop', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/loco.py b/yt_dlp/extractor/loco.py index a648f7e13d..6c9a255678 100644 --- a/yt_dlp/extractor/loco.py +++ b/yt_dlp/extractor/loco.py @@ -1,5 +1,9 @@ +import json +import random +import time + from .common import InfoExtractor -from ..utils import int_or_none, url_or_none +from ..utils import int_or_none, jwt_decode_hs256, try_call, url_or_none from ..utils.traversal import require, traverse_obj @@ -55,13 +59,81 @@ class LocoIE(InfoExtractor): 'upload_date': '20250226', 'modified_date': '20250226', }, + }, { + # Requires video authorization + 'url': 'https://loco.com/stream/ac854641-ae0f-497c-a8ea-4195f6d8cc53', + 'md5': '0513edf85c1e65c9521f555f665387d5', + 'info_dict': { + 'id': 'ac854641-ae0f-497c-a8ea-4195f6d8cc53', + 'ext': 'mp4', + 'title': 'DUAS CONTAS DESAFIANTE, RUSH TOP 1 NO BRASIL!', + 'description': 'md5:aa77818edd6fe00dd4b6be75cba5f826', + 'uploader_id': '7Y9JNAZC3Q', + 'channel': 'ayellol', + 'channel_follower_count': int, + 'comment_count': int, + 'view_count': int, + 'concurrent_view_count': int, + 'like_count': int, + 'duration': 1229, + 'thumbnail': 'https://static.ivory.getloconow.com/default_thumb/f5aa678b-6d04-45d9-a89a-859af0a8028f.jpg', + 'tags': ['Gameplay', 'Carry'], + 'series': 'League of Legends', + 'timestamp': 1741182253, + 'upload_date': '20250305', + 'modified_timestamp': 1741182419, + 'modified_date': '20250305', + }, }] + # From _app.js + _CLIENT_ID = 'TlwKp1zmF6eKFpcisn3FyR18WkhcPkZtzwPVEEC3' + _CLIENT_SECRET = 'Kp7tYlUN7LXvtcSpwYvIitgYcLparbtsQSe5AdyyCdiEJBP53Vt9J8eB4AsLdChIpcO2BM19RA3HsGtqDJFjWmwoonvMSG3ZQmnS8x1YIM8yl82xMXZGbE3NKiqmgBVU' + + def _is_jwt_expired(self, token): + return jwt_decode_hs256(token)['exp'] - time.time() < 300 + + def _get_access_token(self, video_id): + access_token = try_call(lambda: self._get_cookies('https://loco.com')['access_token'].value) + if access_token and not self._is_jwt_expired(access_token): + return access_token + access_token = traverse_obj(self._download_json( + 'https://api.getloconow.com/v3/user/device_profile/', video_id, + 'Downloading access token', fatal=False, data=json.dumps({ + 'platform': 7, + 'client_id': self._CLIENT_ID, + 'client_secret': self._CLIENT_SECRET, + 'model': 'Mozilla', + 'os_name': 'Win32', + 'os_ver': '5.0 (Windows)', + 'app_ver': '5.0 (Windows)', + }).encode(), headers={ + 'Content-Type': 'application/json;charset=utf-8', + 'DEVICE-ID': ''.join(random.choices('0123456789abcdef', k=32)) + 'live', + 'X-APP-LANG': 'en', + 'X-APP-LOCALE': 'en-US', + 'X-CLIENT-ID': self._CLIENT_ID, + 'X-CLIENT-SECRET': self._CLIENT_SECRET, + 'X-PLATFORM': '7', + }), 'access_token') + if access_token and not self._is_jwt_expired(access_token): + self._set_cookie('.loco.com', 'access_token', access_token) + return access_token + def _real_extract(self, url): video_type, video_id = self._match_valid_url(url).group('type', 'id') webpage = self._download_webpage(url, video_id) stream = traverse_obj(self._search_nextjs_data(webpage, video_id), ( - 'props', 'pageProps', ('liveStreamData', 'stream'), {dict}, any, {require('stream info')})) + 'props', 'pageProps', ('liveStreamData', 'stream', 'liveStream'), {dict}, any, {require('stream info')})) + + if access_token := self._get_access_token(video_id): + self._request_webpage( + 'https://drm.loco.com/v1/streams/playback/', video_id, + 'Downloading video authorization', fatal=False, headers={ + 'authorization': access_token, + }, query={ + 'stream_uid': stream['uid'], + }) return { 'formats': self._extract_m3u8_formats(stream['conf']['hls'], video_id), diff --git a/yt_dlp/extractor/lrt.py b/yt_dlp/extractor/lrt.py index 1a0b6da230..e50194f88b 100644 --- a/yt_dlp/extractor/lrt.py +++ b/yt_dlp/extractor/lrt.py @@ -2,8 +2,11 @@ from .common import InfoExtractor from ..utils import ( clean_html, merge_dicts, + str_or_none, traverse_obj, + unified_timestamp, url_or_none, + urljoin, ) @@ -80,7 +83,7 @@ class LRTVODIE(LRTBaseIE): }] def _real_extract(self, url): - path, video_id = self._match_valid_url(url).groups() + path, video_id = self._match_valid_url(url).group('path', 'id') webpage = self._download_webpage(url, video_id) media_url = self._extract_js_var(webpage, 'main_url', path) @@ -106,3 +109,42 @@ class LRTVODIE(LRTBaseIE): } return merge_dicts(clean_info, jw_data, json_ld_data) + + +class LRTRadioIE(LRTBaseIE): + _VALID_URL = r'https?://(?:www\.)?lrt\.lt/radioteka/irasas/(?P\d+)/(?P[^?#/]+)' + _TESTS = [{ + # m3u8 download + 'url': 'https://www.lrt.lt/radioteka/irasas/2000359728/nemarios-eiles-apie-pragarus-ir-skaistyklas-su-aiste-kiltinaviciute', + 'info_dict': { + 'id': '2000359728', + 'ext': 'm4a', + 'title': 'Nemarios eilės: apie pragarus ir skaistyklas su Aiste Kiltinavičiūte', + 'description': 'md5:5eee9a0e86a55bf547bd67596204625d', + 'timestamp': 1726143120, + 'upload_date': '20240912', + 'tags': 'count:5', + 'thumbnail': r're:https?://.+/.+\.jpe?g', + 'categories': ['Daiktiniai įrodymai'], + }, + }, { + 'url': 'https://www.lrt.lt/radioteka/irasas/2000304654/vakaras-su-knyga-svetlana-aleksijevic-cernobylio-malda-v-dalis?season=%2Fmediateka%2Faudio%2Fvakaras-su-knyga%2F2023', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id, path = self._match_valid_url(url).group('id', 'path') + media = self._download_json( + 'https://www.lrt.lt/radioteka/api/media', video_id, + query={'url': f'/mediateka/irasas/{video_id}/{path}'}) + + return traverse_obj(media, { + 'id': ('id', {int}, {str_or_none}), + 'title': ('title', {str}), + 'tags': ('tags', ..., 'name', {str}), + 'categories': ('playlist_item', 'category', {str}, filter, all, filter), + 'description': ('content', {clean_html}, {str}), + 'timestamp': ('date', {lambda x: x.replace('.', '/')}, {unified_timestamp}), + 'thumbnail': ('playlist_item', 'image', {urljoin('https://www.lrt.lt')}), + 'formats': ('playlist_item', 'file', {lambda x: self._extract_m3u8_formats(x, video_id)}), + }) diff --git a/yt_dlp/extractor/manyvids.py b/yt_dlp/extractor/manyvids.py index 8caa8f87fe..1356169bfd 100644 --- a/yt_dlp/extractor/manyvids.py +++ b/yt_dlp/extractor/manyvids.py @@ -1,31 +1,38 @@ -import re - from .common import InfoExtractor from ..utils import ( + clean_html, determine_ext, - extract_attributes, int_or_none, - str_to_int, + join_nonempty, + parse_count, + parse_duration, + parse_iso8601, url_or_none, - urlencode_postdata, ) +from ..utils.traversal import traverse_obj class ManyVidsIE(InfoExtractor): - _WORKING = False _VALID_URL = r'(?i)https?://(?:www\.)?manyvids\.com/video/(?P\d+)' _TESTS = [{ # preview video - 'url': 'https://www.manyvids.com/Video/133957/everthing-about-me/', - 'md5': '03f11bb21c52dd12a05be21a5c7dcc97', + 'url': 'https://www.manyvids.com/Video/530341/mv-tips-tricks', + 'md5': '738dc723f7735ee9602f7ea352a6d058', 'info_dict': { - 'id': '133957', + 'id': '530341-preview', 'ext': 'mp4', - 'title': 'everthing about me (Preview)', - 'uploader': 'ellyxxix', + 'title': 'MV Tips & Tricks (Preview)', + 'description': r're:I will take you on a tour around .{1313}$', + 'thumbnail': r're:https://cdn5\.manyvids\.com/php_uploads/video_images/DestinyDiaz/.+\.jpg', + 'uploader': 'DestinyDiaz', 'view_count': int, 'like_count': int, + 'release_timestamp': 1508419904, + 'tags': ['AdultSchool', 'BBW', 'SFW', 'TeacherFetish'], + 'release_date': '20171019', + 'duration': 3167.0, }, + 'expected_warnings': ['Only extracting preview'], }, { # full video 'url': 'https://www.manyvids.com/Video/935718/MY-FACE-REVEAL/', @@ -34,129 +41,68 @@ class ManyVidsIE(InfoExtractor): 'id': '935718', 'ext': 'mp4', 'title': 'MY FACE REVEAL', - 'description': 'md5:ec5901d41808b3746fed90face161612', + 'description': r're:Today is the day!! I am finally taking off my mask .{445}$', + 'thumbnail': r're:https://ods\.manyvids\.com/1001061960/3aa5397f2a723ec4597e344df66ab845/screenshots/.+\.jpg', 'uploader': 'Sarah Calanthe', 'view_count': int, 'like_count': int, + 'release_date': '20181110', + 'tags': ['EyeContact', 'Interviews', 'MaskFetish', 'MouthFetish', 'Redhead'], + 'release_timestamp': 1541851200, + 'duration': 224.0, }, }] + _API_BASE = 'https://www.manyvids.com/bff/store/video' def _real_extract(self, url): video_id = self._match_id(url) + video_data = self._download_json(f'{self._API_BASE}/{video_id}/private', video_id)['data'] + formats, preview_only = [], True - real_url = f'https://www.manyvids.com/video/{video_id}/gtm.js' - try: - webpage = self._download_webpage(real_url, video_id) - except Exception: - # probably useless fallback - webpage = self._download_webpage(url, video_id) - - info = self._search_regex( - r'''(]*\bid\s*=\s*(['"])pageMetaDetails\2[^>]*>)''', - webpage, 'meta details', default='') - info = extract_attributes(info) - - player = self._search_regex( - r'''(]*\bid\s*=\s*(['"])rmpPlayerStream\2[^>]*>)''', - webpage, 'player details', default='') - player = extract_attributes(player) - - video_urls_and_ids = ( - (info.get('data-meta-video'), 'video'), - (player.get('data-video-transcoded'), 'transcoded'), - (player.get('data-video-filepath'), 'filepath'), - (self._og_search_video_url(webpage, secure=False, default=None), 'og_video'), - ) - - def txt_or_none(s, default=None): - return (s.strip() or default) if isinstance(s, str) else default - - uploader = txt_or_none(info.get('data-meta-author')) - - def mung_title(s): - if uploader: - s = re.sub(rf'^\s*{re.escape(uploader)}\s+[|-]', '', s) - return txt_or_none(s) - - title = ( - mung_title(info.get('data-meta-title')) - or self._html_search_regex( - (r']+class=["\']item-title[^>]+>([^<]+)', - r']+class=["\']h2 m-0["\'][^>]*>([^<]+)'), - webpage, 'title', default=None) - or self._html_search_meta( - 'twitter:title', webpage, 'title', fatal=True)) - - title = re.sub(r'\s*[|-]\s+ManyVids\s*$', '', title) or title - - if any(p in webpage for p in ('preview_videos', '_preview.mp4')): - title += ' (Preview)' - - mv_token = self._search_regex( - r'data-mvtoken=(["\'])(?P(?:(?!\1).)+)\1', webpage, - 'mv token', default=None, group='value') - - if mv_token: - # Sets some cookies - self._download_webpage( - 'https://www.manyvids.com/includes/ajax_repository/you_had_me_at_hello.php', - video_id, note='Setting format cookies', fatal=False, - data=urlencode_postdata({ - 'mvtoken': mv_token, - 'vid': video_id, - }), headers={ - 'Referer': url, - 'X-Requested-With': 'XMLHttpRequest', - }) - - formats = [] - for v_url, fmt in video_urls_and_ids: - v_url = url_or_none(v_url) - if not v_url: + for format_id, path in [ + ('preview', ['teaser', 'filepath']), + ('transcoded', ['transcodedFilepath']), + ('filepath', ['filepath']), + ]: + format_url = traverse_obj(video_data, (*path, {url_or_none})) + if not format_url: continue - if determine_ext(v_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - v_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls')) + if determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats(format_url, video_id, 'mp4', m3u8_id=format_id)) else: formats.append({ - 'url': v_url, - 'format_id': fmt, + 'url': format_url, + 'format_id': format_id, + 'preference': -10 if format_id == 'preview' else None, + 'quality': 10 if format_id == 'filepath' else None, + 'height': int_or_none( + self._search_regex(r'_(\d{2,3}[02468])_', format_url, 'height', default=None)), }) + if format_id != 'preview': + preview_only = False - self._remove_duplicate_formats(formats) + metadata = traverse_obj( + self._download_json(f'{self._API_BASE}/{video_id}', video_id, fatal=False), 'data') + title = traverse_obj(metadata, ('title', {clean_html})) - for f in formats: - if f.get('height') is None: - f['height'] = int_or_none( - self._search_regex(r'_(\d{2,3}[02468])_', f['url'], 'video height', default=None)) - if '/preview/' in f['url']: - f['format_id'] = '_'.join(filter(None, (f.get('format_id'), 'preview'))) - f['preference'] = -10 - if 'transcoded' in f['format_id']: - f['preference'] = f.get('preference', -1) - 1 - - def get_likes(): - likes = self._search_regex( - rf'''(]*\bdata-id\s*=\s*(['"]){video_id}\2[^>]*>)''', - webpage, 'likes', default='') - likes = extract_attributes(likes) - return int_or_none(likes.get('data-likes')) - - def get_views(): - return str_to_int(self._html_search_regex( - r'''(?s)]*\bclass\s*=["']views-wrapper\b[^>]+>.+?]+>\s*(\d[\d,.]*)\s*''', - webpage, 'view count', default=None)) + if preview_only: + title = join_nonempty(title, '(Preview)', delim=' ') + video_id += '-preview' + self.report_warning( + f'Only extracting preview. Video may be paid or subscription only. {self._login_hint()}') return { 'id': video_id, 'title': title, 'formats': formats, - 'description': txt_or_none(info.get('data-meta-description')), - 'uploader': txt_or_none(info.get('data-meta-author')), - 'thumbnail': ( - url_or_none(info.get('data-meta-image')) - or url_or_none(player.get('data-video-screenshot'))), - 'view_count': get_views(), - 'like_count': get_likes(), + **traverse_obj(metadata, { + 'description': ('description', {clean_html}), + 'uploader': ('model', 'displayName', {clean_html}), + 'thumbnail': (('screenshot', 'thumbnail'), {url_or_none}, any), + 'view_count': ('views', {parse_count}), + 'like_count': ('likes', {parse_count}), + 'release_timestamp': ('launchDate', {parse_iso8601}), + 'duration': ('videoDuration', {parse_duration}), + 'tags': ('tagList', ..., 'label', {str}, filter, all, filter), + }), } diff --git a/yt_dlp/extractor/mixcloud.py b/yt_dlp/extractor/mixcloud.py index 19b7fd4e70..852670fcba 100644 --- a/yt_dlp/extractor/mixcloud.py +++ b/yt_dlp/extractor/mixcloud.py @@ -10,7 +10,9 @@ from ..utils import ( parse_iso8601, strip_or_none, try_get, + url_or_none, ) +from ..utils.traversal import traverse_obj class MixcloudBaseIE(InfoExtractor): @@ -37,7 +39,7 @@ class MixcloudIE(MixcloudBaseIE): 'ext': 'm4a', 'title': 'Cryptkeeper', 'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.', - 'uploader': 'Daniel Holbach', + 'uploader': 'dholbach', 'uploader_id': 'dholbach', 'thumbnail': r're:https?://.*\.jpg', 'view_count': int, @@ -46,10 +48,11 @@ class MixcloudIE(MixcloudBaseIE): 'uploader_url': 'https://www.mixcloud.com/dholbach/', 'artist': 'Submorphics & Chino , Telekinesis, Porter Robinson, Enei, Breakage ft Jess Mills', 'duration': 3723, - 'tags': [], + 'tags': ['liquid drum and bass', 'drum and bass'], 'comment_count': int, 'repost_count': int, 'like_count': int, + 'artists': list, }, 'params': {'skip_download': 'm3u8'}, }, { @@ -67,7 +70,7 @@ class MixcloudIE(MixcloudBaseIE): 'upload_date': '20150203', 'uploader_url': 'https://www.mixcloud.com/gillespeterson/', 'duration': 2992, - 'tags': [], + 'tags': ['jazz', 'soul', 'world music', 'funk'], 'comment_count': int, 'repost_count': int, 'like_count': int, @@ -149,8 +152,6 @@ class MixcloudIE(MixcloudBaseIE): elif reason: raise ExtractorError('Track is restricted', expected=True) - title = cloudcast['name'] - stream_info = cloudcast['streamInfo'] formats = [] @@ -182,47 +183,39 @@ class MixcloudIE(MixcloudBaseIE): self.raise_login_required(metadata_available=True) comments = [] - for edge in (try_get(cloudcast, lambda x: x['comments']['edges']) or []): - node = edge.get('node') or {} + for node in traverse_obj(cloudcast, ('comments', 'edges', ..., 'node', {dict})): text = strip_or_none(node.get('comment')) if not text: continue - user = node.get('user') or {} comments.append({ - 'author': user.get('displayName'), - 'author_id': user.get('username'), 'text': text, - 'timestamp': parse_iso8601(node.get('created')), + **traverse_obj(node, { + 'author': ('user', 'displayName', {str}), + 'author_id': ('user', 'username', {str}), + 'timestamp': ('created', {parse_iso8601}), + }), }) - tags = [] - for t in cloudcast.get('tags'): - tag = try_get(t, lambda x: x['tag']['name'], str) - if not tag: - tags.append(tag) - - get_count = lambda x: int_or_none(try_get(cloudcast, lambda y: y[x]['totalCount'])) - - owner = cloudcast.get('owner') or {} - return { 'id': track_id, - 'title': title, 'formats': formats, - 'description': cloudcast.get('description'), - 'thumbnail': try_get(cloudcast, lambda x: x['picture']['url'], str), - 'uploader': owner.get('displayName'), - 'timestamp': parse_iso8601(cloudcast.get('publishDate')), - 'uploader_id': owner.get('username'), - 'uploader_url': owner.get('url'), - 'duration': int_or_none(cloudcast.get('audioLength')), - 'view_count': int_or_none(cloudcast.get('plays')), - 'like_count': get_count('favorites'), - 'repost_count': get_count('reposts'), - 'comment_count': get_count('comments'), 'comments': comments, - 'tags': tags, - 'artist': ', '.join(cloudcast.get('featuringArtistList') or []) or None, + **traverse_obj(cloudcast, { + 'title': ('name', {str}), + 'description': ('description', {str}), + 'thumbnail': ('picture', 'url', {url_or_none}), + 'timestamp': ('publishDate', {parse_iso8601}), + 'duration': ('audioLength', {int_or_none}), + 'uploader': ('owner', 'displayName', {str}), + 'uploader_id': ('owner', 'username', {str}), + 'uploader_url': ('owner', 'url', {url_or_none}), + 'view_count': ('plays', {int_or_none}), + 'like_count': ('favorites', 'totalCount', {int_or_none}), + 'repost_count': ('reposts', 'totalCount', {int_or_none}), + 'comment_count': ('comments', 'totalCount', {int_or_none}), + 'tags': ('tags', ..., 'tag', 'name', {str}, filter, all, filter), + 'artists': ('featuringArtistList', ..., {str}, filter, all, filter), + }), } @@ -295,7 +288,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE): 'url': 'http://www.mixcloud.com/dholbach/', 'info_dict': { 'id': 'dholbach_uploads', - 'title': 'Daniel Holbach (uploads)', + 'title': 'dholbach (uploads)', 'description': 'md5:a3f468a60ac8c3e1f8616380fc469b2b', }, 'playlist_mincount': 36, @@ -303,7 +296,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE): 'url': 'http://www.mixcloud.com/dholbach/uploads/', 'info_dict': { 'id': 'dholbach_uploads', - 'title': 'Daniel Holbach (uploads)', + 'title': 'dholbach (uploads)', 'description': 'md5:a3f468a60ac8c3e1f8616380fc469b2b', }, 'playlist_mincount': 36, @@ -311,7 +304,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE): 'url': 'http://www.mixcloud.com/dholbach/favorites/', 'info_dict': { 'id': 'dholbach_favorites', - 'title': 'Daniel Holbach (favorites)', + 'title': 'dholbach (favorites)', 'description': 'md5:a3f468a60ac8c3e1f8616380fc469b2b', }, # 'params': { @@ -337,7 +330,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE): 'title': 'First Ear (stream)', 'description': 'we maraud for ears', }, - 'playlist_mincount': 269, + 'playlist_mincount': 267, }] _TITLE_KEY = 'displayName' @@ -361,7 +354,7 @@ class MixcloudPlaylistIE(MixcloudPlaylistBaseIE): 'id': 'maxvibes_jazzcat-on-ness-radio', 'title': 'Ness Radio sessions', }, - 'playlist_mincount': 59, + 'playlist_mincount': 58, }] _TITLE_KEY = 'name' _DESCRIPTION_KEY = 'description' diff --git a/yt_dlp/extractor/niconico.py b/yt_dlp/extractor/niconico.py index 62dd0ab9cf..5e66aebeba 100644 --- a/yt_dlp/extractor/niconico.py +++ b/yt_dlp/extractor/niconico.py @@ -27,6 +27,7 @@ from ..utils import ( traverse_obj, try_get, unescapeHTML, + unified_timestamp, update_url_query, url_basename, url_or_none, @@ -985,6 +986,7 @@ class NiconicoLiveIE(InfoExtractor): 'quality': 'abr', 'protocol': 'hls+fmp4', 'latency': latency, + 'accessRightMethod': 'single_cookie', 'chasePlay': False, }, 'room': { @@ -1005,6 +1007,7 @@ class NiconicoLiveIE(InfoExtractor): if data.get('type') == 'stream': m3u8_url = data['data']['uri'] qualities = data['data']['availableQualities'] + cookies = data['data']['cookies'] break elif data.get('type') == 'disconnect': self.write_debug(recv) @@ -1043,6 +1046,11 @@ class NiconicoLiveIE(InfoExtractor): **res, }) + for cookie in cookies: + self._set_cookie( + cookie['domain'], cookie['name'], cookie['value'], + expire_time=unified_timestamp(cookie['expires']), path=cookie['path'], secure=cookie['secure']) + formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True) for fmt, q in zip(formats, reversed(qualities[1:])): fmt.update({ diff --git a/yt_dlp/extractor/panopto.py b/yt_dlp/extractor/panopto.py index 91f1055193..9f307a53e2 100644 --- a/yt_dlp/extractor/panopto.py +++ b/yt_dlp/extractor/panopto.py @@ -14,8 +14,9 @@ from ..utils import ( int_or_none, parse_qs, srt_subtitles_timecode, - traverse_obj, + url_or_none, ) +from ..utils.traversal import traverse_obj class PanoptoBaseIE(InfoExtractor): @@ -345,21 +346,16 @@ class PanoptoIE(PanoptoBaseIE): subtitles = {} for stream in streams or []: stream_formats = [] - http_stream_url = stream.get('StreamHttpUrl') - stream_url = stream.get('StreamUrl') - - if http_stream_url: - stream_formats.append({'url': http_stream_url}) - - if stream_url: + for stream_url in set(traverse_obj(stream, (('StreamHttpUrl', 'StreamUrl'), {url_or_none}))): media_type = stream.get('ViewerMediaFileTypeName') if media_type in ('hls', ): - m3u8_formats, stream_subtitles = self._extract_m3u8_formats_and_subtitles(stream_url, video_id) - stream_formats.extend(m3u8_formats) - subtitles = self._merge_subtitles(subtitles, stream_subtitles) + fmts, subs = self._extract_m3u8_formats_and_subtitles(stream_url, video_id, m3u8_id='hls', fatal=False) + stream_formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) else: stream_formats.append({ 'url': stream_url, + 'ext': media_type, }) for fmt in stream_formats: fmt.update({ diff --git a/yt_dlp/extractor/parti.py b/yt_dlp/extractor/parti.py new file mode 100644 index 0000000000..acadefc4e4 --- /dev/null +++ b/yt_dlp/extractor/parti.py @@ -0,0 +1,101 @@ +from .common import InfoExtractor +from ..utils import UserNotLive, int_or_none, parse_iso8601, url_or_none, urljoin +from ..utils.traversal import traverse_obj + + +class PartiBaseIE(InfoExtractor): + def _call_api(self, path, video_id, note=None): + return self._download_json( + f'https://api-backend.parti.com/parti_v2/profile/{path}', video_id, note) + + +class PartiVideoIE(PartiBaseIE): + IE_NAME = 'parti:video' + _VALID_URL = r'https?://(?:www\.)?parti\.com/video/(?P\d+)' + _TESTS = [{ + 'url': 'https://parti.com/video/66284', + 'info_dict': { + 'id': '66284', + 'ext': 'mp4', + 'title': 'NOW LIVE ', + 'upload_date': '20250327', + 'categories': ['Gaming'], + 'thumbnail': 'https://assets.parti.com/351424_eb9e5250-2821-484a-9c5f-ca99aa666c87.png', + 'channel': 'ItZTMGG', + 'timestamp': 1743044379, + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + data = self._call_api(f'get_livestream_channel_info/recent/{video_id}', video_id) + + return { + 'id': video_id, + 'formats': self._extract_m3u8_formats( + urljoin('https://watch.parti.com', data['livestream_recording']), video_id, 'mp4'), + **traverse_obj(data, { + 'title': ('event_title', {str}), + 'channel': ('user_name', {str}), + 'thumbnail': ('event_file', {url_or_none}), + 'categories': ('category_name', {str}, filter, all), + 'timestamp': ('event_start_ts', {int_or_none}), + }), + } + + +class PartiLivestreamIE(PartiBaseIE): + IE_NAME = 'parti:livestream' + _VALID_URL = r'https?://(?:www\.)?parti\.com/creator/(?P[\w]+)/(?P[\w/-]+)' + _TESTS = [{ + 'url': 'https://parti.com/creator/parti/Capt_Robs_Adventures', + 'info_dict': { + 'id': 'Capt_Robs_Adventures', + 'ext': 'mp4', + 'title': r"re:I'm Live on Parti \d{4}-\d{2}-\d{2} \d{2}:\d{2}", + 'view_count': int, + 'thumbnail': r're:https://assets\.parti\.com/.+\.png', + 'timestamp': 1743879776, + 'upload_date': '20250405', + 'live_status': 'is_live', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://parti.com/creator/discord/sazboxgaming/0', + 'only_matching': True, + }] + + def _real_extract(self, url): + service, creator_slug = self._match_valid_url(url).group('service', 'id') + + encoded_creator_slug = creator_slug.replace('/', '%23') + creator_id = self._call_api( + f'get_user_by_social_media/{service}/{encoded_creator_slug}', + creator_slug, note='Fetching user ID') + + data = self._call_api( + f'get_livestream_channel_info/{creator_id}', creator_id, + note='Fetching user profile feed')['channel_info'] + + if not traverse_obj(data, ('channel', 'is_live', {bool})): + raise UserNotLive(video_id=creator_id) + + channel_info = data['channel'] + + return { + 'id': creator_slug, + 'formats': self._extract_m3u8_formats( + channel_info['playback_url'], creator_slug, live=True, query={ + 'token': channel_info['playback_auth_token'], + 'player_version': '1.17.0', + }), + 'is_live': True, + **traverse_obj(data, { + 'title': ('livestream_event_info', 'event_name', {str}), + 'description': ('livestream_event_info', 'event_description', {str}), + 'thumbnail': ('livestream_event_info', 'livestream_preview_file', {url_or_none}), + 'timestamp': ('stream', 'start_time', {parse_iso8601}), + 'view_count': ('stream', 'viewer_count', {int_or_none}), + }), + } diff --git a/yt_dlp/extractor/rtve.py b/yt_dlp/extractor/rtve.py index 7e0b666ab3..2812d93059 100644 --- a/yt_dlp/extractor/rtve.py +++ b/yt_dlp/extractor/rtve.py @@ -1,35 +1,142 @@ import base64 import io import struct +import urllib.parse from .common import InfoExtractor from ..utils import ( ExtractorError, + clean_html, determine_ext, float_or_none, + make_archive_id, + parse_iso8601, qualities, - remove_end, - remove_start, - try_get, + url_or_none, ) +from ..utils.traversal import subs_list_to_dict, traverse_obj -class RTVEALaCartaIE(InfoExtractor): +class RTVEBaseIE(InfoExtractor): + # Reimplementation of https://js2.rtve.es/pages/app-player/3.5.1/js/pf_video.js + @staticmethod + def _decrypt_url(png): + encrypted_data = io.BytesIO(base64.b64decode(png)[8:]) + while True: + length_data = encrypted_data.read(4) + length = struct.unpack('!I', length_data)[0] + chunk_type = encrypted_data.read(4) + if chunk_type == b'IEND': + break + data = encrypted_data.read(length) + if chunk_type == b'tEXt': + data = bytes(filter(None, data)) + alphabet_data, _, url_data = data.partition(b'#') + quality_str, _, url_data = url_data.rpartition(b'%%') + quality_str = quality_str.decode() or '' + alphabet = RTVEBaseIE._get_alphabet(alphabet_data) + url = RTVEBaseIE._get_url(alphabet, url_data) + yield quality_str, url + encrypted_data.read(4) # CRC + + @staticmethod + def _get_url(alphabet, url_data): + url = '' + f = 0 + e = 3 + b = 1 + for char in url_data.decode('iso-8859-1'): + if f == 0: + l = int(char) * 10 + f = 1 + else: + if e == 0: + l += int(char) + url += alphabet[l] + e = (b + 3) % 4 + f = 0 + b += 1 + else: + e -= 1 + return url + + @staticmethod + def _get_alphabet(alphabet_data): + alphabet = [] + e = 0 + d = 0 + for char in alphabet_data.decode('iso-8859-1'): + if d == 0: + alphabet.append(char) + d = e = (e + 1) % 4 + else: + d -= 1 + return alphabet + + def _extract_png_formats_and_subtitles(self, video_id, media_type='videos'): + formats, subtitles = [], {} + q = qualities(['Media', 'Alta', 'HQ', 'HD_READY', 'HD_FULL']) + for manager in ('rtveplayw', 'default'): + png = self._download_webpage( + f'http://www.rtve.es/ztnr/movil/thumbnail/{manager}/{media_type}/{video_id}.png', + video_id, 'Downloading url information', query={'q': 'v2'}, fatal=False) + if not png: + continue + + for quality, video_url in self._decrypt_url(png): + ext = determine_ext(video_url) + if ext == 'm3u8': + fmts, subs = self._extract_m3u8_formats_and_subtitles( + video_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + elif ext == 'mpd': + fmts, subs = self._extract_mpd_formats_and_subtitles( + video_url, video_id, 'dash', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + else: + formats.append({ + 'format_id': quality, + 'quality': q(quality), + 'url': video_url, + }) + return formats, subtitles + + def _parse_metadata(self, metadata): + return traverse_obj(metadata, { + 'title': ('title', {str.strip}), + 'alt_title': ('alt', {str.strip}), + 'description': ('description', {clean_html}), + 'timestamp': ('dateOfEmission', {parse_iso8601(delimiter=' ')}), + 'release_timestamp': ('publicationDate', {parse_iso8601(delimiter=' ')}), + 'modified_timestamp': ('modificationDate', {parse_iso8601(delimiter=' ')}), + 'thumbnail': (('thumbnail', 'image', 'imageSEO'), {url_or_none}, any), + 'duration': ('duration', {float_or_none(scale=1000)}), + 'is_live': ('live', {bool}), + 'series': (('programTitle', ('programInfo', 'title')), {clean_html}, any), + }) + + +class RTVEALaCartaIE(RTVEBaseIE): IE_NAME = 'rtve.es:alacarta' - IE_DESC = 'RTVE a la carta' - _VALID_URL = r'https?://(?:www\.)?rtve\.es/(m/)?(alacarta/videos|filmoteca)/[^/]+/[^/]+/(?P\d+)' + IE_DESC = 'RTVE a la carta and Play' + _VALID_URL = [ + r'https?://(?:www\.)?rtve\.es/(?:m/)?(?:(?:alacarta|play)/videos|filmoteca)/(?!directo)(?:[^/?#]+/){2}(?P\d+)', + r'https?://(?:www\.)?rtve\.es/infantil/serie/[^/?#]+/video/[^/?#]+/(?P\d+)', + ] _TESTS = [{ - 'url': 'http://www.rtve.es/alacarta/videos/balonmano/o-swiss-cup-masculina-final-espana-suecia/2491869/', - 'md5': '1d49b7e1ca7a7502c56a4bf1b60f1b43', + 'url': 'http://www.rtve.es/alacarta/videos/la-aventura-del-saber/aventuraentornosilla/3088905/', + 'md5': 'a964547824359a5753aef09d79fe984b', 'info_dict': { - 'id': '2491869', + 'id': '3088905', 'ext': 'mp4', - 'title': 'Balonmano - Swiss Cup masculina. Final: España-Suecia', - 'duration': 5024.566, - 'series': 'Balonmano', + 'title': 'En torno a la silla', + 'duration': 1216.981, + 'series': 'La aventura del Saber', + 'thumbnail': 'https://img2.rtve.es/v/aventuraentornosilla_3088905.png', }, - 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'], }, { 'note': 'Live stream', 'url': 'http://www.rtve.es/alacarta/videos/television/24h-live/1694255/', @@ -38,140 +145,88 @@ class RTVEALaCartaIE(InfoExtractor): 'ext': 'mp4', 'title': 're:^24H LIVE [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'is_live': True, + 'live_status': 'is_live', + 'thumbnail': r're:https://img2\.rtve\.es/v/.*\.png', }, 'params': { 'skip_download': 'live stream', }, }, { 'url': 'http://www.rtve.es/alacarta/videos/servir-y-proteger/servir-proteger-capitulo-104/4236788/', - 'md5': 'd850f3c8731ea53952ebab489cf81cbf', + 'md5': 'f3cf0d1902d008c48c793e736706c174', 'info_dict': { 'id': '4236788', 'ext': 'mp4', - 'title': 'Servir y proteger - Capítulo 104', - 'duration': 3222.0, + 'title': 'Episodio 104', + 'duration': 3222.8, + 'thumbnail': r're:https://img2\.rtve\.es/v/.*\.png', + 'series': 'Servir y proteger', }, - 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'], }, { 'url': 'http://www.rtve.es/m/alacarta/videos/cuentame-como-paso/cuentame-como-paso-t16-ultimo-minuto-nuestra-vida-capitulo-276/2969138/?media=tve', 'only_matching': True, }, { 'url': 'http://www.rtve.es/filmoteca/no-do/not-1-introduccion-primer-noticiario-espanol/1465256/', 'only_matching': True, + }, { + 'url': 'https://www.rtve.es/play/videos/saber-vivir/07-07-24/16177116/', + 'md5': 'a5b24fcdfa3ff5cb7908aba53d22d4b6', + 'info_dict': { + 'id': '16177116', + 'ext': 'mp4', + 'title': 'Saber vivir - 07/07/24', + 'thumbnail': r're:https://img2\.rtve\.es/v/.*\.png', + 'duration': 2162.68, + 'series': 'Saber vivir', + }, + }, { + 'url': 'https://www.rtve.es/infantil/serie/agus-lui-churros-crafts/video/gusano/7048976/', + 'info_dict': { + 'id': '7048976', + 'ext': 'mp4', + 'title': 'Gusano', + 'thumbnail': r're:https://img2\.rtve\.es/v/.*\.png', + 'duration': 292.86, + 'series': 'Agus & Lui: Churros y Crafts', + '_old_archive_ids': ['rtveinfantil 7048976'], + }, }] - def _real_initialize(self): - user_agent_b64 = base64.b64encode(self.get_param('http_headers')['User-Agent'].encode()).decode('utf-8') - self._manager = self._download_json( - 'http://www.rtve.es/odin/loki/' + user_agent_b64, - None, 'Fetching manager info')['manager'] - - @staticmethod - def _decrypt_url(png): - encrypted_data = io.BytesIO(base64.b64decode(png)[8:]) - while True: - length = struct.unpack('!I', encrypted_data.read(4))[0] - chunk_type = encrypted_data.read(4) - if chunk_type == b'IEND': - break - data = encrypted_data.read(length) - if chunk_type == b'tEXt': - alphabet_data, text = data.split(b'\0') - quality, url_data = text.split(b'%%') - alphabet = [] - e = 0 - d = 0 - for l in alphabet_data.decode('iso-8859-1'): - if d == 0: - alphabet.append(l) - d = e = (e + 1) % 4 - else: - d -= 1 - url = '' - f = 0 - e = 3 - b = 1 - for letter in url_data.decode('iso-8859-1'): - if f == 0: - l = int(letter) * 10 - f = 1 - else: - if e == 0: - l += int(letter) - url += alphabet[l] - e = (b + 3) % 4 - f = 0 - b += 1 - else: - e -= 1 - - yield quality.decode(), url - encrypted_data.read(4) # CRC - - def _extract_png_formats(self, video_id): - png = self._download_webpage( - f'http://www.rtve.es/ztnr/movil/thumbnail/{self._manager}/videos/{video_id}.png', - video_id, 'Downloading url information', query={'q': 'v2'}) - q = qualities(['Media', 'Alta', 'HQ', 'HD_READY', 'HD_FULL']) - formats = [] - for quality, video_url in self._decrypt_url(png): - ext = determine_ext(video_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - elif ext == 'mpd': - formats.extend(self._extract_mpd_formats( - video_url, video_id, 'dash', fatal=False)) - else: - formats.append({ - 'format_id': quality, - 'quality': q(quality), - 'url': video_url, - }) - return formats + def _get_subtitles(self, video_id): + subtitle_data = self._download_json( + f'https://api2.rtve.es/api/videos/{video_id}/subtitulos.json', video_id, + 'Downloading subtitles info') + return traverse_obj(subtitle_data, ('page', 'items', ..., { + 'id': ('lang', {str}), + 'url': ('src', {url_or_none}), + }, all, {subs_list_to_dict(lang='es')})) def _real_extract(self, url): video_id = self._match_id(url) - info = self._download_json( + metadata = self._download_json( f'http://www.rtve.es/api/videos/{video_id}/config/alacarta_videos.json', video_id)['page']['items'][0] - if info['state'] == 'DESPU': + if metadata['state'] == 'DESPU': raise ExtractorError('The video is no longer available', expected=True) - title = info['title'].strip() - formats = self._extract_png_formats(video_id) + formats, subtitles = self._extract_png_formats_and_subtitles(video_id) - subtitles = None - sbt_file = info.get('sbtFile') - if sbt_file: - subtitles = self.extract_subtitles(video_id, sbt_file) + self._merge_subtitles(self.extract_subtitles(video_id), target=subtitles) - is_live = info.get('live') is True + is_infantil = urllib.parse.urlparse(url).path.startswith('/infantil/') return { 'id': video_id, - 'title': title, 'formats': formats, - 'thumbnail': info.get('image'), 'subtitles': subtitles, - 'duration': float_or_none(info.get('duration'), 1000), - 'is_live': is_live, - 'series': info.get('programTitle'), + **self._parse_metadata(metadata), + '_old_archive_ids': [make_archive_id('rtveinfantil', video_id)] if is_infantil else None, } - def _get_subtitles(self, video_id, sub_file): - subs = self._download_json( - sub_file + '.json', video_id, - 'Downloading subtitles info')['page']['items'] - return dict( - (s['lang'], [{'ext': 'vtt', 'url': s['src']}]) - for s in subs) - -class RTVEAudioIE(RTVEALaCartaIE): # XXX: Do not subclass from concrete IE +class RTVEAudioIE(RTVEBaseIE): IE_NAME = 'rtve.es:audio' IE_DESC = 'RTVE audio' - _VALID_URL = r'https?://(?:www\.)?rtve\.es/(alacarta|play)/audios/[^/]+/[^/]+/(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?rtve\.es/(alacarta|play)/audios/(?:[^/?#]+/){2}(?P\d+)' _TESTS = [{ 'url': 'https://www.rtve.es/alacarta/audios/a-hombros-de-gigantes/palabra-ingeniero-codigos-informaticos-27-04-21/5889192/', @@ -180,9 +235,11 @@ class RTVEAudioIE(RTVEALaCartaIE): # XXX: Do not subclass from concrete IE 'id': '5889192', 'ext': 'mp3', 'title': 'Códigos informáticos', - 'thumbnail': r're:https?://.+/1598856591583.jpg', + 'alt_title': 'Códigos informáticos - Escuchar ahora', 'duration': 349.440, 'series': 'A hombros de gigantes', + 'description': 'md5:72b0d7c1ca20fd327bdfff7ac0171afb', + 'thumbnail': 'https://img2.rtve.es/a/palabra-ingeniero-codigos-informaticos-270421_5889192.png', }, }, { 'url': 'https://www.rtve.es/play/audios/en-radio-3/ignatius-farray/5791165/', @@ -191,9 +248,11 @@ class RTVEAudioIE(RTVEALaCartaIE): # XXX: Do not subclass from concrete IE 'id': '5791165', 'ext': 'mp3', 'title': 'Ignatius Farray', + 'alt_title': 'En Radio 3 - Ignatius Farray - 13/02/21 - escuchar ahora', 'thumbnail': r're:https?://.+/1613243011863.jpg', 'duration': 3559.559, 'series': 'En Radio 3', + 'description': 'md5:124aa60b461e0b1724a380bad3bc4040', }, }, { 'url': 'https://www.rtve.es/play/audios/frankenstein-o-el-moderno-prometeo/capitulo-26-ultimo-muerte-victor-juan-jose-plans-mary-shelley/6082623/', @@ -202,126 +261,101 @@ class RTVEAudioIE(RTVEALaCartaIE): # XXX: Do not subclass from concrete IE 'id': '6082623', 'ext': 'mp3', 'title': 'Capítulo 26 y último: La muerte de Victor', + 'alt_title': 'Frankenstein o el moderno Prometeo - Capítulo 26 y último: La muerte de Victor', 'thumbnail': r're:https?://.+/1632147445707.jpg', 'duration': 3174.086, 'series': 'Frankenstein o el moderno Prometeo', + 'description': 'md5:4ee6fcb82ebe2e46d267e1d1c1a8f7b5', }, }] - def _extract_png_formats(self, audio_id): - """ - This function retrieves media related png thumbnail which obfuscate - valuable information about the media. This information is decrypted - via base class _decrypt_url function providing media quality and - media url - """ - png = self._download_webpage( - f'http://www.rtve.es/ztnr/movil/thumbnail/{self._manager}/audios/{audio_id}.png', - audio_id, 'Downloading url information', query={'q': 'v2'}) - q = qualities(['Media', 'Alta', 'HQ', 'HD_READY', 'HD_FULL']) - formats = [] - for quality, audio_url in self._decrypt_url(png): - ext = determine_ext(audio_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - audio_url, audio_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - elif ext == 'mpd': - formats.extend(self._extract_mpd_formats( - audio_url, audio_id, 'dash', fatal=False)) - else: - formats.append({ - 'format_id': quality, - 'quality': q(quality), - 'url': audio_url, - }) - return formats - def _real_extract(self, url): audio_id = self._match_id(url) - info = self._download_json( - f'https://www.rtve.es/api/audios/{audio_id}.json', - audio_id)['page']['items'][0] + metadata = self._download_json( + f'https://www.rtve.es/api/audios/{audio_id}.json', audio_id)['page']['items'][0] + + formats, subtitles = self._extract_png_formats_and_subtitles(audio_id, media_type='audios') return { 'id': audio_id, - 'title': info['title'].strip(), - 'thumbnail': info.get('thumbnail'), - 'duration': float_or_none(info.get('duration'), 1000), - 'series': try_get(info, lambda x: x['programInfo']['title']), - 'formats': self._extract_png_formats(audio_id), + 'formats': formats, + 'subtitles': subtitles, + **self._parse_metadata(metadata), } -class RTVEInfantilIE(RTVEALaCartaIE): # XXX: Do not subclass from concrete IE - IE_NAME = 'rtve.es:infantil' - IE_DESC = 'RTVE infantil' - _VALID_URL = r'https?://(?:www\.)?rtve\.es/infantil/serie/[^/]+/video/[^/]+/(?P[0-9]+)/' - - _TESTS = [{ - 'url': 'http://www.rtve.es/infantil/serie/cleo/video/maneras-vivir/3040283/', - 'md5': '5747454717aedf9f9fdf212d1bcfc48d', - 'info_dict': { - 'id': '3040283', - 'ext': 'mp4', - 'title': 'Maneras de vivir', - 'thumbnail': r're:https?://.+/1426182947956\.JPG', - 'duration': 357.958, - }, - 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'], - }] - - -class RTVELiveIE(RTVEALaCartaIE): # XXX: Do not subclass from concrete IE +class RTVELiveIE(RTVEBaseIE): IE_NAME = 'rtve.es:live' IE_DESC = 'RTVE.es live streams' - _VALID_URL = r'https?://(?:www\.)?rtve\.es/directo/(?P[a-zA-Z0-9-]+)' + _VALID_URL = [ + r'https?://(?:www\.)?rtve\.es/directo/(?P[a-zA-Z0-9-]+)', + r'https?://(?:www\.)?rtve\.es/play/videos/directo/[^/?#]+/(?P[a-zA-Z0-9-]+)', + ] _TESTS = [{ 'url': 'http://www.rtve.es/directo/la-1/', 'info_dict': { 'id': 'la-1', 'ext': 'mp4', - 'title': 're:^La 1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'live_status': 'is_live', + 'title': str, + 'description': str, + 'thumbnail': r're:https://img\d\.rtve\.es/resources/thumbslive/\d+\.jpg', + 'timestamp': int, + 'upload_date': str, }, - 'params': { - 'skip_download': 'live stream', + 'params': {'skip_download': 'live stream'}, + }, { + 'url': 'https://www.rtve.es/play/videos/directo/deportes/tdp/', + 'info_dict': { + 'id': 'tdp', + 'ext': 'mp4', + 'live_status': 'is_live', + 'title': str, + 'description': str, + 'thumbnail': r're:https://img2\d\.rtve\.es/resources/thumbslive/\d+\.jpg', + 'timestamp': int, + 'upload_date': str, }, + 'params': {'skip_download': 'live stream'}, + }, { + 'url': 'http://www.rtve.es/play/videos/directo/canales-lineales/la-1/', + 'only_matching': True, }] def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('id') - + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = remove_end(self._og_search_title(webpage), ' en directo en RTVE.es') - title = remove_start(title, 'Estoy viendo ') - vidplayer_id = self._search_regex( - (r'playerId=player([0-9]+)', - r'class=["\'].*?\blive_mod\b.*?["\'][^>]+data-assetid=["\'](\d+)', - r'data-id=["\'](\d+)'), - webpage, 'internal video ID') + data_setup = self._search_json( + r']+class="[^"]*videoPlayer[^"]*"[^>]*data-setup=\'', + webpage, 'data_setup', video_id) + + formats, subtitles = self._extract_png_formats_and_subtitles(data_setup['idAsset']) return { 'id': video_id, - 'title': title, - 'formats': self._extract_png_formats(vidplayer_id), + **self._search_json_ld(webpage, video_id, fatal=False), + 'title': self._html_extract_title(webpage), + 'formats': formats, + 'subtitles': subtitles, 'is_live': True, } class RTVETelevisionIE(InfoExtractor): IE_NAME = 'rtve.es:television' - _VALID_URL = r'https?://(?:www\.)?rtve\.es/television/[^/]+/[^/]+/(?P\d+).shtml' + _VALID_URL = r'https?://(?:www\.)?rtve\.es/television/[^/?#]+/[^/?#]+/(?P\d+).shtml' _TEST = { - 'url': 'http://www.rtve.es/television/20160628/revolucion-del-movil/1364141.shtml', + 'url': 'https://www.rtve.es/television/20091103/video-inedito-del-8o-programa/299020.shtml', 'info_dict': { - 'id': '3069778', + 'id': '572515', 'ext': 'mp4', - 'title': 'Documentos TV - La revolución del móvil', - 'duration': 3496.948, + 'title': 'Clase inédita', + 'duration': 335.817, + 'thumbnail': r're:https://img2\.rtve\.es/v/.*\.png', + 'series': 'El coro de la cárcel', }, 'params': { 'skip_download': True, @@ -332,11 +366,8 @@ class RTVETelevisionIE(InfoExtractor): page_id = self._match_id(url) webpage = self._download_webpage(url, page_id) - alacarta_url = self._search_regex( - r'data-location="alacarta_videos"[^<]+url":"(http://www\.rtve\.es/alacarta.+?)&', - webpage, 'alacarta url', default=None) - if alacarta_url is None: - raise ExtractorError( - 'The webpage doesn\'t contain any video', expected=True) + play_url = self._html_search_meta('contentUrl', webpage) + if play_url is None: + raise ExtractorError('The webpage doesn\'t contain any video', expected=True) - return self.url_result(alacarta_url, ie=RTVEALaCartaIE.ie_key()) + return self.url_result(play_url, ie=RTVEALaCartaIE.ie_key()) diff --git a/yt_dlp/extractor/rumble.py b/yt_dlp/extractor/rumble.py index 74c7e4f176..757d6994ca 100644 --- a/yt_dlp/extractor/rumble.py +++ b/yt_dlp/extractor/rumble.py @@ -7,7 +7,6 @@ from ..utils import ( ExtractorError, UnsupportedError, clean_html, - determine_ext, extract_attributes, format_field, get_element_by_class, @@ -36,7 +35,7 @@ class RumbleEmbedIE(InfoExtractor): 'upload_date': '20191020', 'channel_url': 'https://rumble.com/c/WMAR', 'channel': 'WMAR', - 'thumbnail': 'https://sp.rmbl.ws/s8/1/5/M/z/1/5Mz1a.qR4e-small-WMAR-2-News-Latest-Headline.jpg', + 'thumbnail': r're:https://.+\.jpg', 'duration': 234, 'uploader': 'WMAR', 'live_status': 'not_live', @@ -52,7 +51,7 @@ class RumbleEmbedIE(InfoExtractor): 'upload_date': '20220217', 'channel_url': 'https://rumble.com/c/CyberTechNews', 'channel': 'CTNews', - 'thumbnail': 'https://sp.rmbl.ws/s8/6/7/i/9/h/7i9hd.OvCc.jpg', + 'thumbnail': r're:https://.+\.jpg', 'duration': 901, 'uploader': 'CTNews', 'live_status': 'not_live', @@ -114,6 +113,22 @@ class RumbleEmbedIE(InfoExtractor): 'live_status': 'was_live', }, 'params': {'skip_download': True}, + }, { + 'url': 'https://rumble.com/embed/v6pezdb', + 'info_dict': { + 'id': 'v6pezdb', + 'ext': 'mp4', + 'title': '"Es war einmal ein Mädchen" – Ein filmisches Zeitzeugnis aus Leningrad 1944', + 'uploader': 'RT DE', + 'channel': 'RT DE', + 'channel_url': 'https://rumble.com/c/RTDE', + 'duration': 309, + 'thumbnail': 'https://1a-1791.com/video/fww1/dc/s8/1/n/z/2/y/nz2yy.qR4e-small-Es-war-einmal-ein-Mdchen-Ei.jpg', + 'timestamp': 1743703500, + 'upload_date': '20250403', + 'live_status': 'not_live', + }, + 'params': {'skip_download': True}, }, { 'url': 'https://rumble.com/embed/ufe9n.v5pv5f', 'only_matching': True, @@ -168,40 +183,42 @@ class RumbleEmbedIE(InfoExtractor): live_status = None formats = [] - for ext, ext_info in (video.get('ua') or {}).items(): - if isinstance(ext_info, dict): - for height, video_info in ext_info.items(): + for format_type, format_info in (video.get('ua') or {}).items(): + if isinstance(format_info, dict): + for height, video_info in format_info.items(): if not traverse_obj(video_info, ('meta', 'h', {int_or_none})): video_info.setdefault('meta', {})['h'] = height - ext_info = ext_info.values() + format_info = format_info.values() - for video_info in ext_info: + for video_info in format_info: meta = video_info.get('meta') or {} if not video_info.get('url'): continue - if ext == 'hls': + # With default query params returns m3u8 variants which are duplicates, without returns tar files + if format_type == 'tar': + continue + if format_type == 'hls': if meta.get('live') is True and video.get('live') == 1: live_status = 'post_live' formats.extend(self._extract_m3u8_formats( video_info['url'], video_id, ext='mp4', m3u8_id='hls', fatal=False, live=live_status == 'is_live')) continue - timeline = ext == 'timeline' - if timeline: - ext = determine_ext(video_info['url']) + is_timeline = format_type == 'timeline' + is_audio = format_type == 'audio' formats.append({ - 'ext': ext, - 'acodec': 'none' if timeline else None, + 'acodec': 'none' if is_timeline else None, + 'vcodec': 'none' if is_audio else None, 'url': video_info['url'], - 'format_id': join_nonempty(ext, format_field(meta, 'h', '%sp')), - 'format_note': 'Timeline' if timeline else None, - 'fps': None if timeline else video.get('fps'), + 'format_id': join_nonempty(format_type, format_field(meta, 'h', '%sp')), + 'format_note': 'Timeline' if is_timeline else None, + 'fps': None if is_timeline or is_audio else video.get('fps'), **traverse_obj(meta, { - 'tbr': 'bitrate', - 'filesize': 'size', - 'width': 'w', - 'height': 'h', - }, expected_type=lambda x: int(x) or None), + 'tbr': ('bitrate', {int_or_none}), + 'filesize': ('size', {int_or_none}), + 'width': ('w', {int_or_none}), + 'height': ('h', {int_or_none}), + }), }) subtitles = { diff --git a/yt_dlp/extractor/tvp.py b/yt_dlp/extractor/tvp.py index da3082907e..416cbab3c5 100644 --- a/yt_dlp/extractor/tvp.py +++ b/yt_dlp/extractor/tvp.py @@ -513,7 +513,7 @@ class TVPVODBaseIE(InfoExtractor): class TVPVODVideoIE(TVPVODBaseIE): IE_NAME = 'tvp:vod' - _VALID_URL = r'https?://vod\.tvp\.pl/(?P[a-z\d-]+,\d+)/[a-z\d-]+(?\d+)/?(?:[?#]|$)' + _VALID_URL = r'https?://vod\.tvp\.pl/(?P[a-z\d-]+,\d+)/[a-z\d-]+(?\d+)/?(?:[?#]|$)' _TESTS = [{ 'url': 'https://vod.tvp.pl/dla-dzieci,24/laboratorium-alchemika-odcinki,309338/odcinek-24,S01E24,311357', @@ -568,6 +568,9 @@ class TVPVODVideoIE(TVPVODBaseIE): 'live_status': 'is_live', 'thumbnail': 're:https?://.+', }, + }, { + 'url': 'https://vod.tvp.pl/informacje-i-publicystyka,205/konskie-2025-debata-przedwyborcza-odcinki,2028435/odcinek--1,S01E-1,2028419', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/tvw.py b/yt_dlp/extractor/tvw.py index 1c060cd7a0..0ab926dbdd 100644 --- a/yt_dlp/extractor/tvw.py +++ b/yt_dlp/extractor/tvw.py @@ -1,13 +1,21 @@ import json from .common import InfoExtractor -from ..utils import clean_html, remove_end, unified_timestamp, url_or_none -from ..utils.traversal import traverse_obj +from ..utils import ( + clean_html, + extract_attributes, + parse_qs, + remove_end, + require, + unified_timestamp, + url_or_none, +) +from ..utils.traversal import find_element, traverse_obj class TvwIE(InfoExtractor): + IE_NAME = 'tvw' _VALID_URL = r'https?://(?:www\.)?tvw\.org/video/(?P[^/?#]+)' - _TESTS = [{ 'url': 'https://tvw.org/video/billy-frank-jr-statue-maquette-unveiling-ceremony-2024011211/', 'md5': '9ceb94fe2bb7fd726f74f16356825703', @@ -115,3 +123,43 @@ class TvwIE(InfoExtractor): 'is_live': ('eventStatus', {lambda x: x == 'live'}), }), } + + +class TvwTvChannelsIE(InfoExtractor): + IE_NAME = 'tvw:tvchannels' + _VALID_URL = r'https?://(?:www\.)?tvw\.org/tvchannels/(?P[^/?#]+)' + _TESTS = [{ + 'url': 'https://tvw.org/tvchannels/air/', + 'info_dict': { + 'id': 'air', + 'ext': 'mp4', + 'title': r're:TVW Cable Channel Live Stream', + 'thumbnail': r're:https?://.+/.+\.(?:jpe?g|png)$', + 'live_status': 'is_live', + }, + }, { + 'url': 'https://tvw.org/tvchannels/tvw2/', + 'info_dict': { + 'id': 'tvw2', + 'ext': 'mp4', + 'title': r're:TVW-2 Broadcast Channel', + 'thumbnail': r're:https?://.+/.+\.(?:jpe?g|png)$', + 'live_status': 'is_live', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + m3u8_url = traverse_obj(webpage, ( + {find_element(id='invintus-persistent-stream-frame', html=True)}, {extract_attributes}, + 'src', {parse_qs}, 'encoder', 0, {json.loads}, 'live247URI', {url_or_none}, {require('stream url')})) + + return { + 'id': video_id, + 'formats': self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', m3u8_id='hls', live=True), + 'title': remove_end(self._og_search_title(webpage, default=None), ' - TVW'), + 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'is_live': True, + } diff --git a/yt_dlp/extractor/twitcasting.py b/yt_dlp/extractor/twitcasting.py index bf9c6348cb..0a7f95c21a 100644 --- a/yt_dlp/extractor/twitcasting.py +++ b/yt_dlp/extractor/twitcasting.py @@ -14,12 +14,13 @@ from ..utils import ( parse_duration, qualities, str_to_int, - traverse_obj, try_get, unified_timestamp, + url_or_none, urlencode_postdata, urljoin, ) +from ..utils.traversal import traverse_obj class TwitCastingIE(InfoExtractor): @@ -138,13 +139,7 @@ class TwitCastingIE(InfoExtractor): r'data-toggle="true"[^>]+datetime="([^"]+)"', webpage, 'datetime', None)) - stream_server_data = self._download_json( - f'https://twitcasting.tv/streamserver.php?target={uploader_id}&mode=client', video_id, - 'Downloading live info', fatal=False) - is_live = any(f'data-{x}' in webpage for x in ['is-onlive="true"', 'live-type="live"', 'status="online"']) - if not traverse_obj(stream_server_data, 'llfmp4') and is_live: - self.raise_login_required(method='cookies') base_dict = { 'title': title, @@ -165,28 +160,37 @@ class TwitCastingIE(InfoExtractor): return [data_movie_url] m3u8_urls = (try_get(webpage, find_dmu, list) - or traverse_obj(video_js_data, (..., 'source', 'url')) - or ([f'https://twitcasting.tv/{uploader_id}/metastream.m3u8'] if is_live else None)) - if not m3u8_urls: - raise ExtractorError('Failed to get m3u8 playlist') + or traverse_obj(video_js_data, (..., 'source', 'url'))) if is_live: - m3u8_url = m3u8_urls[0] - formats = self._extract_m3u8_formats( - m3u8_url, video_id, ext='mp4', m3u8_id='hls', - live=True, headers=self._M3U8_HEADERS) + stream_data = self._download_json( + 'https://twitcasting.tv/streamserver.php', + video_id, 'Downloading live info', query={ + 'target': uploader_id, + 'mode': 'client', + 'player': 'pc_web', + }) - if traverse_obj(stream_server_data, ('hls', 'source')): - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, ext='mp4', m3u8_id='source', - live=True, query={'mode': 'source'}, - note='Downloading source quality m3u8', - headers=self._M3U8_HEADERS, fatal=False)) + formats = [] + # low: 640x360, medium: 1280x720, high: 1920x1080 + qq = qualities(['low', 'medium', 'high']) + for quality, m3u8_url in traverse_obj(stream_data, ( + 'tc-hls', 'streams', {dict.items}, lambda _, v: url_or_none(v[1]), + )): + formats.append({ + 'url': m3u8_url, + 'format_id': f'hls-{quality}', + 'ext': 'mp4', + 'quality': qq(quality), + 'protocol': 'm3u8', + 'http_headers': self._M3U8_HEADERS, + }) if websockets: qq = qualities(['base', 'mobilesource', 'main']) - streams = traverse_obj(stream_server_data, ('llfmp4', 'streams')) or {} - for mode, ws_url in streams.items(): + for mode, ws_url in traverse_obj(stream_data, ( + 'llfmp4', 'streams', {dict.items}, lambda _, v: url_or_none(v[1]), + )): formats.append({ 'url': ws_url, 'format_id': f'ws-{mode}', @@ -197,10 +201,15 @@ class TwitCastingIE(InfoExtractor): 'protocol': 'websocket_frag', }) + if not formats: + self.raise_login_required() + infodict = { 'formats': formats, '_format_sort_fields': ('source', ), } + elif not m3u8_urls: + raise ExtractorError('Failed to get m3u8 playlist') elif len(m3u8_urls) == 1: formats = self._extract_m3u8_formats( m3u8_urls[0], video_id, 'mp4', headers=self._M3U8_HEADERS) diff --git a/yt_dlp/extractor/vimeo.py b/yt_dlp/extractor/vimeo.py index 8a0aaaa464..62b8db3829 100644 --- a/yt_dlp/extractor/vimeo.py +++ b/yt_dlp/extractor/vimeo.py @@ -244,9 +244,13 @@ class VimeoBaseInfoExtractor(InfoExtractor): join_nonempty(f'https://api.vimeo.com/videos/{video_id}', unlisted_hash, delim=':'), video_id, 'Downloading API JSON', headers={ 'Authorization': f'jwt {jwt_token}', - 'Accept': 'application/json', + 'Accept': 'application/vnd.vimeo.*+json;version=3.4.10', }, query={ + # TODO: Reverse-engineer generating the 'anon_signature' param + # Ref: https://f.vimeocdn.com/js_opt/app/vimeo-next/_next/static/chunks/60908-af70235e46909bce.js + 'outro': 'beginning', # Needed to avoid https://github.com/yt-dlp/yt-dlp/issues/12974 'fields': ','.join(( + # 'embed_player_config_url' is a viable alternative to 'config_url' 'config_url', 'created_time', 'description', 'download', 'license', 'metadata.connections.comments.total', 'metadata.connections.likes.total', 'release_time', 'stats.plays')), @@ -410,6 +414,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'duration': 10, 'comment_count': int, 'like_count': int, + 'view_count': int, 'thumbnail': 'https://i.vimeocdn.com/video/440665496-b2c5aee2b61089442c794f64113a8e8f7d5763c3e6b3ebfaf696ae6413f8b1f4-d', }, 'params': { @@ -500,15 +505,16 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader': 'The DMCI', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/dmci', 'uploader_id': 'dmci', - 'timestamp': 1324343742, + 'timestamp': 1324361742, 'upload_date': '20111220', - 'description': 'md5:ae23671e82d05415868f7ad1aec21147', + 'description': 'md5:f37b4ad0f3ded6fa16f38ecde16c3c44', 'duration': 60, 'comment_count': int, 'view_count': int, 'thumbnail': 'https://i.vimeocdn.com/video/231174622-dd07f015e9221ff529d451e1cc31c982b5d87bfafa48c4189b1da72824ee289a-d', 'like_count': int, - 'tags': 'count:11', + 'release_timestamp': 1324361742, + 'release_date': '20111220', }, # 'params': {'format': 'Original'}, 'expected_warnings': ['Failed to parse XML: not well-formed'], @@ -521,15 +527,18 @@ class VimeoIE(VimeoBaseInfoExtractor): 'id': '393756517', # 'ext': 'mov', 'ext': 'mp4', - 'timestamp': 1582642091, + 'timestamp': 1582660091, 'uploader_id': 'frameworkla', 'title': 'Straight To Hell - Sabrina: Netflix', 'uploader': 'Framework Studio', - 'description': 'md5:f2edc61af3ea7a5592681ddbb683db73', 'upload_date': '20200225', 'duration': 176, 'thumbnail': 'https://i.vimeocdn.com/video/859377297-836494a4ef775e9d4edbace83937d9ad34dc846c688c0c419c0e87f7ab06c4b3-d', 'uploader_url': 'https://vimeo.com/frameworkla', + 'comment_count': int, + 'like_count': int, + 'release_timestamp': 1582660091, + 'release_date': '20200225', }, # 'params': {'format': 'source'}, 'expected_warnings': ['Failed to parse XML: not well-formed'], @@ -630,7 +639,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'description': str, # FIXME: Dynamic SEO spam description 'upload_date': '20150209', 'timestamp': 1423518307, - 'thumbnail': 'https://i.vimeocdn.com/video/default', + 'thumbnail': r're:https://i\.vimeocdn\.com/video/default', 'duration': 10, 'like_count': int, 'uploader_url': 'https://vimeo.com/user20132939', @@ -667,6 +676,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'like_count': int, 'uploader_url': 'https://vimeo.com/aliniamedia', 'release_date': '20160329', + 'view_count': int, }, 'params': {'skip_download': True}, 'expected_warnings': ['Failed to parse XML: not well-formed'], @@ -678,18 +688,19 @@ class VimeoIE(VimeoBaseInfoExtractor): # 'ext': 'm4v', 'ext': 'mp4', 'title': 'Eastnor Castle 2015 Firework Champions - The Promo!', - 'description': 'md5:5967e090768a831488f6e74b7821b3c1', + 'description': 'md5:9441e6829ae94f380cc6417d982f63ac', 'uploader_id': 'fireworkchampions', 'uploader': 'Firework Champions', 'upload_date': '20150910', - 'timestamp': 1441901895, + 'timestamp': 1441916295, 'thumbnail': 'https://i.vimeocdn.com/video/534715882-6ff8e4660cbf2fea68282876d8d44f318825dfe572cc4016e73b3266eac8ae3a-d', 'uploader_url': 'https://vimeo.com/fireworkchampions', - 'tags': 'count:6', 'duration': 229, 'view_count': int, 'like_count': int, 'comment_count': int, + 'release_timestamp': 1441916295, + 'release_date': '20150910', }, 'params': { 'skip_download': True, @@ -820,7 +831,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader': 'Raja Virdi', 'uploader_id': 'rajavirdi', 'uploader_url': 'https://vimeo.com/rajavirdi', - 'duration': 309, + 'duration': 300, 'thumbnail': r're:https://i\.vimeocdn\.com/video/1716727772-[\da-f]+-d', }, # 'params': {'format': 'source'}, @@ -1122,7 +1133,7 @@ class VimeoOndemandIE(VimeoIE): # XXX: Do not subclass from concrete IE 'description': 'md5:aeeba3dbd4d04b0fa98a4fdc9c639998', 'upload_date': '20140906', 'timestamp': 1410032453, - 'thumbnail': 'https://i.vimeocdn.com/video/488238335-d7bf151c364cff8d467f1b73784668fe60aae28a54573a35d53a1210ae283bd8-d_1280', + 'thumbnail': r're:https://i\.vimeocdn\.com/video/\d+-[\da-f]+-d', 'comment_count': int, 'license': 'https://creativecommons.org/licenses/by-nc-nd/3.0/', 'duration': 53, @@ -1132,7 +1143,7 @@ class VimeoOndemandIE(VimeoIE): # XXX: Do not subclass from concrete IE 'params': { 'format': 'best[protocol=https]', }, - 'expected_warnings': ['Unable to download JSON metadata'], + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { # requires Referer to be passed along with og:video:url 'url': 'https://vimeo.com/ondemand/36938/126682985', @@ -1149,13 +1160,14 @@ class VimeoOndemandIE(VimeoIE): # XXX: Do not subclass from concrete IE 'duration': 121, 'comment_count': int, 'view_count': int, - 'thumbnail': 'https://i.vimeocdn.com/video/517077723-7066ae1d9a79d3eb361334fb5d58ec13c8f04b52f8dd5eadfbd6fb0bcf11f613-d_1280', + 'thumbnail': r're:https://i\.vimeocdn\.com/video/\d+-[\da-f]+-d', 'like_count': int, + 'tags': 'count:5', }, 'params': { 'skip_download': True, }, - 'expected_warnings': ['Unable to download JSON metadata'], + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { 'url': 'https://vimeo.com/ondemand/nazmaalik', 'only_matching': True, @@ -1237,7 +1249,7 @@ class VimeoUserIE(VimeoChannelIE): # XXX: Do not subclass from concrete IE _TESTS = [{ 'url': 'https://vimeo.com/nkistudio/videos', 'info_dict': { - 'title': 'Nki', + 'title': 'AKAMA', 'id': 'nkistudio', }, 'playlist_mincount': 66, @@ -1370,10 +1382,10 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): 'uploader_id': 'user170863801', 'uploader_url': 'https://vimeo.com/user170863801', 'duration': 30, - 'thumbnail': 'https://i.vimeocdn.com/video/1912612821-09a43bd2e75c203d503aed89de7534f28fc4474a48f59c51999716931a246af5-d_1280', + 'thumbnail': r're:https://i\.vimeocdn\.com/video/\d+-[\da-f]+-d', }, 'params': {'skip_download': 'm3u8'}, - 'expected_warnings': ['Failed to parse XML'], + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { 'url': 'https://vimeo.com/user21297594/review/75524534/3c257a1b5d', 'md5': 'c507a72f780cacc12b2248bb4006d253', @@ -1528,20 +1540,22 @@ class VimeoProIE(VimeoBaseInfoExtractor): 'uploader_id': 'openstreetmapus', 'uploader': 'OpenStreetMap US', 'title': 'Andy Allan - Putting the Carto into OpenStreetMap Cartography', - 'description': 'md5:2c362968038d4499f4d79f88458590c1', + 'description': 'md5:8cf69a1a435f2d763f4adf601e9c3125', 'duration': 1595, 'upload_date': '20130610', - 'timestamp': 1370893156, + 'timestamp': 1370907556, 'license': 'by', - 'thumbnail': 'https://i.vimeocdn.com/video/440260469-19b0d92fca3bd84066623b53f1eb8aaa3980c6c809e2d67b6b39ab7b4a77a344-d_960', + 'thumbnail': r're:https://i\.vimeocdn\.com/video/\d+-[\da-f]+-d', 'view_count': int, 'comment_count': int, 'like_count': int, - 'tags': 'count:1', + 'release_timestamp': 1370907556, + 'release_date': '20130610', }, 'params': { 'format': 'best[protocol=https]', }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { # password-protected VimeoPro page with Vimeo player embed 'url': 'https://vimeopro.com/cadfem/simulation-conference-mechanische-systeme-in-perfektion', @@ -1549,7 +1563,7 @@ class VimeoProIE(VimeoBaseInfoExtractor): 'id': '764543723', 'ext': 'mp4', 'title': 'Mechanische Systeme in Perfektion: Realität erfassen, Innovation treiben', - 'thumbnail': 'https://i.vimeocdn.com/video/1543784598-a1a750494a485e601110136b9fe11e28c2131942452b3a5d30391cb3800ca8fd-d_1280', + 'thumbnail': r're:https://i\.vimeocdn\.com/video/\d+-[\da-f]+-d', 'description': 'md5:2a9d195cd1b0f6f79827107dc88c2420', 'uploader': 'CADFEM', 'uploader_id': 'cadfem', @@ -1561,6 +1575,7 @@ class VimeoProIE(VimeoBaseInfoExtractor): 'videopassword': 'Conference2022', 'skip_download': True, }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py old mode 100644 new mode 100755 index ef67c210bc..c269802b37 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -300,6 +300,24 @@ class VKIE(VKBaseIE): 'upload_date': '20250130', }, }, + { + 'url': 'https://vkvideo.ru/video-50883936_456244102', + 'info_dict': { + 'id': '-50883936_456244102', + 'ext': 'mp4', + 'title': 'Добивание Украины // Техник в коме // МОЯ ЗЛОСТЬ №140', + 'description': 'md5:a9bc46181e9ebd0fdd82cef6c0191140', + 'uploader': 'Стас Ай, Как Просто!', + 'uploader_id': '-50883936', + 'comment_count': int, + 'like_count': int, + 'duration': 4651, + 'thumbnail': r're:https?://.+\.jpg', + 'chapters': 'count:59', + 'timestamp': 1743333869, + 'upload_date': '20250330', + }, + }, { # live stream, hls and rtmp links, most likely already finished live # stream by the time you are reading this comment @@ -540,7 +558,7 @@ class VKIE(VKBaseIE): 'title': ('md_title', {unescapeHTML}), 'description': ('description', {clean_html}, filter), 'thumbnail': ('jpg', {url_or_none}), - 'uploader': ('md_author', {str}), + 'uploader': ('md_author', {unescapeHTML}), 'uploader_id': (('author_id', 'authorId'), {str_or_none}, any), 'duration': ('duration', {int_or_none}), 'chapters': ('time_codes', lambda _, v: isinstance(v['time'], int), { diff --git a/yt_dlp/extractor/yandexvideo.py b/yt_dlp/extractor/yandexvideo.py index cdd32c5e4e..09e7c98785 100644 --- a/yt_dlp/extractor/yandexvideo.py +++ b/yt_dlp/extractor/yandexvideo.py @@ -2,15 +2,17 @@ import itertools from .common import InfoExtractor from ..utils import ( + bug_reports_message, determine_ext, - extract_attributes, int_or_none, lowercase_escape, parse_qs, - traverse_obj, + qualities, try_get, + update_url_query, url_or_none, ) +from ..utils.traversal import traverse_obj class YandexVideoIE(InfoExtractor): @@ -186,7 +188,22 @@ class YandexVideoPreviewIE(InfoExtractor): return self.url_result(data_json['video']['url']) -class ZenYandexIE(InfoExtractor): +class ZenYandexBaseIE(InfoExtractor): + def _fetch_ssr_data(self, url, video_id): + webpage = self._download_webpage(url, video_id) + redirect = self._search_json( + r'(?:var|let|const)\s+it\s*=', webpage, 'redirect', video_id, default={}).get('retpath') + if redirect: + video_id = self._match_id(redirect) + webpage = self._download_webpage(redirect, video_id, note='Redirecting') + return video_id, self._search_json( + r'(?:var|let|const)\s+_params\s*=\s*\(', webpage, 'metadata', video_id, + contains_pattern=r'{["\']ssrData.+}')['ssrData'] + + +class ZenYandexIE(ZenYandexBaseIE): + IE_NAME = 'dzen.ru' + IE_DESC = 'Дзен (dzen) formerly Яндекс.Дзен (Yandex Zen)' _VALID_URL = r'https?://(zen\.yandex|dzen)\.ru(?:/video)?/(media|watch)/(?:(?:id/[^/]+/|[^/]+/)(?:[a-z0-9-]+)-)?(?P[a-z0-9-]+)' _TESTS = [{ 'url': 'https://zen.yandex.ru/media/id/606fd806cc13cb3c58c05cf5/vot-eto-focus-dedy-morozy-na-gidrociklah-60c7c443da18892ebfe85ed7', @@ -216,6 +233,7 @@ class ZenYandexIE(InfoExtractor): 'timestamp': 1573465585, }, 'params': {'skip_download': 'm3u8'}, + 'skip': 'The page does not exist', }, { 'url': 'https://zen.yandex.ru/video/watch/6002240ff8b1af50bb2da5e3', 'info_dict': { @@ -227,6 +245,9 @@ class ZenYandexIE(InfoExtractor): 'uploader': 'TechInsider', 'timestamp': 1611378221, 'upload_date': '20210123', + 'view_count': int, + 'duration': 243, + 'tags': ['опыт', 'эксперимент', 'огонь'], }, 'params': {'skip_download': 'm3u8'}, }, { @@ -240,6 +261,9 @@ class ZenYandexIE(InfoExtractor): 'uploader': 'TechInsider', 'upload_date': '20210123', 'timestamp': 1611378221, + 'view_count': int, + 'duration': 243, + 'tags': ['опыт', 'эксперимент', 'огонь'], }, 'params': {'skip_download': 'm3u8'}, }, { @@ -252,44 +276,56 @@ class ZenYandexIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - redirect = self._search_json(r'var it\s*=', webpage, 'redirect', id, default={}).get('retpath') - if redirect: - video_id = self._match_id(redirect) - webpage = self._download_webpage(redirect, video_id, note='Redirecting') - data_json = self._search_json( - r'("data"\s*:|data\s*=)', webpage, 'metadata', video_id, contains_pattern=r'{["\']_*serverState_*video.+}') - serverstate = self._search_regex(r'(_+serverState_+video-site_[^_]+_+)', webpage, 'server state') - uploader = self._search_regex(r'(]+>)', - webpage, 'uploader', default='') - uploader_name = extract_attributes(uploader).get('aria-label') - item_id = traverse_obj(data_json, (serverstate, 'videoViewer', 'openedItemId', {str})) - video_json = traverse_obj(data_json, (serverstate, 'videoViewer', 'items', item_id, {dict})) or {} + video_id, ssr_data = self._fetch_ssr_data(url, video_id) + video_data = ssr_data['videoMetaResponse'] formats, subtitles = [], {} - for s_url in traverse_obj(video_json, ('video', 'streams', ..., {url_or_none})): + quality = qualities(('4', '0', '1', '2', '3', '5', '6', '7')) + # Deduplicate stream URLs. The "dzen_dash" query parameter is present in some URLs but can be omitted + stream_urls = set(traverse_obj(video_data, ( + 'video', ('id', ('streams', ...), ('mp4Streams', ..., 'url'), ('oneVideoStreams', ..., 'url')), + {url_or_none}, {update_url_query(query={'dzen_dash': []})}))) + for s_url in stream_urls: ext = determine_ext(s_url) - if ext == 'mpd': - fmts, subs = self._extract_mpd_formats_and_subtitles(s_url, video_id, mpd_id='dash') - elif ext == 'm3u8': - fmts, subs = self._extract_m3u8_formats_and_subtitles(s_url, video_id, 'mp4') + content_type = traverse_obj(parse_qs(s_url), ('ct', 0)) + if ext == 'mpd' or content_type == '6': + fmts, subs = self._extract_mpd_formats_and_subtitles(s_url, video_id, mpd_id='dash', fatal=False) + elif ext == 'm3u8' or content_type == '8': + fmts, subs = self._extract_m3u8_formats_and_subtitles(s_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + elif content_type == '0': + format_type = traverse_obj(parse_qs(s_url), ('type', 0)) + formats.append({ + 'url': s_url, + 'format_id': format_type, + 'ext': 'mp4', + 'quality': quality(format_type), + }) + continue + else: + self.report_warning(f'Unsupported stream URL: {s_url}{bug_reports_message()}') + continue formats.extend(fmts) - subtitles = self._merge_subtitles(subtitles, subs) + self._merge_subtitles(subs, target=subtitles) + return { 'id': video_id, - 'title': video_json.get('title') or self._og_search_title(webpage), 'formats': formats, 'subtitles': subtitles, - 'duration': int_or_none(video_json.get('duration')), - 'view_count': int_or_none(video_json.get('views')), - 'timestamp': int_or_none(video_json.get('publicationDate')), - 'uploader': uploader_name or data_json.get('authorName') or try_get(data_json, lambda x: x['publisher']['name']), - 'description': video_json.get('description') or self._og_search_description(webpage), - 'thumbnail': self._og_search_thumbnail(webpage) or try_get(data_json, lambda x: x['og']['imageUrl']), + **traverse_obj(video_data, { + 'title': ('title', {str}), + 'description': ('description', {str}), + 'thumbnail': ('image', {url_or_none}), + 'duration': ('video', 'duration', {int_or_none}), + 'view_count': ('video', 'views', {int_or_none}), + 'timestamp': ('publicationDate', {int_or_none}), + 'tags': ('tags', ..., {str}), + 'uploader': ('source', 'title', {str}), + }), } -class ZenYandexChannelIE(InfoExtractor): +class ZenYandexChannelIE(ZenYandexBaseIE): + IE_NAME = 'dzen.ru:channel' _VALID_URL = r'https?://(zen\.yandex|dzen)\.ru/(?!media|video)(?:id/)?(?P[a-z0-9-_]+)' _TESTS = [{ 'url': 'https://zen.yandex.ru/tok_media', @@ -323,8 +359,8 @@ class ZenYandexChannelIE(InfoExtractor): 'url': 'https://zen.yandex.ru/jony_me', 'info_dict': { 'id': 'jony_me', - 'description': 'md5:ce0a5cad2752ab58701b5497835b2cc5', - 'title': 'JONY ', + 'description': 'md5:7c30d11dc005faba8826feae99da3113', + 'title': 'JONY', }, 'playlist_count': 18, }, { @@ -333,9 +369,8 @@ class ZenYandexChannelIE(InfoExtractor): 'url': 'https://zen.yandex.ru/tatyanareva', 'info_dict': { 'id': 'tatyanareva', - 'description': 'md5:40a1e51f174369ec3ba9d657734ac31f', + 'description': 'md5:92e56fa730a932ca2483ba5c2186ad96', 'title': 'Татьяна Рева', - 'entries': 'maxcount:200', }, 'playlist_mincount': 46, }, { @@ -348,43 +383,31 @@ class ZenYandexChannelIE(InfoExtractor): 'playlist_mincount': 657, }] - def _entries(self, item_id, server_state_json, server_settings_json): - items = (traverse_obj(server_state_json, ('feed', 'items', ...)) - or traverse_obj(server_settings_json, ('exportData', 'items', ...))) - - more = (traverse_obj(server_state_json, ('links', 'more')) - or traverse_obj(server_settings_json, ('exportData', 'more', 'link'))) - + def _entries(self, feed_data, channel_id): next_page_id = None for page in itertools.count(1): - for item in items or []: - if item.get('type') != 'gif': - continue - video_id = traverse_obj(item, 'publication_id', 'publicationId') or '' - yield self.url_result(item['link'], ZenYandexIE, video_id.split(':')[-1]) + for item in traverse_obj(feed_data, ( + (None, ('items', lambda _, v: v['tab'] in ('shorts', 'longs'))), + 'items', lambda _, v: url_or_none(v['link']), + )): + yield self.url_result(item['link'], ZenYandexIE, item.get('id'), title=item.get('title')) + more = traverse_obj(feed_data, ('more', 'link', {url_or_none})) current_page_id = next_page_id next_page_id = traverse_obj(parse_qs(more), ('next_page_id', -1)) - if not all((more, items, next_page_id, next_page_id != current_page_id)): + if not all((more, next_page_id, next_page_id != current_page_id)): break - data = self._download_json(more, item_id, note=f'Downloading Page {page}') - items, more = data.get('items'), traverse_obj(data, ('more', 'link')) + feed_data = self._download_json(more, channel_id, note=f'Downloading Page {page}') def _real_extract(self, url): - item_id = self._match_id(url) - webpage = self._download_webpage(url, item_id) - redirect = self._search_json( - r'var it\s*=', webpage, 'redirect', item_id, default={}).get('retpath') - if redirect: - item_id = self._match_id(redirect) - webpage = self._download_webpage(redirect, item_id, note='Redirecting') - data = self._search_json( - r'("data"\s*:|data\s*=)', webpage, 'channel data', item_id, contains_pattern=r'{\"__serverState__.+}') - server_state_json = traverse_obj(data, lambda k, _: k.startswith('__serverState__'), get_all=False) - server_settings_json = traverse_obj(data, lambda k, _: k.startswith('__serverSettings__'), get_all=False) + channel_id = self._match_id(url) + channel_id, ssr_data = self._fetch_ssr_data(url, channel_id) + channel_data = ssr_data['exportResponse'] return self.playlist_result( - self._entries(item_id, server_state_json, server_settings_json), - item_id, traverse_obj(server_state_json, ('channel', 'source', 'title')), - traverse_obj(server_state_json, ('channel', 'source', 'description'))) + self._entries(channel_data['feedData'], channel_id), + channel_id, **traverse_obj(channel_data, ('channel', 'source', { + 'title': ('title', {str}), + 'description': ('description', {str}), + }))) diff --git a/yt_dlp/extractor/youtube/_tab.py b/yt_dlp/extractor/youtube/_tab.py index 122300e600..c018ee8cfb 100644 --- a/yt_dlp/extractor/youtube/_tab.py +++ b/yt_dlp/extractor/youtube/_tab.py @@ -524,10 +524,16 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): response = self._extract_response( item_id=f'{item_id} page {page_num}', query=continuation, headers=headers, ytcfg=ytcfg, - check_get_keys=('continuationContents', 'onResponseReceivedActions', 'onResponseReceivedEndpoints')) + check_get_keys=( + 'continuationContents', 'onResponseReceivedActions', 'onResponseReceivedEndpoints', + # Playlist recommendations may return with no data - ignore + ('responseContext', 'serviceTrackingParams', ..., 'params', ..., lambda k, v: k == 'key' and v == 'GetRecommendedMusicPlaylists_rid'), + )) if not response: break + + continuation = None # Extracting updated visitor data is required to prevent an infinite extraction loop in some cases # See: https://github.com/ytdl-org/youtube-dl/issues/28702 visitor_data = self._extract_visitor_data(response) or visitor_data @@ -564,7 +570,13 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): yield from func(video_items_renderer) continuation = continuation_list[0] or self._extract_continuation(video_items_renderer) - if not video_items_renderer: + # In the case only a continuation is returned, try to follow it. + # We extract this after trying to extract non-continuation items as otherwise this + # may be prioritized over other continuations. + # see: https://github.com/yt-dlp/yt-dlp/issues/12933 + continuation = continuation or self._extract_continuation({'contents': [continuation_item]}) + + if not continuation and not video_items_renderer: break @staticmethod @@ -999,14 +1011,14 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'playlist_mincount': 94, 'info_dict': { 'id': 'UCqj7Cz7revf5maW9g5pgNcg', - 'title': 'Igor Kleiner Ph.D. - Playlists', + 'title': 'Igor Kleiner - Playlists', 'description': 'md5:15d7dd9e333cb987907fcb0d604b233a', - 'uploader': 'Igor Kleiner Ph.D.', + 'uploader': 'Igor Kleiner ', 'uploader_id': '@IgorDataScience', 'uploader_url': 'https://www.youtube.com/@IgorDataScience', - 'channel': 'Igor Kleiner Ph.D.', + 'channel': 'Igor Kleiner ', 'channel_id': 'UCqj7Cz7revf5maW9g5pgNcg', - 'tags': ['критическое мышление', 'наука просто', 'математика', 'анализ данных'], + 'tags': 'count:23', 'channel_url': 'https://www.youtube.com/channel/UCqj7Cz7revf5maW9g5pgNcg', 'channel_follower_count': int, }, @@ -1016,18 +1028,19 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'playlist_mincount': 94, 'info_dict': { 'id': 'UCqj7Cz7revf5maW9g5pgNcg', - 'title': 'Igor Kleiner Ph.D. - Playlists', + 'title': 'Igor Kleiner - Playlists', 'description': 'md5:15d7dd9e333cb987907fcb0d604b233a', - 'uploader': 'Igor Kleiner Ph.D.', + 'uploader': 'Igor Kleiner ', 'uploader_id': '@IgorDataScience', 'uploader_url': 'https://www.youtube.com/@IgorDataScience', - 'tags': ['критическое мышление', 'наука просто', 'математика', 'анализ данных'], + 'tags': 'count:23', 'channel_id': 'UCqj7Cz7revf5maW9g5pgNcg', - 'channel': 'Igor Kleiner Ph.D.', + 'channel': 'Igor Kleiner ', 'channel_url': 'https://www.youtube.com/channel/UCqj7Cz7revf5maW9g5pgNcg', 'channel_follower_count': int, }, }, { + # TODO: fix channel_is_verified extraction 'note': 'playlists, series', 'url': 'https://www.youtube.com/c/3blue1brown/playlists?view=50&sort=dd&shelf_id=3', 'playlist_mincount': 5, @@ -1066,22 +1079,23 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists', 'only_matching': True, }, { + # TODO: fix availability extraction 'note': 'basic, single video playlist', - 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', + 'url': 'https://www.youtube.com/playlist?list=PLt5yu3-wZAlSLRHmI1qNm0wjyVNWw1pCU', 'info_dict': { - 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', - 'title': 'youtube-dl public playlist', + 'id': 'PLt5yu3-wZAlSLRHmI1qNm0wjyVNWw1pCU', + 'title': 'single video playlist', 'description': '', 'tags': [], 'view_count': int, - 'modified_date': '20201130', - 'channel': 'Sergey M.', - 'channel_id': 'UCmlqkdCBesrv2Lak1mF_MxA', - 'channel_url': 'https://www.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', + 'modified_date': '20250417', + 'channel': 'cole-dlp-test-acc', + 'channel_id': 'UCiu-3thuViMebBjw_5nWYrA', + 'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', 'availability': 'public', - 'uploader': 'Sergey M.', - 'uploader_url': 'https://www.youtube.com/@sergeym.6173', - 'uploader_id': '@sergeym.6173', + 'uploader': 'cole-dlp-test-acc', + 'uploader_url': 'https://www.youtube.com/@coletdjnz', + 'uploader_id': '@coletdjnz', }, 'playlist_count': 1, }, { @@ -1171,11 +1185,11 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): }, 'playlist_mincount': 17, }, { - 'note': 'Community tab', + 'note': 'Posts tab', 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community', 'info_dict': { 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'title': 'lex will - Community', + 'title': 'lex will - Posts', 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', 'channel': 'lex will', 'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', @@ -1188,30 +1202,14 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): }, 'playlist_mincount': 18, }, { - 'note': 'Channels tab', - 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels', - 'info_dict': { - 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'title': 'lex will - Channels', - 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', - 'channel': 'lex will', - 'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', - 'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'tags': ['bible', 'history', 'prophesy'], - 'channel_follower_count': int, - 'uploader_url': 'https://www.youtube.com/@lexwill718', - 'uploader_id': '@lexwill718', - 'uploader': 'lex will', - }, - 'playlist_mincount': 12, - }, { + # TODO: fix channel_is_verified extraction 'note': 'Search tab', 'url': 'https://www.youtube.com/c/3blue1brown/search?query=linear%20algebra', 'playlist_mincount': 40, 'info_dict': { 'id': 'UCYO_jab_esuFRV4b17AJtAw', 'title': '3Blue1Brown - Search - linear algebra', - 'description': 'md5:4d1da95432004b7ba840ebc895b6b4c9', + 'description': 'md5:602e3789e6a0cb7d9d352186b720e395', 'channel_url': 'https://www.youtube.com/channel/UCYO_jab_esuFRV4b17AJtAw', 'tags': ['Mathematics'], 'channel': '3Blue1Brown', @@ -1232,6 +1230,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', 'only_matching': True, }, { + # TODO: fix availability extraction 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.', 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', 'info_dict': { @@ -1294,24 +1293,25 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): }, 'playlist_mincount': 21, }, { + # TODO: fix availability extraction 'note': 'Playlist with "show unavailable videos" button', - 'url': 'https://www.youtube.com/playlist?list=UUTYLiWFZy8xtPwxFwX9rV7Q', + 'url': 'https://www.youtube.com/playlist?list=PLYwq8WOe86_xGmR7FrcJq8Sb7VW8K3Tt2', 'info_dict': { - 'title': 'Uploads from Phim Siêu Nhân Nhật Bản', - 'id': 'UUTYLiWFZy8xtPwxFwX9rV7Q', + 'title': 'The Memes Of 2010s.....', + 'id': 'PLYwq8WOe86_xGmR7FrcJq8Sb7VW8K3Tt2', 'view_count': int, - 'channel': 'Phim Siêu Nhân Nhật Bản', + 'channel': "I'm Not JiNxEd", 'tags': [], - 'description': '', - 'channel_url': 'https://www.youtube.com/channel/UCTYLiWFZy8xtPwxFwX9rV7Q', - 'channel_id': 'UCTYLiWFZy8xtPwxFwX9rV7Q', + 'description': 'md5:44dc3b315ba69394feaafa2f40e7b2a1', + 'channel_url': 'https://www.youtube.com/channel/UC5H5H85D1QE5-fuWWQ1hdNg', + 'channel_id': 'UC5H5H85D1QE5-fuWWQ1hdNg', 'modified_date': r're:\d{8}', 'availability': 'public', - 'uploader_url': 'https://www.youtube.com/@phimsieunhannhatban', - 'uploader_id': '@phimsieunhannhatban', - 'uploader': 'Phim Siêu Nhân Nhật Bản', + 'uploader_url': 'https://www.youtube.com/@imnotjinxed1998', + 'uploader_id': '@imnotjinxed1998', + 'uploader': "I'm Not JiNxEd", }, - 'playlist_mincount': 200, + 'playlist_mincount': 150, 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], }, { 'note': 'Playlist with unavailable videos in page 7', @@ -1334,6 +1334,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'playlist_mincount': 1000, 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], }, { + # TODO: fix availability extraction 'note': 'https://github.com/ytdl-org/youtube-dl/issues/21844', 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', 'info_dict': { @@ -1384,7 +1385,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): }, { 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live', 'info_dict': { - 'id': 'hGkQjiJLjWQ', # This will keep changing + 'id': 'YDvsBbKfLPA', # This will keep changing 'ext': 'mp4', 'title': str, 'upload_date': r're:\d{8}', @@ -1409,6 +1410,8 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'uploader_id': '@SkyNews', 'uploader': 'Sky News', 'channel_is_verified': True, + 'media_type': 'livestream', + 'timestamp': int, }, 'params': { 'skip_download': True, @@ -1496,6 +1499,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'url': 'https://music.youtube.com/browse/UC1a8OFewdjuLq6KlF8M_8Ng', 'only_matching': True, }, { + # TODO: fix availability extraction 'note': 'VLPL, should redirect to playlist?list=PL...', 'url': 'https://music.youtube.com/browse/VLPLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq', 'info_dict': { @@ -1537,6 +1541,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): }, { # Destination channel with only a hidden self tab (tab id is UCtFRv9O2AHqOZjjynzrv-xg) # Treat as a general feed + # TODO: fix extraction 'url': 'https://www.youtube.com/channel/UCtFRv9O2AHqOZjjynzrv-xg', 'info_dict': { 'id': 'UCtFRv9O2AHqOZjjynzrv-xg', @@ -1560,21 +1565,21 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'expected_warnings': ['YouTube Music is not directly supported'], }, { 'note': 'unlisted single video playlist', - 'url': 'https://www.youtube.com/playlist?list=PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf', + 'url': 'https://www.youtube.com/playlist?list=PLt5yu3-wZAlQLfIN0MMgp0wVV6MP3bM4_', 'info_dict': { - 'id': 'PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf', - 'title': 'yt-dlp unlisted playlist test', + 'id': 'PLt5yu3-wZAlQLfIN0MMgp0wVV6MP3bM4_', + 'title': 'unlisted playlist', 'availability': 'unlisted', 'tags': [], - 'modified_date': '20220418', - 'channel': 'colethedj', + 'modified_date': '20250417', + 'channel': 'cole-dlp-test-acc', 'view_count': int, 'description': '', - 'channel_id': 'UC9zHu_mHU96r19o-wV5Qs1Q', - 'channel_url': 'https://www.youtube.com/channel/UC9zHu_mHU96r19o-wV5Qs1Q', - 'uploader_url': 'https://www.youtube.com/@colethedj1894', - 'uploader_id': '@colethedj1894', - 'uploader': 'colethedj', + 'channel_id': 'UCiu-3thuViMebBjw_5nWYrA', + 'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', + 'uploader_url': 'https://www.youtube.com/@coletdjnz', + 'uploader_id': '@coletdjnz', + 'uploader': 'cole-dlp-test-acc', }, 'playlist': [{ 'info_dict': { @@ -1596,6 +1601,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'playlist_count': 1, 'params': {'extract_flat': True}, }, { + # By default, recommended is always empty. 'note': 'API Fallback: Recommended - redirects to home page. Requires visitorData', 'url': 'https://www.youtube.com/feed/recommended', 'info_dict': { @@ -1603,7 +1609,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'title': 'recommended', 'tags': [], }, - 'playlist_mincount': 50, + 'playlist_count': 0, 'params': { 'skip_download': True, 'extractor_args': {'youtubetab': {'skip': ['webpage']}}, @@ -1628,6 +1634,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): }, 'skip': 'Query for sorting no longer works', }, { + # TODO: fix 'unviewable' issue with this playlist when reloading with unavailable videos 'note': 'API Fallback: Topic, should redirect to playlist?list=UU...', 'url': 'https://music.youtube.com/browse/UC9ALqqC4aIeG5iDs7i90Bfw', 'info_dict': { @@ -1654,11 +1661,12 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'url': 'https://www.youtube.com/channel/UCwVVpHQ2Cs9iGJfpdFngePQ', 'only_matching': True, }, { + # TODO: fix metadata extraction 'note': 'collaborative playlist (uploader name in the form "by and x other(s)")', 'url': 'https://www.youtube.com/playlist?list=PLx-_-Kk4c89oOHEDQAojOXzEzemXxoqx6', 'info_dict': { 'id': 'PLx-_-Kk4c89oOHEDQAojOXzEzemXxoqx6', - 'modified_date': '20220407', + 'modified_date': '20250115', 'channel_url': 'https://www.youtube.com/channel/UCKcqXmCcyqnhgpA5P0oHH_Q', 'tags': [], 'availability': 'unlisted', @@ -1692,6 +1700,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'expected_warnings': ['Preferring "ja"'], }, { # XXX: this should really check flat playlist entries, but the test suite doesn't support that + # TODO: fix availability extraction 'note': 'preferred lang set with playlist with translated video titles', 'url': 'https://www.youtube.com/playlist?list=PLt5yu3-wZAlQAaPZ5Z-rJoTdbT-45Q7c0', 'info_dict': { @@ -1714,6 +1723,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): }, { # shorts audio pivot for 2GtVksBMYFM. 'url': 'https://www.youtube.com/feed/sfv_audio_pivot?bp=8gUrCikSJwoLMkd0VmtzQk1ZRk0SCzJHdFZrc0JNWUZNGgsyR3RWa3NCTVlGTQ==', + # TODO: fix extraction 'info_dict': { 'id': 'sfv_audio_pivot', 'title': 'sfv_audio_pivot', @@ -1751,6 +1761,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'playlist_mincount': 8, }, { # Should get three playlists for videos, shorts and streams tabs + # TODO: fix channel_is_verified extraction 'url': 'https://www.youtube.com/channel/UCK9V2B22uJYu3N7eR_BT9QA', 'info_dict': { 'id': 'UCK9V2B22uJYu3N7eR_BT9QA', @@ -1758,7 +1769,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel_follower_count': int, 'channel_id': 'UCK9V2B22uJYu3N7eR_BT9QA', 'channel_url': 'https://www.youtube.com/channel/UCK9V2B22uJYu3N7eR_BT9QA', - 'description': 'md5:49809d8bf9da539bc48ed5d1f83c33f2', + 'description': 'md5:01e53f350ab8ad6fcf7c4fedb3c1b99f', 'channel': 'Polka Ch. 尾丸ポルカ', 'tags': 'count:35', 'uploader_url': 'https://www.youtube.com/@OmaruPolka', @@ -1769,14 +1780,14 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'playlist_count': 3, }, { # Shorts tab with channel with handle - # TODO: fix channel description + # TODO: fix channel_is_verified extraction 'url': 'https://www.youtube.com/@NotJustBikes/shorts', 'info_dict': { 'id': 'UC0intLFzLaudFG-xAvUEO-A', 'title': 'Not Just Bikes - Shorts', 'tags': 'count:10', 'channel_url': 'https://www.youtube.com/channel/UC0intLFzLaudFG-xAvUEO-A', - 'description': 'md5:5e82545b3a041345927a92d0585df247', + 'description': 'md5:1d9fc1bad7f13a487299d1fe1712e031', 'channel_follower_count': int, 'channel_id': 'UC0intLFzLaudFG-xAvUEO-A', 'channel': 'Not Just Bikes', @@ -1797,7 +1808,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel_url': 'https://www.youtube.com/channel/UC3eYAvjCVwNHgkaGbXX3sig', 'channel': '中村悠一', 'channel_follower_count': int, - 'description': 'md5:e744f6c93dafa7a03c0c6deecb157300', + 'description': 'md5:e8fd705073a594f27d6d6d020da560dc', 'uploader_url': 'https://www.youtube.com/@Yuichi-Nakamura', 'uploader_id': '@Yuichi-Nakamura', 'uploader': '中村悠一', @@ -1815,6 +1826,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'only_matching': True, }, { # No videos tab but has a shorts tab + # TODO: fix metadata extraction 'url': 'https://www.youtube.com/c/TKFShorts', 'info_dict': { 'id': 'UCgJ5_1F6yJhYLnyMszUdmUg', @@ -1851,6 +1863,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): }, { # Shorts url result in shorts tab # TODO: Fix channel id extraction + # TODO: fix test suite, 208163447408c78673b08c172beafe5c310fb167 broke this test 'url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA/shorts', 'info_dict': { 'id': 'UCiu-3thuViMebBjw_5nWYrA', @@ -1879,6 +1892,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'params': {'extract_flat': True}, }, { # Live video status should be extracted + # TODO: fix test suite, 208163447408c78673b08c172beafe5c310fb167 broke this test 'url': 'https://www.youtube.com/channel/UCQvWX73GQygcwXOTSf_VDVg/live', 'info_dict': { 'id': 'UCQvWX73GQygcwXOTSf_VDVg', @@ -1907,6 +1921,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'playlist_mincount': 1, }, { # Channel renderer metadata. Contains number of videos on the channel + # TODO: channels tab removed, change this test to use another page with channel renderer 'url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA/channels', 'info_dict': { 'id': 'UCiu-3thuViMebBjw_5nWYrA', @@ -1940,7 +1955,9 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): }, }], 'params': {'extract_flat': True}, + 'skip': 'channels tab removed', }, { + # TODO: fix channel_is_verified extraction 'url': 'https://www.youtube.com/@3blue1brown/about', 'info_dict': { 'id': '@3blue1brown', @@ -1950,7 +1967,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel_id': 'UCYO_jab_esuFRV4b17AJtAw', 'channel': '3Blue1Brown', 'channel_url': 'https://www.youtube.com/channel/UCYO_jab_esuFRV4b17AJtAw', - 'description': 'md5:4d1da95432004b7ba840ebc895b6b4c9', + 'description': 'md5:602e3789e6a0cb7d9d352186b720e395', 'uploader_url': 'https://www.youtube.com/@3blue1brown', 'uploader_id': '@3blue1brown', 'uploader': '3Blue1Brown', @@ -1976,6 +1993,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'playlist_count': 5, }, { # Releases tab, with rich entry playlistRenderers (same as Podcasts tab) + # TODO: fix channel_is_verified extraction 'url': 'https://www.youtube.com/@AHimitsu/releases', 'info_dict': { 'id': 'UCgFwu-j5-xNJml2FtTrrB3A', @@ -2015,6 +2033,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'playlist_mincount': 100, 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], }, { + # TODO: fix channel_is_verified extraction 'note': 'Tags containing spaces', 'url': 'https://www.youtube.com/channel/UC7_YxT-KID8kRbqZo7MyscQ', 'playlist_count': 3, @@ -2035,6 +2054,24 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'challenges', 'sketches', 'scary games', 'funny games', 'rage games', 'mark fischbach'], }, + }, { + # https://github.com/yt-dlp/yt-dlp/issues/12933 + 'note': 'streams tab, some scheduled streams. Empty intermediate response with only continuation - must follow', + 'url': 'https://www.youtube.com/@sbcitygov/streams', + 'playlist_mincount': 150, + 'info_dict': { + 'id': 'UCH6-qfQwlUgz9SAf05jvc_w', + 'channel': 'sbcitygov', + 'channel_id': 'UCH6-qfQwlUgz9SAf05jvc_w', + 'title': 'sbcitygov - Live', + 'channel_follower_count': int, + 'description': 'md5:ca1a92059835c071e33b3db52f4a6d67', + 'uploader_id': '@sbcitygov', + 'uploader_url': 'https://www.youtube.com/@sbcitygov', + 'uploader': 'sbcitygov', + 'channel_url': 'https://www.youtube.com/channel/UCH6-qfQwlUgz9SAf05jvc_w', + 'tags': [], + }, }] @classmethod diff --git a/yt_dlp/extractor/youtube/_video.py b/yt_dlp/extractor/youtube/_video.py index 074a2a0d8d..bcfe8b1520 100644 --- a/yt_dlp/extractor/youtube/_video.py +++ b/yt_dlp/extractor/youtube/_video.py @@ -3646,6 +3646,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if 'sign in' in reason.lower(): reason = remove_end(reason, 'This helps protect our community. Learn more') reason = f'{remove_end(reason.strip(), ".")}. {self._youtube_login_hint}' + elif get_first(playability_statuses, ('errorScreen', 'playerCaptchaViewModel', {dict})): + reason += '. YouTube is requiring a captcha challenge before playback' self.raise_no_formats(reason, expected=True) keywords = get_first(video_details, 'keywords', expected_type=list) or [] @@ -3874,7 +3876,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if not traverse_obj(initial_data, 'contents'): self.report_warning('Incomplete data received in embedded initial data; re-fetching using API.') initial_data = None - if not initial_data: + if not initial_data and 'initial_data' not in self._configuration_arg('player_skip'): query = {'videoId': video_id} query.update(self._get_checkok_params()) initial_data = self._extract_response( diff --git a/yt_dlp/networking/__init__.py b/yt_dlp/networking/__init__.py index 1eaa0ee5fd..39158a8cc1 100644 --- a/yt_dlp/networking/__init__.py +++ b/yt_dlp/networking/__init__.py @@ -3,6 +3,7 @@ import warnings from .common import ( HEADRequest, + PATCHRequest, PUTRequest, Request, RequestDirector, diff --git a/yt_dlp/networking/common.py b/yt_dlp/networking/common.py index ddceaa9a97..e33769422b 100644 --- a/yt_dlp/networking/common.py +++ b/yt_dlp/networking/common.py @@ -505,6 +505,7 @@ class Request: HEADRequest = functools.partial(Request, method='HEAD') +PATCHRequest = functools.partial(Request, method='PATCH') PUTRequest = functools.partial(Request, method='PUT') diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 1742cbdfaf..76d401ceaa 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -150,6 +150,15 @@ class _YoutubeDLHelpFormatter(optparse.IndentedHelpFormatter): return opts +_PRESET_ALIASES = { + 'mp3': ['-f', 'ba[acodec^=mp3]/ba/b', '-x', '--audio-format', 'mp3'], + 'aac': ['-f', 'ba[acodec^=aac]/ba[acodec^=mp4a.40.]/ba/b', '-x', '--audio-format', 'aac'], + 'mp4': ['--merge-output-format', 'mp4', '--remux-video', 'mp4', '-S', 'vcodec:h264,lang,quality,res,fps,hdr:12,acodec:aac'], + 'mkv': ['--merge-output-format', 'mkv', '--remux-video', 'mkv'], + 'sleep': ['--sleep-subtitles', '5', '--sleep-requests', '0.75', '--sleep-interval', '10', '--max-sleep-interval', '20'], +} + + class _YoutubeDLOptionParser(optparse.OptionParser): # optparse is deprecated since Python 3.2. So assume a stable interface even for private methods ALIAS_DEST = '_triggered_aliases' @@ -215,6 +224,22 @@ class _YoutubeDLOptionParser(optparse.OptionParser): return e.possibilities[0] raise + def format_option_help(self, formatter=None): + assert formatter, 'Formatter can not be None' + formatted_help = super().format_option_help(formatter=formatter) + formatter.indent() + heading = formatter.format_heading('Preset Aliases') + formatter.indent() + result = [] + for name, args in _PRESET_ALIASES.items(): + option = optparse.Option('-t', help=shlex.join(args)) + formatter.option_strings[option] = f'-t {name}' + result.append(formatter.format_option(option)) + formatter.dedent() + formatter.dedent() + help_lines = '\n'.join(result) + return f'{formatted_help}\n{heading}{help_lines}' + def create_parser(): def _list_from_options_callback(option, opt_str, value, parser, append=True, delim=',', process=str.strip): @@ -317,6 +342,13 @@ def create_parser(): parser.rargs[:0] = shlex.split( opts if value is None else opts.format(*map(shlex.quote, value))) + def _preset_alias_callback(option, opt_str, value, parser): + if not value: + return + if value not in _PRESET_ALIASES: + raise optparse.OptionValueError(f'Unknown preset alias: {value}') + parser.rargs[:0] = _PRESET_ALIASES[value] + general = optparse.OptionGroup(parser, 'General Options') general.add_option( '-h', '--help', dest='print_help', action='store_true', @@ -519,6 +551,15 @@ def create_parser(): 'Alias options can trigger more aliases; so be careful to avoid defining recursive options. ' f'As a safety measure, each alias may be triggered a maximum of {_YoutubeDLOptionParser.ALIAS_TRIGGER_LIMIT} times. ' 'This option can be used multiple times')) + general.add_option( + '-t', '--preset-alias', + metavar='PRESET', dest='_', type='str', + action='callback', callback=_preset_alias_callback, + help=( + 'Applies a predefined set of options. e.g. --preset-alias mp3. ' + f'The following presets are available: {", ".join(_PRESET_ALIASES)}. ' + 'See the "Preset Aliases" section at the end for more info. ' + 'This option can be used multiple times')) network = optparse.OptionGroup(parser, 'Network Options') network.add_option( diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 5bb52df55f..20aa341ca3 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -2044,7 +2044,7 @@ def url_or_none(url): if not url or not isinstance(url, str): return None url = url.strip() - return url if re.match(r'(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None + return url if re.match(r'(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?|wss?):)?//', url) else None def strftime_or_none(timestamp, date_format='%Y%m%d', default=None):