Compare commits

...

5 Commits

Author SHA1 Message Date
ShockedPlot7560
0f33950c77
[ie/mixlr] Add extractors (#13561)
Authored by: ShockedPlot7560, seproDev

Co-authored-by: sepro <sepro@sepr0.com>
2025-07-13 01:35:51 +02:00
bashonly
b5fea53f20
[ie] Rework _search_nextjs_v13_data helper (#13711)
Fix 5245231e4a39ecd5595d4337d46d85e150e2430a

Authored by: bashonly
2025-07-12 23:12:05 +00:00
bashonly
5245231e4a
[ie] Add _search_nextjs_v13_data helper (#13398)
* Fixes FranceTVSiteIE livestream extraction
* Fixes GoPlayIE metadata extraction

Authored by: bashonly
2025-07-12 22:12:46 +00:00
Lyuben Ivanov
3ae61e0f31
[ie/BTVPlus] Add extractor (#13541)
Authored by: bubo
2025-07-12 21:56:11 +02:00
bashonly
a5d697f62d
[ie/vimeo] Fix extractor (#13692)
Closes #13180, Closes #13689
Authored by: bashonly
2025-07-12 19:23:22 +00:00
10 changed files with 580 additions and 110 deletions

View File

@ -1901,6 +1901,10 @@ The following extractors use this feature:
#### tver #### tver
* `backend`: Backend API to use for extraction - one of `streaks` (default) or `brightcove` (deprecated) * `backend`: Backend API to use for extraction - one of `streaks` (default) or `brightcove` (deprecated)
#### vimeo
* `client`: Client to extract video data from. One of `android` (default), `ios` or `web`. The `ios` client only works with previously cached OAuth tokens. The `web` client only works when authenticated with credentials or account cookies
* `original_format_policy`: Policy for when to try extracting original formats. One of `always`, `never`, or `auto`. The default `auto` policy tries to avoid exceeding the API rate-limit by only making an extra request when Vimeo publicizes the video's downloadability
**Note**: These options may be changed/removed in the future without concern for backward compatibility **Note**: These options may be changed/removed in the future without concern for backward compatibility
<!-- MANPAGE: MOVE "INSTALLATION" SECTION HERE --> <!-- MANPAGE: MOVE "INSTALLATION" SECTION HERE -->

View File

@ -1959,6 +1959,37 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
with self.assertWarns(DeprecationWarning): with self.assertWarns(DeprecationWarning):
self.assertEqual(self.ie._search_nextjs_data('', None, default='{}'), {}) self.assertEqual(self.ie._search_nextjs_data('', None, default='{}'), {})
def test_search_nextjs_v13_data(self):
HTML = R'''
<script>(self.__next_f=self.__next_f||[]).push([0])</script>
<script>self.__next_f.push([2,"0:[\"$\",\"$L0\",null,{\"do_not_add_this\":\"fail\"}]\n"])</script>
<script>self.__next_f.push([1,"1:I[46975,[],\"HTTPAccessFallbackBoundary\"]\n2:I[32630,[\"8183\",\"static/chunks/8183-768193f6a9e33cdd.js\"]]\n"])</script>
<script nonce="abc123">self.__next_f.push([1,"e:[false,[\"$\",\"div\",null,{\"children\":[\"$\",\"$L18\",null,{\"foo\":\"bar\"}]}],false]\n "])</script>
<script>self.__next_f.push([1,"2a:[[\"$\",\"div\",null,{\"className\":\"flex flex-col\",\"children\":[]}],[\"$\",\"$L16\",null,{\"meta\":{\"dateCreated\":1730489700,\"uuid\":\"40cac41d-8d29-4ef5-aa11-75047b9f0907\"}}]]\n"])</script>
<script>self.__next_f.push([1,"df:[\"$undefined\",[\"$\",\"div\",null,{\"children\":[\"$\",\"$L17\",null,{}],\"do_not_include_this_field\":\"fail\"}],[\"$\",\"div\",null,{\"children\":[[\"$\",\"$L19\",null,{\"duplicated_field_name\":{\"x\":1}}],[\"$\",\"$L20\",null,{\"duplicated_field_name\":{\"y\":2}}]]}],\"$undefined\"]\n"])</script>
<script>self.__next_f.push([3,"MzM6WyIkIiwiJEwzMiIsbnVsbCx7ImRlY29kZWQiOiJzdWNjZXNzIn1d"])</script>
'''
EXPECTED = {
'18': {
'foo': 'bar',
},
'16': {
'meta': {
'dateCreated': 1730489700,
'uuid': '40cac41d-8d29-4ef5-aa11-75047b9f0907',
},
},
'19': {
'duplicated_field_name': {'x': 1},
},
'20': {
'duplicated_field_name': {'y': 2},
},
}
self.assertEqual(self.ie._search_nextjs_v13_data(HTML, None), EXPECTED)
self.assertEqual(self.ie._search_nextjs_v13_data('', None, fatal=False), {})
self.assertEqual(self.ie._search_nextjs_v13_data(None, None, fatal=False), {})
def test_search_nuxt_json(self): def test_search_nuxt_json(self):
HTML_TMPL = '<script data-ssr="true" id="__NUXT_DATA__" type="application/json">[{}]</script>' HTML_TMPL = '<script data-ssr="true" id="__NUXT_DATA__" type="application/json">[{}]</script>'
VALID_DATA = ''' VALID_DATA = '''

View File

@ -309,6 +309,7 @@ from .brilliantpala import (
BrilliantpalaClassesIE, BrilliantpalaClassesIE,
BrilliantpalaElearnIE, BrilliantpalaElearnIE,
) )
from .btvplus import BTVPlusIE
from .bundesliga import BundesligaIE from .bundesliga import BundesligaIE
from .bundestag import BundestagIE from .bundestag import BundestagIE
from .bunnycdn import BunnyCdnIE from .bunnycdn import BunnyCdnIE
@ -1168,6 +1169,10 @@ from .mixcloud import (
MixcloudPlaylistIE, MixcloudPlaylistIE,
MixcloudUserIE, MixcloudUserIE,
) )
from .mixlr import (
MixlrIE,
MixlrRecoringIE,
)
from .mlb import ( from .mlb import (
MLBIE, MLBIE,
MLBTVIE, MLBTVIE,

View File

@ -0,0 +1,73 @@
from .common import InfoExtractor
from ..utils import (
bug_reports_message,
clean_html,
get_element_by_class,
js_to_json,
mimetype2ext,
strip_or_none,
url_or_none,
urljoin,
)
from ..utils.traversal import traverse_obj
class BTVPlusIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?btvplus\.bg/produkt/(?:predavaniya|seriali|novini)/(?P<id>\d+)'
_TESTS = [{
'url': 'https://btvplus.bg/produkt/predavaniya/67271/btv-reporterite/btv-reporterite-12-07-2025-g',
'info_dict': {
'ext': 'mp4',
'id': '67271',
'title': 'bTV Репортерите - 12.07.2025 г.',
'thumbnail': 'https://cdn.btv.bg/media/images/940x529/Jul2025/2113606319.jpg',
},
}, {
'url': 'https://btvplus.bg/produkt/seriali/66942/sezon-2/plen-sezon-2-epizod-55',
'info_dict': {
'ext': 'mp4',
'id': '66942',
'title': 'Плен - сезон 2, епизод 55',
'thumbnail': 'https://cdn.btv.bg/media/images/940x529/Jun2025/2113595104.jpg',
},
}, {
'url': 'https://btvplus.bg/produkt/novini/67270/btv-novinite-centralna-emisija-12-07-2025',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
player_url = self._search_regex(
r'var\s+videoUrl\s*=\s*[\'"]([^\'"]+)[\'"]',
webpage, 'player URL')
player_config = self._download_json(
urljoin('https://btvplus.bg', player_url), video_id)['config']
videojs_data = self._search_json(
r'videojs\(["\'][^"\']+["\'],', player_config, 'videojs data',
video_id, transform_source=js_to_json)
formats = []
subtitles = {}
for src in traverse_obj(videojs_data, ('sources', lambda _, v: url_or_none(v['src']))):
ext = mimetype2ext(src.get('type'))
if ext == 'm3u8':
fmts, subs = self._extract_m3u8_formats_and_subtitles(
src['src'], video_id, 'mp4', m3u8_id='hls', fatal=False)
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
else:
self.report_warning(f'Unknown format type {ext}{bug_reports_message()}')
return {
'id': video_id,
'formats': formats,
'subtitles': subtitles,
'title': (
strip_or_none(self._og_search_title(webpage, default=None))
or clean_html(get_element_by_class('product-title', webpage))),
'thumbnail': self._og_search_thumbnail(webpage, default=None),
'description': self._og_search_description(webpage, default=None),
}

View File

@ -1783,6 +1783,59 @@ class InfoExtractor:
r'<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>', webpage, 'next.js data', r'<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>', webpage, 'next.js data',
video_id, end_pattern='</script>', fatal=fatal, default=default, **kw) video_id, end_pattern='</script>', fatal=fatal, default=default, **kw)
def _search_nextjs_v13_data(self, webpage, video_id, fatal=True):
"""Parses Next.js app router flight data that was introduced in Next.js v13"""
nextjs_data = {}
if not fatal and not isinstance(webpage, str):
return nextjs_data
def flatten(flight_data):
if not isinstance(flight_data, list):
return
if len(flight_data) == 4 and flight_data[0] == '$':
_, name, _, data = flight_data
if not isinstance(data, dict):
return
children = data.pop('children', None)
if data and isinstance(name, str) and re.fullmatch(r'\$L[0-9a-f]+', name):
# It is useful hydration JSON data
nextjs_data[name[2:]] = data
flatten(children)
return
for f in flight_data:
flatten(f)
flight_text = ''
# The pattern for the surrounding JS/tag should be strict as it's a hardcoded string in the next.js source
# Ref: https://github.com/vercel/next.js/blob/5a4a08fdc/packages/next/src/server/app-render/use-flight-response.tsx#L189
for flight_segment in re.findall(r'<script\b[^>]*>self\.__next_f\.push\((\[.+?\])\)</script>', webpage):
segment = self._parse_json(flight_segment, video_id, fatal=fatal, errnote=None if fatal else False)
# Some earlier versions of next.js "optimized" away this array structure; this is unsupported
# Ref: https://github.com/vercel/next.js/commit/0123a9d5c9a9a77a86f135b7ae30b46ca986d761
if not isinstance(segment, list) or len(segment) != 2:
self.write_debug(
f'{video_id}: Unsupported next.js flight data structure detected', only_once=True)
continue
# Only use the relevant payload type (1 == data)
# Ref: https://github.com/vercel/next.js/blob/5a4a08fdc/packages/next/src/server/app-render/use-flight-response.tsx#L11-L14
payload_type, chunk = segment
if payload_type == 1:
flight_text += chunk
for f in flight_text.splitlines():
prefix, _, body = f.lstrip().partition(':')
if not re.fullmatch(r'[0-9a-f]+', prefix):
continue
# The body still isn't guaranteed to be valid JSON, so parsing should always be non-fatal
if body.startswith('[') and body.endswith(']'):
flatten(self._parse_json(body, video_id, fatal=False, errnote=False))
elif body.startswith('{') and body.endswith('}'):
data = self._parse_json(body, video_id, fatal=False, errnote=False)
if data is not None:
nextjs_data[prefix] = data
return nextjs_data
def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)): def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
"""Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function""" """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
rectx = re.escape(context_name) rectx = re.escape(context_name)

View File

@ -1,4 +1,3 @@
import json
import re import re
import urllib.parse import urllib.parse
@ -19,7 +18,11 @@ from ..utils import (
unsmuggle_url, unsmuggle_url,
url_or_none, url_or_none,
) )
from ..utils.traversal import find_element, traverse_obj from ..utils.traversal import (
find_element,
get_first,
traverse_obj,
)
class FranceTVBaseInfoExtractor(InfoExtractor): class FranceTVBaseInfoExtractor(InfoExtractor):
@ -258,7 +261,7 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor):
_TESTS = [{ _TESTS = [{
'url': 'https://www.france.tv/france-2/13h15-le-dimanche/140921-les-mysteres-de-jesus.html', 'url': 'https://www.france.tv/france-2/13h15-le-dimanche/140921-les-mysteres-de-jesus.html',
'info_dict': { 'info_dict': {
'id': 'ec217ecc-0733-48cf-ac06-af1347b849d1', # old: c5bda21d-2c6f-4470-8849-3d8327adb2ba' 'id': 'b2cf9fd8-e971-4757-8651-848f2772df61', # old: ec217ecc-0733-48cf-ac06-af1347b849d1
'ext': 'mp4', 'ext': 'mp4',
'title': '13h15, le dimanche... - Les mystères de Jésus', 'title': '13h15, le dimanche... - Les mystères de Jésus',
'timestamp': 1502623500, 'timestamp': 1502623500,
@ -269,7 +272,7 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor):
'params': { 'params': {
'skip_download': True, 'skip_download': True,
}, },
'add_ie': [FranceTVIE.ie_key()], 'skip': 'Unfortunately, this video is no longer available',
}, { }, {
# geo-restricted # geo-restricted
'url': 'https://www.france.tv/enfants/six-huit-ans/foot2rue/saison-1/3066387-duel-au-vieux-port.html', 'url': 'https://www.france.tv/enfants/six-huit-ans/foot2rue/saison-1/3066387-duel-au-vieux-port.html',
@ -287,7 +290,7 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor):
'thumbnail': r're:^https?://.*\.jpg$', 'thumbnail': r're:^https?://.*\.jpg$',
'duration': 1441, 'duration': 1441,
}, },
'skip': 'No longer available', 'skip': 'Unfortunately, this video is no longer available',
}, { }, {
# geo-restricted livestream (workflow == 'token-akamai') # geo-restricted livestream (workflow == 'token-akamai')
'url': 'https://www.france.tv/france-4/direct.html', 'url': 'https://www.france.tv/france-4/direct.html',
@ -308,6 +311,19 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor):
'live_status': 'is_live', 'live_status': 'is_live',
}, },
'params': {'skip_download': 'livestream'}, 'params': {'skip_download': 'livestream'},
}, {
# Not geo-restricted
'url': 'https://www.france.tv/france-2/la-maison-des-maternelles/5574051-nous-sommes-amis-et-nous-avons-fait-un-enfant-ensemble.html',
'info_dict': {
'id': 'b448bfe4-9fe7-11ee-97d8-2ba3426fa3df',
'ext': 'mp4',
'title': 'Nous sommes amis et nous avons fait un enfant ensemble - Émission du jeudi 21 décembre 2023',
'duration': 1065,
'thumbnail': r're:https?://.+/.+\.jpg',
'timestamp': 1703147921,
'upload_date': '20231221',
},
'params': {'skip_download': 'm3u8'},
}, { }, {
# france3 # france3
'url': 'https://www.france.tv/france-3/des-chiffres-et-des-lettres/139063-emission-du-mardi-9-mai-2017.html', 'url': 'https://www.france.tv/france-3/des-chiffres-et-des-lettres/139063-emission-du-mardi-9-mai-2017.html',
@ -342,30 +358,16 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor):
'only_matching': True, 'only_matching': True,
}] }]
# XXX: For parsing next.js v15+ data; see also yt_dlp.extractor.goplay
def _find_json(self, s):
return self._search_json(
r'\w+\s*:\s*', s, 'next js data', None, contains_pattern=r'\[(?s:.+)\]', default=None)
def _real_extract(self, url): def _real_extract(self, url):
display_id = self._match_id(url) display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id) webpage = self._download_webpage(url, display_id)
nextjs_data = self._search_nextjs_v13_data(webpage, display_id)
nextjs_data = traverse_obj( if get_first(nextjs_data, ('isLive', {bool})):
re.findall(r'<script[^>]*>\s*self\.__next_f\.push\(\s*(\[.+?\])\s*\);?\s*</script>', webpage),
(..., {json.loads}, ..., {self._find_json}, ..., 'children', ..., ..., 'children', ..., ..., 'children'))
if traverse_obj(nextjs_data, (..., ..., 'children', ..., 'isLive', {bool}, any)):
# For livestreams we need the id of the stream instead of the currently airing episode id # For livestreams we need the id of the stream instead of the currently airing episode id
video_id = traverse_obj(nextjs_data, ( video_id = get_first(nextjs_data, ('options', 'id', {str}))
..., ..., 'children', ..., 'children', ..., 'children', ..., 'children', ..., ...,
'children', ..., ..., 'children', ..., ..., 'children', (..., (..., ...)),
'options', 'id', {str}, any))
else: else:
video_id = traverse_obj(nextjs_data, ( video_id = get_first(nextjs_data, ('video', ('playerReplayId', 'siId'), {str}))
..., ..., ..., 'children',
lambda _, v: v['video']['url'] == urllib.parse.urlparse(url).path,
'video', ('playerReplayId', 'siId'), {str}, any))
if not video_id: if not video_id:
raise ExtractorError('Unable to extract video ID') raise ExtractorError('Unable to extract video ID')

View File

@ -5,16 +5,11 @@ import hashlib
import hmac import hmac
import json import json
import os import os
import re
import urllib.parse import urllib.parse
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import ExtractorError, int_or_none
ExtractorError, from ..utils.traversal import get_first, traverse_obj
int_or_none,
remove_end,
traverse_obj,
)
class GoPlayIE(InfoExtractor): class GoPlayIE(InfoExtractor):
@ -27,10 +22,10 @@ class GoPlayIE(InfoExtractor):
'info_dict': { 'info_dict': {
'id': '2baa4560-87a0-421b-bffc-359914e3c387', 'id': '2baa4560-87a0-421b-bffc-359914e3c387',
'ext': 'mp4', 'ext': 'mp4',
'title': 'S22 - Aflevering 1', 'title': 'De Slimste Mens ter Wereld - S22 - Aflevering 1',
'description': r're:In aflevering 1 nemen Daan Alferink, Tess Elst en Xander De Rycke .{66}', 'description': r're:In aflevering 1 nemen Daan Alferink, Tess Elst en Xander De Rycke .{66}',
'series': 'De Slimste Mens ter Wereld', 'series': 'De Slimste Mens ter Wereld',
'episode': 'Episode 1', 'episode': 'Wordt aangekondigd',
'season_number': 22, 'season_number': 22,
'episode_number': 1, 'episode_number': 1,
'season': 'Season 22', 'season': 'Season 22',
@ -52,7 +47,7 @@ class GoPlayIE(InfoExtractor):
'info_dict': { 'info_dict': {
'id': 'ecb79672-92b9-4cd9-a0d7-e2f0250681ee', 'id': 'ecb79672-92b9-4cd9-a0d7-e2f0250681ee',
'ext': 'mp4', 'ext': 'mp4',
'title': 'S11 - Aflevering 1', 'title': 'De Mol - S11 - Aflevering 1',
'description': r're:Tien kandidaten beginnen aan hun verovering van Amerika en ontmoeten .{102}', 'description': r're:Tien kandidaten beginnen aan hun verovering van Amerika en ontmoeten .{102}',
'episode': 'Episode 1', 'episode': 'Episode 1',
'series': 'De Mol', 'series': 'De Mol',
@ -75,21 +70,13 @@ class GoPlayIE(InfoExtractor):
if not self._id_token: if not self._id_token:
raise self.raise_login_required(method='password') raise self.raise_login_required(method='password')
# XXX: For parsing next.js v15+ data; see also yt_dlp.extractor.francetv
def _find_json(self, s):
return self._search_json(
r'\w+\s*:\s*', s, 'next js data', None, contains_pattern=r'\[(?s:.+)\]', default=None)
def _real_extract(self, url): def _real_extract(self, url):
display_id = self._match_id(url) display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id) webpage = self._download_webpage(url, display_id)
nextjs_data = traverse_obj( nextjs_data = self._search_nextjs_v13_data(webpage, display_id)
re.findall(r'<script[^>]*>\s*self\.__next_f\.push\(\s*(\[.+?\])\s*\);?\s*</script>', webpage), meta = get_first(nextjs_data, (
(..., {json.loads}, ..., {self._find_json}, ...)) lambda k, v: k in ('video', 'meta') and v['path'] == urllib.parse.urlparse(url).path))
meta = traverse_obj(nextjs_data, (
..., ..., 'children', ..., ..., 'children',
lambda _, v: v['video']['path'] == urllib.parse.urlparse(url).path, 'video', any))
video_id = meta['uuid'] video_id = meta['uuid']
info_dict = traverse_obj(meta, { info_dict = traverse_obj(meta, {
@ -98,19 +85,18 @@ class GoPlayIE(InfoExtractor):
}) })
if traverse_obj(meta, ('program', 'subtype')) != 'movie': if traverse_obj(meta, ('program', 'subtype')) != 'movie':
for season_data in traverse_obj(nextjs_data, (..., 'children', ..., 'playlists', ...)): for season_data in traverse_obj(nextjs_data, (..., 'playlists', ..., {dict})):
episode_data = traverse_obj( episode_data = traverse_obj(season_data, ('videos', lambda _, v: v['videoId'] == video_id, any))
season_data, ('videos', lambda _, v: v['videoId'] == video_id, any))
if not episode_data: if not episode_data:
continue continue
episode_title = traverse_obj( season_number = traverse_obj(season_data, ('season', {int_or_none}))
episode_data, 'contextualTitle', 'episodeTitle', expected_type=str)
info_dict.update({ info_dict.update({
'title': episode_title or info_dict.get('title'), 'episode': traverse_obj(episode_data, ('episodeTitle', {str})),
'series': remove_end(info_dict.get('title'), f' - {episode_title}'),
'season_number': traverse_obj(season_data, ('season', {int_or_none})),
'episode_number': traverse_obj(episode_data, ('episodeNumber', {int_or_none})), 'episode_number': traverse_obj(episode_data, ('episodeNumber', {int_or_none})),
'season_number': season_number,
'series': self._search_regex(
fr'^(.+)? - S{season_number} - ', info_dict.get('title'), 'series', default=None),
}) })
break break

134
yt_dlp/extractor/mixlr.py Normal file
View File

@ -0,0 +1,134 @@
from .common import InfoExtractor
from ..networking import HEADRequest
from ..utils import int_or_none, parse_iso8601, url_or_none, urlhandle_detect_ext
from ..utils.traversal import traverse_obj
class MixlrIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?(?P<username>[\w-]+)\.mixlr\.com/events/(?P<id>\d+)'
_TESTS = [{
'url': 'https://suncity-104-9fm.mixlr.com/events/4387115',
'info_dict': {
'id': '4387115',
'ext': 'mp3',
'title': r're:SUNCITY 104.9FM\'s live audio \d{4}-\d{2}-\d{2} \d{2}:\d{2}',
'uploader': 'suncity-104-9fm',
'like_count': int,
'thumbnail': r're:https://imagecdn\.mixlr\.com/cdn-cgi/image/[^/?#]+/cd5b34d05fa2cee72d80477724a2f02e.png',
'timestamp': 1751943773,
'upload_date': '20250708',
'release_timestamp': 1751943764,
'release_date': '20250708',
'live_status': 'is_live',
},
}, {
'url': 'https://brcountdown.mixlr.com/events/4395480',
'info_dict': {
'id': '4395480',
'ext': 'aac',
'title': r're:Beats Revolution Countdown Episodio 461 \d{4}-\d{2}-\d{2} \d{2}:\d{2}',
'description': 'md5:5cacd089723f7add3f266bd588315bb3',
'uploader': 'brcountdown',
'like_count': int,
'thumbnail': r're:https://imagecdn\.mixlr\.com/cdn-cgi/image/[^/?#]+/c48727a59f690b87a55d47d123ba0d6d.jpg',
'timestamp': 1752354007,
'upload_date': '20250712',
'release_timestamp': 1752354000,
'release_date': '20250712',
'live_status': 'is_live',
},
}, {
'url': 'https://www.brcountdown.mixlr.com/events/4395480',
'only_matching': True,
}]
def _real_extract(self, url):
username, event_id = self._match_valid_url(url).group('username', 'id')
broadcast_info = self._download_json(
f'https://api.mixlr.com/v3/channels/{username}/events/{event_id}', event_id)
formats = []
format_url = traverse_obj(
broadcast_info, ('included', 0, 'attributes', 'progressive_stream_url', {url_or_none}))
if format_url:
urlh = self._request_webpage(
HEADRequest(format_url), event_id, fatal=False, note='Checking stream')
if urlh and urlh.status == 200:
ext = urlhandle_detect_ext(urlh)
if ext == 'octet-stream':
self.report_warning(
'The server did not return a valid file extension for the stream URL. '
'Assuming an mp3 stream; postprocessing may fail if this is incorrect')
ext = 'mp3'
formats.append({
'url': format_url,
'ext': ext,
'vcodec': 'none',
})
release_timestamp = traverse_obj(
broadcast_info, ('data', 'attributes', 'starts_at', {str}))
if not formats and release_timestamp:
self.raise_no_formats(f'This event will start at {release_timestamp}', expected=True)
return {
'id': event_id,
'uploader': username,
'formats': formats,
'release_timestamp': parse_iso8601(release_timestamp),
**traverse_obj(broadcast_info, ('included', 0, 'attributes', {
'title': ('title', {str}),
'timestamp': ('started_at', {parse_iso8601}),
'concurrent_view_count': ('concurrent_view_count', {int_or_none}),
'like_count': ('heart_count', {int_or_none}),
'is_live': ('live', {bool}),
})),
**traverse_obj(broadcast_info, ('data', 'attributes', {
'title': ('title', {str}),
'description': ('description', {str}),
'timestamp': ('started_at', {parse_iso8601}),
'concurrent_view_count': ('concurrent_view_count', {int_or_none}),
'like_count': ('heart_count', {int_or_none}),
'thumbnail': ('artwork_url', {url_or_none}),
'uploader_id': ('broadcaster_id', {str}),
})),
}
class MixlrRecoringIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?(?P<username>[\w-]+)\.mixlr\.com/recordings/(?P<id>\d+)'
_TESTS = [{
'url': 'https://biblewayng.mixlr.com/recordings/2375193',
'info_dict': {
'id': '2375193',
'ext': 'mp3',
'title': "God's Jewels and Their Resting Place Bro. Adeniji",
'description': 'Preached February 21, 2024 in the evening',
'uploader_id': '8659190',
'duration': 10968,
'thumbnail': r're:https://imagecdn\.mixlr\.com/cdn-cgi/image/[^/?#]+/ceca120ef707f642abeea6e29cd74238.jpg',
'timestamp': 1708544542,
'upload_date': '20240221',
},
}]
def _real_extract(self, url):
username, recording_id = self._match_valid_url(url).group('username', 'id')
recording_info = self._download_json(
f'https://api.mixlr.com/v3/channels/{username}/recordings/{recording_id}', recording_id)
return {
'id': recording_id,
**traverse_obj(recording_info, ('data', 'attributes', {
'ext': ('file_format', {str}),
'url': ('url', {url_or_none}),
'title': ('title', {str}),
'description': ('description', {str}),
'timestamp': ('created_at', {parse_iso8601}),
'duration': ('duration', {int_or_none}),
'thumbnail': ('artwork_url', {url_or_none}),
'uploader_id': ('user_id', {str}),
})),
}

View File

@ -1,6 +1,3 @@
import json
import re
from .brightcove import BrightcoveNewIE from .brightcove import BrightcoveNewIE
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
@ -11,7 +8,12 @@ from ..utils import (
str_or_none, str_or_none,
url_or_none, url_or_none,
) )
from ..utils.traversal import require, traverse_obj, value from ..utils.traversal import (
get_first,
require,
traverse_obj,
value,
)
class NineNowIE(InfoExtractor): class NineNowIE(InfoExtractor):
@ -101,20 +103,11 @@ class NineNowIE(InfoExtractor):
}] }]
BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/4460760524001/default_default/index.html?videoId={}' BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/4460760524001/default_default/index.html?videoId={}'
# XXX: For parsing next.js v15+ data; see also yt_dlp.extractor.francetv and yt_dlp.extractor.goplay
def _find_json(self, s):
return self._search_json(
r'\w+\s*:\s*', s, 'next js data', None, contains_pattern=r'\[(?s:.+)\]', default=None)
def _real_extract(self, url): def _real_extract(self, url):
display_id, video_type = self._match_valid_url(url).group('id', 'type') display_id, video_type = self._match_valid_url(url).group('id', 'type')
webpage = self._download_webpage(url, display_id) webpage = self._download_webpage(url, display_id)
common_data = traverse_obj( common_data = get_first(self._search_nextjs_v13_data(webpage, display_id), ('payload', {dict}))
re.findall(r'<script[^>]*>\s*self\.__next_f\.push\(\s*(\[.+?\])\s*\);?\s*</script>', webpage),
(..., {json.loads}, ..., {self._find_json},
lambda _, v: v['payload'][video_type]['slug'] == display_id,
'payload', any, {require('video data')}))
if traverse_obj(common_data, (video_type, 'video', 'drm', {bool})): if traverse_obj(common_data, (video_type, 'video', 'drm', {bool})):
self.report_drm(display_id) self.report_drm(display_id)

View File

@ -21,6 +21,7 @@ from ..utils import (
js_to_json, js_to_json,
jwt_decode_hs256, jwt_decode_hs256,
merge_dicts, merge_dicts,
mimetype2ext,
parse_filesize, parse_filesize,
parse_iso8601, parse_iso8601,
parse_qs, parse_qs,
@ -28,9 +29,11 @@ from ..utils import (
smuggle_url, smuggle_url,
str_or_none, str_or_none,
traverse_obj, traverse_obj,
try_call,
try_get, try_get,
unified_timestamp, unified_timestamp,
unsmuggle_url, unsmuggle_url,
url_basename,
url_or_none, url_or_none,
urlencode_postdata, urlencode_postdata,
urlhandle_detect_ext, urlhandle_detect_ext,
@ -45,14 +48,56 @@ class VimeoBaseInfoExtractor(InfoExtractor):
_REFERER_HINT = ( _REFERER_HINT = (
'Cannot download embed-only video without embedding URL. Please call yt-dlp ' 'Cannot download embed-only video without embedding URL. Please call yt-dlp '
'with the URL of the page that embeds this video.') 'with the URL of the page that embeds this video.')
_IOS_CLIENT_AUTH = 'MTMxNzViY2Y0NDE0YTQ5YzhjZTc0YmU0NjVjNDQxYzNkYWVjOWRlOTpHKzRvMmgzVUh4UkxjdU5FRW80cDNDbDhDWGR5dVJLNUJZZ055dHBHTTB4V1VzaG41bEx1a2hiN0NWYWNUcldSSW53dzRUdFRYZlJEZmFoTTArOTBUZkJHS3R4V2llYU04Qnl1bERSWWxUdXRidjNqR2J4SHFpVmtFSUcyRktuQw=='
_IOS_CLIENT_HEADERS = { _DEFAULT_CLIENT = 'android'
_CLIENT_HEADERS = {
'Accept': 'application/vnd.vimeo.*+json; version=3.4.10', 'Accept': 'application/vnd.vimeo.*+json; version=3.4.10',
'Accept-Language': 'en', 'Accept-Language': 'en',
'User-Agent': 'Vimeo/11.10.0 (com.vimeo; build:250424.164813.0; iOS 18.4.1) Alamofire/5.9.0 VimeoNetworking/5.0.0',
} }
_IOS_OAUTH_CACHE_KEY = 'oauth-token-ios' _CLIENT_CONFIGS = {
_ios_oauth_token = None 'android': {
'CACHE_KEY': 'oauth-token-android',
'CACHE_ONLY': False,
'VIEWER_JWT': False,
'REQUIRES_AUTH': False,
'AUTH': 'NzRmYTg5YjgxMWExY2JiNzUwZDg1MjhkMTYzZjQ4YWYyOGEyZGJlMTp4OGx2NFd3QnNvY1lkamI2UVZsdjdDYlNwSDUrdm50YzdNNThvWDcwN1JrenJGZC9tR1lReUNlRjRSVklZeWhYZVpRS0tBcU9YYzRoTGY2Z1dlVkJFYkdJc0dMRHpoZWFZbU0reDRqZ1dkZ1diZmdIdGUrNUM5RVBySlM0VG1qcw==',
'USER_AGENT': 'com.vimeo.android.videoapp (OnePlus, ONEPLUS A6003, OnePlus, Android 14/34 Version 11.8.1) Kotlin VimeoNetworking/3.12.0',
'VIDEOS_FIELDS': (
'uri', 'name', 'description', 'type', 'link', 'player_embed_url', 'duration', 'width',
'language', 'height', 'embed', 'created_time', 'modified_time', 'release_time', 'content_rating',
'content_rating_class', 'rating_mod_locked', 'license', 'privacy', 'pictures', 'tags', 'stats',
'categories', 'uploader', 'metadata', 'user', 'files', 'download', 'app', 'play', 'status',
'resource_key', 'badge', 'upload', 'transcode', 'is_playable', 'has_audio',
),
},
'ios': {
'CACHE_KEY': 'oauth-token-ios',
'CACHE_ONLY': True,
'VIEWER_JWT': False,
'REQUIRES_AUTH': False,
'AUTH': 'MTMxNzViY2Y0NDE0YTQ5YzhjZTc0YmU0NjVjNDQxYzNkYWVjOWRlOTpHKzRvMmgzVUh4UkxjdU5FRW80cDNDbDhDWGR5dVJLNUJZZ055dHBHTTB4V1VzaG41bEx1a2hiN0NWYWNUcldSSW53dzRUdFRYZlJEZmFoTTArOTBUZkJHS3R4V2llYU04Qnl1bERSWWxUdXRidjNqR2J4SHFpVmtFSUcyRktuQw==',
'USER_AGENT': 'Vimeo/11.10.0 (com.vimeo; build:250424.164813.0; iOS 18.4.1) Alamofire/5.9.0 VimeoNetworking/5.0.0',
'VIDEOS_FIELDS': (
'uri', 'name', 'description', 'type', 'link', 'player_embed_url', 'duration',
'width', 'language', 'height', 'embed', 'created_time', 'modified_time', 'release_time',
'content_rating', 'content_rating_class', 'rating_mod_locked', 'license', 'config_url',
'embed_player_config_url', 'privacy', 'pictures', 'tags', 'stats', 'categories', 'uploader',
'metadata', 'user', 'files', 'download', 'app', 'play', 'status', 'resource_key', 'badge',
'upload', 'transcode', 'is_playable', 'has_audio',
),
},
'web': {
'VIEWER_JWT': True,
'REQUIRES_AUTH': True,
'USER_AGENT': None,
'VIDEOS_FIELDS': (
'config_url', 'created_time', 'description', 'license',
'metadata.connections.comments.total', 'metadata.connections.likes.total',
'release_time', 'stats.plays',
),
},
}
_oauth_tokens = {}
_viewer_info = None _viewer_info = None
@staticmethod @staticmethod
@ -105,8 +150,8 @@ class VimeoBaseInfoExtractor(InfoExtractor):
raise ExtractorError('Unable to log in') raise ExtractorError('Unable to log in')
def _real_initialize(self): def _real_initialize(self):
if self._LOGIN_REQUIRED and not self._get_cookies('https://vimeo.com').get('vuid'): if self._LOGIN_REQUIRED and not self._get_cookies('https://vimeo.com').get('vimeo'):
self._raise_login_required() self.raise_login_required()
def _get_video_password(self): def _get_video_password(self):
password = self.get_param('videopassword') password = self.get_param('videopassword')
@ -277,52 +322,88 @@ class VimeoBaseInfoExtractor(InfoExtractor):
'_format_sort_fields': ('quality', 'res', 'fps', 'hdr:12', 'source'), '_format_sort_fields': ('quality', 'res', 'fps', 'hdr:12', 'source'),
} }
def _fetch_oauth_token(self): def _fetch_oauth_token(self, client):
if not self._ios_oauth_token: client_config = self._CLIENT_CONFIGS[client]
self._ios_oauth_token = self.cache.load(self._NETRC_MACHINE, self._IOS_OAUTH_CACHE_KEY)
if not self._ios_oauth_token: if client_config['VIEWER_JWT']:
self._ios_oauth_token = self._download_json( return f'jwt {self._fetch_viewer_info()["jwt"]}'
cache_key = client_config['CACHE_KEY']
if not self._oauth_tokens.get(cache_key):
self._oauth_tokens[cache_key] = self.cache.load(self._NETRC_MACHINE, cache_key)
if not self._oauth_tokens.get(cache_key):
if client_config['CACHE_ONLY']:
raise ExtractorError(
f'The {client} client is unable to fetch new OAuth tokens '
f'and is only intended for use with previously cached tokens', expected=True)
self._oauth_tokens[cache_key] = self._download_json(
'https://api.vimeo.com/oauth/authorize/client', None, 'https://api.vimeo.com/oauth/authorize/client', None,
'Fetching OAuth token', 'Failed to fetch OAuth token', f'Fetching {client} OAuth token', f'Failed to fetch {client} OAuth token',
headers={ headers={
'Authorization': f'Basic {self._IOS_CLIENT_AUTH}', 'Authorization': f'Basic {client_config["AUTH"]}',
**self._IOS_CLIENT_HEADERS, 'User-Agent': client_config['USER_AGENT'],
**self._CLIENT_HEADERS,
}, data=urlencode_postdata({ }, data=urlencode_postdata({
'grant_type': 'client_credentials', 'grant_type': 'client_credentials',
'scope': 'private public create edit delete interact upload purchased stats', 'scope': 'private public create edit delete interact upload purchased stats video_files',
}, quote_via=urllib.parse.quote))['access_token'] }, quote_via=urllib.parse.quote))['access_token']
self.cache.store(self._NETRC_MACHINE, self._IOS_OAUTH_CACHE_KEY, self._ios_oauth_token) self.cache.store(self._NETRC_MACHINE, cache_key, self._oauth_tokens[cache_key])
return self._ios_oauth_token return f'Bearer {self._oauth_tokens[cache_key]}'
def _call_videos_api(self, video_id, unlisted_hash=None, path=None, *, force_client=None, query=None, **kwargs):
client = force_client or self._configuration_arg('client', [self._DEFAULT_CLIENT], ie_key=VimeoIE)[0]
if client not in self._CLIENT_CONFIGS:
raise ExtractorError(
f'Unsupported API client "{client}" requested. '
f'Supported clients are: {", ".join(self._CLIENT_CONFIGS)}', expected=True)
client_config = self._CLIENT_CONFIGS[client]
if client_config['REQUIRES_AUTH'] and not self._get_cookies('https://vimeo.com').get('vimeo'):
self.raise_login_required(f'The {client} client requires authentication')
def _call_videos_api(self, video_id, unlisted_hash=None, **kwargs):
return self._download_json( return self._download_json(
join_nonempty(f'https://api.vimeo.com/videos/{video_id}', unlisted_hash, delim=':'), join_nonempty(
video_id, 'Downloading API JSON', headers={ 'https://api.vimeo.com/videos',
'Authorization': f'Bearer {self._fetch_oauth_token()}', join_nonempty(video_id, unlisted_hash, delim=':'),
**self._IOS_CLIENT_HEADERS, path, delim='/'),
}, query={ video_id, f'Downloading {client} API JSON', f'Unable to download {client} API JSON',
'fields': ','.join(( headers=filter_dict({
'config_url', 'embed_player_config_url', 'player_embed_url', 'download', 'play', 'Authorization': self._fetch_oauth_token(client),
'files', 'description', 'license', 'release_time', 'created_time', 'stats.plays', 'User-Agent': client_config['USER_AGENT'],
'metadata.connections.comments.total', 'metadata.connections.likes.total')), **self._CLIENT_HEADERS,
}), query={
'fields': ','.join(client_config['VIDEOS_FIELDS']),
**(query or {}),
}, **kwargs) }, **kwargs)
def _extract_original_format(self, url, video_id, unlisted_hash=None, api_data=None): def _extract_original_format(self, url, video_id, unlisted_hash=None):
# Original/source formats are only available when logged in # Original/source formats are only available when logged in
if not self._get_cookies('https://vimeo.com/').get('vimeo'): if not self._get_cookies('https://vimeo.com/').get('vimeo'):
return return None
query = {'action': 'load_download_config'} policy = self._configuration_arg('original_format_policy', ['auto'], ie_key=VimeoIE)[0]
if unlisted_hash: if policy == 'never':
query['unlisted_hash'] = unlisted_hash return None
download_data = self._download_json(
url, video_id, 'Loading download config JSON', fatal=False, try:
query=query, headers={'X-Requested-With': 'XMLHttpRequest'}, download_data = self._download_json(
expected_status=(403, 404)) or {} url, video_id, 'Loading download config JSON', query=filter_dict({
source_file = download_data.get('source_file') 'action': 'load_download_config',
download_url = try_get(source_file, lambda x: x['download_url']) 'unlisted_hash': unlisted_hash,
}), headers={
'Accept': 'application/json',
'X-Requested-With': 'XMLHttpRequest',
})
except ExtractorError as error:
self.write_debug(f'Unable to load download config JSON: {error.cause}')
download_data = None
source_file = traverse_obj(download_data, ('source_file', {dict})) or {}
download_url = traverse_obj(source_file, ('download_url', {url_or_none}))
if download_url and not source_file.get('is_cold') and not source_file.get('is_defrosting'): if download_url and not source_file.get('is_cold') and not source_file.get('is_defrosting'):
source_name = source_file.get('public_name', 'Original') source_name = source_file.get('public_name', 'Original')
if self._is_valid_url(download_url, video_id, f'{source_name} video'): if self._is_valid_url(download_url, video_id, f'{source_name} video'):
@ -340,8 +421,27 @@ class VimeoBaseInfoExtractor(InfoExtractor):
'quality': 1, 'quality': 1,
} }
original_response = api_data or self._call_videos_api( # Most web client API requests are subject to rate-limiting (429) when logged-in.
video_id, unlisted_hash, fatal=False, expected_status=(403, 404)) # Requesting only the 'privacy' field is NOT rate-limited,
# so first we should check if video even has 'download' formats available
try:
privacy_info = self._call_videos_api(
video_id, unlisted_hash, force_client='web', query={'fields': 'privacy'})
except ExtractorError as error:
self.write_debug(f'Unable to download privacy info: {error.cause}')
return None
if not traverse_obj(privacy_info, ('privacy', 'download', {bool})):
msg = f'{video_id}: Vimeo says this video is not downloadable'
if policy != 'always':
self.write_debug(
f'{msg}, so yt-dlp is not attempting to extract the original/source format. '
f'To try anyways, use --extractor-args "vimeo:original_format_policy=always"')
return None
self.write_debug(f'{msg}; attempting to extract original/source format anyways')
original_response = self._call_videos_api(
video_id, unlisted_hash, force_client='web', query={'fields': 'download'}, fatal=False)
for download_data in traverse_obj(original_response, ('download', ..., {dict})): for download_data in traverse_obj(original_response, ('download', ..., {dict})):
download_url = download_data.get('link') download_url = download_data.get('link')
if not download_url or download_data.get('quality') != 'source': if not download_url or download_data.get('quality') != 'source':
@ -919,6 +1019,92 @@ class VimeoIE(VimeoBaseInfoExtractor):
raise ExtractorError('Wrong video password', expected=True) raise ExtractorError('Wrong video password', expected=True)
return checked return checked
def _get_subtitles(self, video_id, unlisted_hash):
subs = {}
text_tracks = self._call_videos_api(
video_id, unlisted_hash, path='texttracks', query={
'include_transcript': 'true',
'fields': ','.join((
'active', 'display_language', 'id', 'language', 'link', 'name', 'type', 'uri',
)),
}, fatal=False)
for tt in traverse_obj(text_tracks, ('data', lambda _, v: url_or_none(v['link']))):
subs.setdefault(tt.get('language'), []).append({
'url': tt['link'],
'ext': 'vtt',
'name': tt.get('display_language'),
})
return subs
def _parse_api_response(self, video, video_id, unlisted_hash=None):
formats, subtitles = [], {}
seen_urls = set()
duration = traverse_obj(video, ('duration', {int_or_none}))
for file in traverse_obj(video, (
(('play', (None, 'progressive')), 'files', 'download'), lambda _, v: url_or_none(v['link']),
)):
format_url = file['link']
if format_url in seen_urls:
continue
seen_urls.add(format_url)
quality = file.get('quality')
ext = determine_ext(format_url)
if quality == 'hls' or ext == 'm3u8':
fmts, subs = self._extract_m3u8_formats_and_subtitles(
format_url, video_id, 'mp4', m3u8_id='hls', fatal=False)
elif quality == 'dash' or ext == 'mpd':
fmts, subs = self._extract_mpd_formats_and_subtitles(
format_url, video_id, mpd_id='dash', fatal=False)
for fmt in fmts:
fmt['format_id'] = join_nonempty(
*fmt['format_id'].split('-', 2)[:2], int_or_none(fmt.get('tbr')))
else:
fmt = traverse_obj(file, {
'ext': ('type', {mimetype2ext(default='mp4')}),
'vcodec': ('codec', {str.lower}),
'width': ('width', {int_or_none}),
'height': ('height', {int_or_none}),
'filesize': ('size', {int_or_none}),
'fps': ('fps', {int_or_none}),
})
fmt.update({
'url': format_url,
'format_id': join_nonempty(
'http', traverse_obj(file, 'public_name', 'rendition'), quality),
'tbr': try_call(lambda: fmt['filesize'] * 8 / duration / 1024),
})
formats.append(fmt)
continue
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
if traverse_obj(video, ('metadata', 'connections', 'texttracks', 'total', {int})):
self._merge_subtitles(self.extract_subtitles(video_id, unlisted_hash), target=subtitles)
return {
**traverse_obj(video, {
'title': ('name', {str}),
'uploader': ('user', 'name', {str}),
'uploader_id': ('user', 'link', {url_basename}),
'uploader_url': ('user', 'link', {url_or_none}),
'release_timestamp': ('live', 'scheduled_start_time', {int_or_none}),
'thumbnails': ('pictures', 'sizes', lambda _, v: url_or_none(v['link']), {
'url': 'link',
'width': ('width', {int_or_none}),
'height': ('height', {int_or_none}),
}),
}),
'id': video_id,
'duration': duration,
'formats': formats,
'subtitles': subtitles,
'live_status': {
'streaming': 'is_live',
'done': 'was_live',
}.get(traverse_obj(video, ('live', 'status', {str}))),
}
def _extract_from_api(self, video_id, unlisted_hash=None): def _extract_from_api(self, video_id, unlisted_hash=None):
for retry in (False, True): for retry in (False, True):
try: try:
@ -934,10 +1120,13 @@ class VimeoIE(VimeoBaseInfoExtractor):
continue continue
raise raise
info = self._parse_config(self._download_json( if config_url := traverse_obj(video, ('config_url', {url_or_none})):
video['config_url'], video_id), video_id) info = self._parse_config(self._download_json(config_url, video_id), video_id)
else:
info = self._parse_api_response(video, video_id, unlisted_hash)
source_format = self._extract_original_format( source_format = self._extract_original_format(
f'https://vimeo.com/{video_id}', video_id, unlisted_hash, api_data=video) f'https://vimeo.com/{video_id}', video_id, unlisted_hash)
if source_format: if source_format:
info['formats'].append(source_format) info['formats'].append(source_format)