Compare commits

...

11 Commits

Author SHA1 Message Date
rdamas
1c739bf53e
[ie/ERRArhiiv] Add extractor (#15667)
Closes #15663
Authored by: rdamas
2026-01-27 16:53:38 +00:00
bashonly
e08fdaaec2
[ie/franceinfo] Fix extraction (#15704)
Closes #15701
Authored by: bashonly
2026-01-27 15:40:47 +00:00
Romain Reignier
ac3a566434
[ie/franceinfo] Support new domain URLs (#15669)
Closes #13173
Authored by: romainreignier
2026-01-27 14:09:16 +00:00
Alexander Bocken
1f4b26c39f
[ie/TheChosen] Support new URL format (#15687)
Closes #15686
Authored by: AlexBocken
2026-01-27 14:08:22 +00:00
bashonly
14998eef63
[ie/patreon] Extract inlined media (#15498)
Closes #15473
Authored by: bashonly
2026-01-27 12:52:49 +00:00
bashonly
a893774096
[ie/dailymotion] Support browser impersonation (#15697)
Fix 2b61a2a4b20b499d6497c9212207f72a52b922a6

Closes #15526
Authored by: bashonly
2026-01-27 12:47:19 +00:00
nlurker
a810871608
[ie/pbs] Fix extraction (#15083)
Closes #13299
Authored by: nlurker
2026-01-27 12:45:19 +00:00
Md5Lukas
f9a06197f5
[ie/boosty] Improve metadata extraction (#15543)
Authored by: Sytm
2026-01-27 12:39:10 +00:00
Mivik
a421eb06d1
[ie/neteasemusic] Fix merged lyrics extraction (#15052)
Authored by: Mivik
2026-01-27 12:30:11 +00:00
wesson09
bc6ff877dd
[ie/wat.tv] Improve DRM detection (#15659)
Closes #15647
Authored by: wesson09
2026-01-27 12:29:09 +00:00
Subrat Lima
1effa06dbf
[ie/volejtv] Fix and add extractors (#13226)
Closes #13203
Authored by: subrat-lima
2026-01-27 12:22:55 +00:00
11 changed files with 509 additions and 94 deletions

View File

@ -564,7 +564,10 @@ from .eroprofile import (
EroProfileAlbumIE,
EroProfileIE,
)
from .err import ERRJupiterIE
from .err import (
ERRArhiivIE,
ERRJupiterIE,
)
from .ertgr import (
ERTFlixCodenameIE,
ERTFlixIE,
@ -2360,7 +2363,11 @@ from .voicy import (
VoicyChannelIE,
VoicyIE,
)
from .volejtv import VolejTVIE
from .volejtv import (
VolejTVCategoryPlaylistIE,
VolejTVClubPlaylistIE,
VolejTVIE,
)
from .voxmedia import (
VoxMediaIE,
VoxMediaVolumeIE,

View File

@ -21,21 +21,44 @@ class BoostyIE(InfoExtractor):
'url': 'https://boosty.to/kuplinov/posts/e55d050c-e3bb-4873-a7db-ac7a49b40c38',
'info_dict': {
'id': 'd7473824-352e-48e2-ae53-d4aa39459968',
'title': 'phasma_3',
'title': 'Бан? А! Бан! (Phasmophobia)',
'alt_title': 'Бан? А! Бан! (Phasmophobia)',
'channel': 'Kuplinov',
'channel_id': '7958701',
'timestamp': 1655031975,
'upload_date': '20220612',
'release_timestamp': 1655049000,
'release_date': '20220612',
'modified_timestamp': 1668680993,
'modified_date': '20221117',
'modified_timestamp': 1743328648,
'modified_date': '20250330',
'tags': ['куплинов', 'phasmophobia'],
'like_count': int,
'ext': 'mp4',
'duration': 105,
'view_count': int,
'thumbnail': r're:^https://i\.mycdn\.me/videoPreview\?',
'thumbnail': r're:^https://iv\.okcdn\.ru/videoPreview\?',
},
}, {
# single ok_video with truncated title
'url': 'https://boosty.to/kuplinov/posts/cc09b7f9-121e-40b8-9392-4a075ef2ce53',
'info_dict': {
'id': 'fb5ea762-6303-4557-9a17-157947326810',
'title': 'Какая там активность была? Не слышу! Повтори еще пару раз! (Phas',
'alt_title': 'Какая там активность была? Не слышу! Повтори еще пару раз! (Phasmophobia)',
'channel': 'Kuplinov',
'channel_id': '7958701',
'timestamp': 1655031930,
'upload_date': '20220612',
'release_timestamp': 1655048400,
'release_date': '20220612',
'modified_timestamp': 1743328616,
'modified_date': '20250330',
'tags': ['куплинов', 'phasmophobia'],
'like_count': int,
'ext': 'mp4',
'duration': 39,
'view_count': int,
'thumbnail': r're:^https://iv\.okcdn\.ru/videoPreview\?',
},
}, {
# multiple ok_video
@ -109,36 +132,41 @@ class BoostyIE(InfoExtractor):
'thumbnail': r're:^https://i\.mycdn\.me/videoPreview\?',
},
}],
'skip': 'post has been deleted',
}, {
# single external video (youtube)
'url': 'https://boosty.to/denischuzhoy/posts/6094a487-bcec-4cf8-a453-43313b463c38',
'url': 'https://boosty.to/futuremusicproduction/posts/32a8cae2-3252-49da-b285-0e014bc6e565',
'info_dict': {
'id': 'EXelTnve5lY',
'title': 'Послание Президента Федеральному Собранию | Класс народа',
'upload_date': '20210425',
'channel': 'Денис Чужой',
'tags': 'count:10',
'id': '-37FW_YQ3B4',
'title': 'Afro | Deep House FREE FLP',
'media_type': 'video',
'upload_date': '20250829',
'timestamp': 1756466005,
'channel': 'Future Music Production',
'tags': 'count:0',
'like_count': int,
'ext': 'mp4',
'duration': 816,
'ext': 'm4a',
'duration': 170,
'view_count': int,
'thumbnail': r're:^https://i\.ytimg\.com/',
'age_limit': 0,
'availability': 'public',
'categories': list,
'channel_follower_count': int,
'channel_id': 'UCCzVNbWZfYpBfyofCCUD_0w',
'channel_is_verified': bool,
'channel_id': 'UCKVYrFBYmci1e-T8NeHw2qg',
'channel_url': r're:^https://www\.youtube\.com/',
'comment_count': int,
'description': str,
'heatmap': 'count:100',
'live_status': str,
'playable_in_embed': bool,
'uploader': str,
'uploader_id': str,
'uploader_url': r're:^https://www\.youtube\.com/',
},
'expected_warnings': [
'Remote components challenge solver script',
'n challenge solving failed',
],
}]
_MP4_TYPES = ('tiny', 'lowest', 'low', 'medium', 'high', 'full_hd', 'quad_hd', 'ultra_hd')
@ -207,13 +235,14 @@ class BoostyIE(InfoExtractor):
video_id = item.get('id') or post_id
entries.append({
'id': video_id,
'alt_title': post_title,
'formats': self._extract_formats(item.get('playerUrls'), video_id),
**common_metadata,
**traverse_obj(item, {
'title': ('title', {str}),
'duration': ('duration', {int_or_none}),
'view_count': ('viewsCounter', {int_or_none}),
'thumbnail': (('previewUrl', 'defaultPreview'), {url_or_none}),
'thumbnail': (('preview', 'defaultPreview'), {url_or_none}),
}, get_all=False)})
if not entries and not post.get('hasAccess'):

View File

@ -366,8 +366,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
@staticmethod
def _generate_blockbuster_headers():
# Randomize our HTTP header fingerprint to bust the HTTP Error 403 block
# See https://github.com/yt-dlp/yt-dlp/issues/15526
"""Randomize our HTTP header fingerprint to bust the HTTP Error 403 block"""
def random_letters(minimum, maximum):
# Omit vowels so we don't generate valid header names like 'authorization', etc
@ -378,6 +377,43 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
for _ in range(random.randint(2, 8))
}
def _extract_dailymotion_m3u8_formats_and_subtitles(self, media_url, video_id, live=False):
    """Fetch and parse an HLS manifest, escalating through fallback request strategies.

    See https://github.com/yt-dlp/yt-dlp/issues/15526

    The manifest is requested plainly first, then with randomized headers,
    then with Chrome impersonation and finally with Firefox impersonation.
    The first download that succeeds is parsed.

    Returns a ``(formats, subtitles, error)`` tuple. ``error`` is the
    ``orig_msg`` of the most recent failed attempt, or None if no attempt
    failed. When every attempt fails, formats and subtitles are empty; a final
    message that does not contain 'impersonation' is reported as a warning
    and None is returned in its place.
    """
    ERROR_NOTE = 'Unable to download m3u8 information'

    def impersonating(target):
        return {'impersonate': target, 'require_impersonation': True}

    # Built up front, so the randomized headers are generated before the first request
    strategies = (
        ('Downloading m3u8 information', {}),
        ('Retrying m3u8 download with randomized headers', {
            'headers': self._generate_blockbuster_headers(),
        }),
        ('Retrying m3u8 download with Chrome impersonation', impersonating('chrome')),
        ('Retrying m3u8 download with Firefox impersonation', impersonating('firefox')),
    )

    failure = None
    for note, request_kwargs in strategies:
        try:
            m3u8_doc = self._download_webpage(
                media_url, video_id, note, ERROR_NOTE, **request_kwargs)
        except ExtractorError as e:
            failure = e.orig_msg
            self.write_debug(f'{video_id}: {failure}')
        else:
            break
    else:
        # Every strategy failed
        if 'impersonation' in failure:
            # Hand the message back so the caller can decide how to surface it
            return [], {}, failure
        self.report_warning(failure, video_id=video_id)
        return [], {}, None

    formats, subtitles = self._parse_m3u8_formats_and_subtitles(
        m3u8_doc, media_url, 'mp4', m3u8_id='hls', live=live, fatal=False)
    return formats, subtitles, failure
def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url)
video_id, is_playlist, playlist_id = self._match_valid_url(url).group('id', 'is_playlist', 'playlist_id')
@ -431,6 +467,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
is_live = media.get('isOnAir')
formats = []
subtitles = {}
expected_error = None
for quality, media_list in metadata['qualities'].items():
for m in media_list:
@ -439,9 +476,8 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
if not media_url or media_type == 'application/vnd.lumberjack.manifest':
continue
if media_type == 'application/x-mpegURL':
fmt, subs = self._extract_m3u8_formats_and_subtitles(
media_url, video_id, 'mp4', live=is_live, m3u8_id='hls',
fatal=False, headers=self._generate_blockbuster_headers())
fmt, subs, expected_error = self._extract_dailymotion_m3u8_formats_and_subtitles(
media_url, video_id, live=is_live)
formats.extend(fmt)
self._merge_subtitles(subs, target=subtitles)
else:
@ -458,6 +494,10 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
'width': width,
})
formats.append(f)
if not formats and expected_error:
self.raise_no_formats(expected_error, expected=True)
for f in formats:
f['url'] = f['url'].split('#')[0]
if not f.get('fps') and f['format_id'].endswith('@60'):

View File

@ -2,6 +2,7 @@ from .common import InfoExtractor
from ..utils import (
clean_html,
int_or_none,
parse_iso8601,
str_or_none,
url_or_none,
)
@ -222,3 +223,70 @@ class ERRJupiterIE(InfoExtractor):
'episode_id': ('id', {str_or_none}),
}) if data.get('type') == 'episode' else {}),
}
class ERRArhiivIE(InfoExtractor):
    """Extractor for videos on arhiiv.err.ee, backed by its content JSON API."""

    _VALID_URL = r'https://arhiiv\.err\.ee/video/(?:vaata/)?(?P<id>[^/?#]+)'
    _TESTS = [{
        'url': 'https://arhiiv.err.ee/video/kontsertpalad',
        'info_dict': {
            'id': 'kontsertpalad',
            'ext': 'mp4',
            'title': 'Kontsertpalad: 255 | L. Beethoveni sonaat c-moll, "Pateetiline"',
            'description': 'md5:a70f4ff23c3618f3be63f704bccef063',
            'series': 'Kontsertpalad',
            'episode_id': 255,
            'timestamp': 1666152162,
            'upload_date': '20221019',
            'release_year': 1970,
            'modified_timestamp': 1718620982,
            'modified_date': '20240617',
        },
        'params': {'skip_download': 'm3u8'},
    }, {
        'url': 'https://arhiiv.err.ee/video/vaata/koalitsioonileppe-allkirjastamine',
        'info_dict': {
            'id': 'koalitsioonileppe-allkirjastamine',
            'ext': 'mp4',
            'title': 'Koalitsioonileppe allkirjastamine',
            'timestamp': 1710728222,
            'upload_date': '20240318',
            'release_timestamp': 1611532800,
            'release_date': '20210125',
        },
        'params': {'skip_download': 'm3u8'},
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        data = self._download_json(
            f'https://arhiiv.err.ee/api/v1/content/video/{video_id}', video_id)

        # One (source key, manifest extractor) pair per streaming protocol; HLS is tried first
        manifest_handlers = (
            ('hls', lambda manifest_url: self._extract_m3u8_formats_and_subtitles(
                manifest_url, video_id, 'mp4', m3u8_id='hls', fatal=False)),
            ('dash', lambda manifest_url: self._extract_mpd_formats_and_subtitles(
                manifest_url, video_id, mpd_id='dash', fatal=False)),
        )

        formats, subtitles = [], {}
        for source_key, extract_manifest in manifest_handlers:
            manifest_url = traverse_obj(data, ('media', 'src', source_key, {url_or_none}))
            if not manifest_url:
                continue
            fmts, subs = extract_manifest(manifest_url)
            formats.extend(fmts)
            self._merge_subtitles(subs, target=subtitles)

        metadata = traverse_obj(data, ('info', {
            'title': ('title', {str}),
            'series': ('seriesTitle', {str}, filter),
            'series_id': ('seriesId', {str}, filter),
            'episode_id': ('episode', {int_or_none}),
            'description': ('synopsis', {str}, filter),
            'timestamp': ('uploadDate', {parse_iso8601}),
            'modified_timestamp': ('dateModified', {parse_iso8601}),
            'release_timestamp': ('date', {parse_iso8601}),
            'release_year': ('year', {int_or_none}),
        }))

        return {
            'id': video_id,
            'formats': formats,
            'subtitles': subtitles,
            **metadata,
        }

View File

@ -371,15 +371,16 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor):
class FranceTVInfoIE(FranceTVBaseInfoExtractor):
IE_NAME = 'francetvinfo.fr'
_VALID_URL = r'https?://(?:www|mobile|france3-regions)\.francetvinfo\.fr/(?:[^/]+/)*(?P<id>[^/?#&.]+)'
IE_NAME = 'franceinfo'
IE_DESC = 'franceinfo.fr (formerly francetvinfo.fr)'
_VALID_URL = r'https?://(?:www|mobile|france3-regions)\.france(?:tv)?info.fr/(?:[^/?#]+/)*(?P<id>[^/?#&.]+)'
_TESTS = [{
'url': 'https://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-jeudi-22-aout-2019_3561461.html',
'info_dict': {
'id': 'd12458ee-5062-48fe-bfdd-a30d6a01b793',
'ext': 'mp4',
'title': 'Soir 3',
'title': 'Soir 3 - Émission du jeudi 22 août 2019',
'upload_date': '20190822',
'timestamp': 1566510730,
'thumbnail': r're:^https?://.*\.jpe?g$',
@ -398,7 +399,7 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor):
'info_dict': {
'id': '7d204c9e-a2d3-11eb-9e4c-000d3a23d482',
'ext': 'mp4',
'title': 'Covid-19 : une situation catastrophique à New Dehli - Édition du mercredi 21 avril 2021',
'title': 'Journal 20h00 - Covid-19 : une situation catastrophique à New Dehli',
'thumbnail': r're:^https?://.*\.jpe?g$',
'duration': 76,
'timestamp': 1619028518,
@ -438,6 +439,18 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor):
'thumbnail': r're:https://[^/?#]+/v/[^/?#]+/x1080',
},
'add_ie': ['Dailymotion'],
'skip': 'Broken Dailymotion link',
}, {
'url': 'https://www.franceinfo.fr/monde/usa/presidentielle/donald-trump/etats-unis-un-risque-d-embrasement-apres-la-mort-d-un-manifestant_7764542.html',
'info_dict': {
'id': 'f920fcc2-fa20-11f0-ac98-57a09c50f7ce',
'ext': 'mp4',
'title': 'Affaires sensibles - Manifestant tué Le risque d\'embrasement',
'duration': 118,
'thumbnail': r're:https?://.+/.+\.jpg',
'timestamp': 1769367756,
'upload_date': '20260125',
},
}, {
'url': 'http://france3-regions.francetvinfo.fr/limousin/emissions/jt-1213-limousin',
'only_matching': True,
@ -445,6 +458,9 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor):
# "<figure id=" pattern (#28792)
'url': 'https://www.francetvinfo.fr/culture/patrimoine/incendie-de-notre-dame-de-paris/notre-dame-de-paris-de-l-incendie-de-la-cathedrale-a-sa-reconstruction_4372291.html',
'only_matching': True,
}, {
'url': 'https://www.franceinfo.fr/replay-jt/france-2/20-heures/robert-de-niro-portrait-d-un-monument-du-cinema_7245456.html',
'only_matching': True,
}]
def _real_extract(self, url):
@ -460,7 +476,7 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor):
video_id = (
traverse_obj(webpage, (
{find_element(tag='button', attr='data-cy', value='francetv-player-wrapper', html=True)},
{find_element(tag='(button|div)', attr='data-cy', value='francetv-player-wrapper', html=True, regex=True)},
{extract_attributes}, 'id'))
or self._search_regex(
(r'player\.load[^;]+src:\s*["\']([^"\']+)',

View File

@ -104,9 +104,9 @@ class FrontroGroupBaseIE(FrontoBaseIE):
class TheChosenIE(FrontroVideoBaseIE):
_CHANNEL_ID = '12884901895'
_VALID_URL = r'https?://(?:www\.)?watch\.thechosen\.tv/video/(?P<id>[0-9]+)'
_VALID_URL = r'https?://(?:www\.)?watch\.thechosen\.tv/watch/(?P<id>[0-9]+)'
_TESTS = [{
'url': 'https://watch.thechosen.tv/video/184683594325',
'url': 'https://watch.thechosen.tv/watch/184683594325',
'md5': '3f878b689588c71b38ec9943c54ff5b0',
'info_dict': {
'id': '184683594325',
@ -124,7 +124,7 @@ class TheChosenIE(FrontroVideoBaseIE):
'modified_date': str,
},
}, {
'url': 'https://watch.thechosen.tv/video/184683596189',
'url': 'https://watch.thechosen.tv/watch/184683596189',
'md5': 'd581562f9d29ce82f5b7770415334151',
'info_dict': {
'id': '184683596189',
@ -147,7 +147,7 @@ class TheChosenIE(FrontroVideoBaseIE):
class TheChosenGroupIE(FrontroGroupBaseIE):
_CHANNEL_ID = '12884901895'
_VIDEO_EXTRACTOR = TheChosenIE
_VIDEO_URL_TMPL = 'https://watch.thechosen.tv/video/%s'
_VIDEO_URL_TMPL = 'https://watch.thechosen.tv/watch/%s'
_VALID_URL = r'https?://(?:www\.)?watch\.thechosen\.tv/group/(?P<id>[0-9]+)'
_TESTS = [{

View File

@ -156,18 +156,36 @@ class NetEaseMusicIE(NetEaseMusicBaseIE):
'id': '17241424',
'ext': 'mp3',
'title': 'Opus 28',
'upload_date': '20080211',
'timestamp': 1202745600,
'upload_date': '20060912',
'timestamp': 1158076800,
'duration': 263,
'thumbnail': r're:^http.*\.jpg',
'album': 'Piano Solos Vol. 2',
'album': 'Piano Solos, Vol. 2',
'album_artist': 'Dustin O\'Halloran',
'average_rating': int,
'description': '[00:05.00]纯音乐,请欣赏\n',
'description': 'md5:b566b92c55ca348df65d206c5d689576',
'album_artists': ['Dustin O\'Halloran'],
'creators': ['Dustin O\'Halloran'],
'subtitles': {'lyrics': [{'ext': 'lrc'}]},
},
}, {
'url': 'https://music.163.com/#/song?id=2755669231',
'info_dict': {
'id': '2755669231',
'ext': 'mp3',
'title': '十二月-Departure',
'upload_date': '20251111',
'timestamp': 1762876800,
'duration': 188,
'thumbnail': r're:^http.*\.jpg',
'album': '',
'album_artist': 'ひとひら',
'average_rating': int,
'description': 'md5:deee249c8c9c3e2c54ecdab36e87d174',
'album_artists': ['ひとひら'],
'creators': ['ひとひら'],
'subtitles': {'lyrics': [{'ext': 'lrc', 'data': 'md5:d32b4425a5d6c9fa249ca6e803dd0401'}]},
},
}, {
'url': 'https://y.music.163.com/m/song?app_version=8.8.45&id=95670&uct2=sKnvS4+0YStsWkqsPhFijw%3D%3D&dlt=0846',
'md5': 'b896be78d8d34bd7bb665b26710913ff',
@ -241,9 +259,16 @@ class NetEaseMusicIE(NetEaseMusicBaseIE):
'lyrics': [{'data': original, 'ext': 'lrc'}],
}
lyrics_expr = r'(\[[0-9]{2}:[0-9]{2}\.[0-9]{2,}\])([^\n]+)'
original_ts_texts = re.findall(lyrics_expr, original)
translation_ts_dict = dict(re.findall(lyrics_expr, translated))
def collect_lyrics(lrc):
    """Return an iterator of (timestamp, text) pairs for the timed lines of an LRC string.

    Timestamps are normalized to ``[mm:ss.xx]``: a line whose fractional part
    is introduced by a colon (``[mm:ss:xx]``) yields the same key as the
    dotted form. Lines without a matching timestamp or without text are skipped.
    """
    timed_lines = re.finditer(r'\[([0-9]{2}):([0-9]{2})[:.]([0-9]{2,})\]([^\n]+)', lrc)
    return (
        (f'[{line[1]}:{line[2]}.{line[3]}]', line[4])
        for line in timed_lines
    )
original_ts_texts = collect_lyrics(original)
translation_ts_dict = dict(collect_lyrics(translated))
merged = '\n'.join(
join_nonempty(f'{timestamp}{text}', translation_ts_dict.get(timestamp, ''), delim=' / ')

View File

@ -1,6 +1,5 @@
import functools
import itertools
import urllib.parse
from .common import InfoExtractor
from .sproutvideo import VidsIoIE
@ -11,15 +10,23 @@ from ..utils import (
ExtractorError,
clean_html,
determine_ext,
extract_attributes,
float_or_none,
int_or_none,
mimetype2ext,
parse_iso8601,
smuggle_url,
str_or_none,
update_url_query,
url_or_none,
urljoin,
)
from ..utils.traversal import require, traverse_obj, value
from ..utils.traversal import (
find_elements,
require,
traverse_obj,
value,
)
class PatreonBaseIE(InfoExtractor):
@ -121,6 +128,7 @@ class PatreonIE(PatreonBaseIE):
'channel_is_verified': True,
'chapters': 'count:4',
'timestamp': 1423689666,
'media_type': 'video',
},
'params': {
'noplaylist': True,
@ -161,7 +169,7 @@ class PatreonIE(PatreonBaseIE):
'uploader_url': 'https://www.patreon.com/loish',
'description': 'md5:e2693e97ee299c8ece47ffdb67e7d9d2',
'title': 'VIDEO // sketchbook flipthrough',
'uploader': 'Loish ',
'uploader': 'Loish',
'tags': ['sketchbook', 'video'],
'channel_id': '1641751',
'channel_url': 'https://www.patreon.com/loish',
@ -274,8 +282,73 @@ class PatreonIE(PatreonBaseIE):
'channel_id': '9346307',
},
'params': {'getcomments': True},
}, {
# Inlined media in post; uses _extract_from_media_api
'url': 'https://www.patreon.com/posts/scottfalco-146966245',
'info_dict': {
'id': '146966245',
'ext': 'mp4',
'title': 'scottfalco 1080',
'description': 'md5:a3f29bbd0a46b4821ec3400957c98aa2',
'uploader': 'Insanimate',
'uploader_id': '2828146',
'uploader_url': 'https://www.patreon.com/Insanimate',
'channel_id': '6260877',
'channel_url': 'https://www.patreon.com/Insanimate',
'channel_follower_count': int,
'comment_count': int,
'like_count': int,
'duration': 7.833333,
'timestamp': 1767061800,
'upload_date': '20251230',
},
}]
_RETURN_TYPE = 'video'
_HTTP_HEADERS = {
# Must be all-lowercase 'referer' so we can smuggle it to Generic, SproutVideo, and Vimeo.
# patreon.com URLs redirect to www.patreon.com; this matters when requesting mux.com m3u8s
'referer': 'https://www.patreon.com/',
}
def _extract_from_media_api(self, media_id):
    """Build an info dict for a single media item from the ``media/<id>`` API endpoint.

    Returns None when the API response has no attributes or when no format
    could be derived from them.
    """
    attributes = traverse_obj(
        self._call_api(f'media/{media_id}', media_id, fatal=False),
        ('data', 'attributes', {dict}))
    if not attributes:
        return None

    info_dict = {
        **traverse_obj(attributes, {
            'title': ('file_name', {lambda x: x.rpartition('.')[0]}),
            'timestamp': ('created_at', {parse_iso8601}),
            'duration': ('display', 'duration', {float_or_none}),
        }),
        'id': media_id,
    }

    playback_url = traverse_obj(
        attributes, ('display', (None, 'viewer_playback_data'), 'url', {url_or_none}, any))
    download_url = traverse_obj(attributes, ('download_url', {url_or_none}))

    if playback_url and mimetype2ext(attributes.get('mimetype')) == 'm3u8':
        formats, subtitles = self._extract_m3u8_formats_and_subtitles(
            playback_url, media_id, 'mp4', fatal=False, headers=self._HTTP_HEADERS)
        for fmt in formats:
            fmt['http_headers'] = self._HTTP_HEADERS
        transcript_url = traverse_obj(attributes, ('display', 'transcript_url', {url_or_none}))
        if transcript_url:
            subtitles.setdefault('en', []).append({
                'url': transcript_url,
                'ext': 'vtt',
            })
        info_dict['formats'] = formats
        info_dict['subtitles'] = subtitles
    elif playback_url or download_url:
        is_video = attributes.get('media_type') == 'video'
        info_dict['formats'] = [{
            # If playback_url is available, download_url is a duplicate lower resolution format
            'url': playback_url or download_url,
            'vcodec': None if is_video else 'none',
        }]

    # Without any format there is nothing to return
    return info_dict if info_dict.get('formats') else None
def _real_extract(self, url):
video_id = self._match_id(url)
@ -299,6 +372,7 @@ class PatreonIE(PatreonBaseIE):
'comment_count': ('comment_count', {int_or_none}),
})
seen_media_ids = set()
entries = []
idx = 0
for include in traverse_obj(post, ('included', lambda _, v: v['type'])):
@ -320,6 +394,8 @@ class PatreonIE(PatreonBaseIE):
'url': download_url,
'alt_title': traverse_obj(media_attributes, ('file_name', {str})),
})
if media_id := traverse_obj(include, ('id', {str})):
seen_media_ids.add(media_id)
elif include_type == 'user':
info.update(traverse_obj(include, {
@ -340,34 +416,29 @@ class PatreonIE(PatreonBaseIE):
'channel_follower_count': ('attributes', 'patron_count', {int_or_none}),
}))
# Must be all-lowercase 'referer' so we can smuggle it to Generic, SproutVideo, and Vimeo.
# patreon.com URLs redirect to www.patreon.com; this matters when requesting mux.com m3u8s
headers = {'referer': 'https://www.patreon.com/'}
if embed_url := traverse_obj(attributes, ('embed', 'url', {url_or_none})):
# Convert useless vimeo.com URLs to useful player.vimeo.com embed URLs
vimeo_id, vimeo_hash = self._search_regex(
r'//vimeo\.com/(\d+)(?:/([\da-f]+))?', embed_url,
'vimeo id', group=(1, 2), default=(None, None))
if vimeo_id:
embed_url = update_url_query(
f'https://player.vimeo.com/video/{vimeo_id}',
{'h': vimeo_hash or []})
if VimeoIE.suitable(embed_url):
entry = self.url_result(
VimeoIE._smuggle_referrer(embed_url, self._HTTP_HEADERS['referer']),
VimeoIE, url_transparent=True)
else:
entry = self.url_result(smuggle_url(embed_url, self._HTTP_HEADERS))
# handle Vimeo embeds
if traverse_obj(attributes, ('embed', 'provider')) == 'Vimeo':
v_url = urllib.parse.unquote(self._html_search_regex(
r'(https(?:%3A%2F%2F|://)player\.vimeo\.com.+app_id(?:=|%3D)+\d+)',
traverse_obj(attributes, ('embed', 'html', {str})), 'vimeo url', fatal=False) or '')
if url_or_none(v_url) and self._request_webpage(
v_url, video_id, 'Checking Vimeo embed URL', headers=headers,
fatal=False, errnote=False, expected_status=429): # 429 is TLS fingerprint rejection
entries.append(self.url_result(
VimeoIE._smuggle_referrer(v_url, headers['referer']),
VimeoIE, url_transparent=True))
embed_url = traverse_obj(attributes, ('embed', 'url', {url_or_none}))
if embed_url and (urlh := self._request_webpage(
embed_url, video_id, 'Checking embed URL', headers=headers,
fatal=False, errnote=False, expected_status=403)):
# Vimeo's Cloudflare anti-bot protection will return HTTP status 200 for 404, so we need
# to check for "Sorry, we couldn&amp;rsquo;t find that page" in the meta description tag
meta_description = clean_html(self._html_search_meta(
'description', self._webpage_read_content(urlh, embed_url, video_id, fatal=False), default=None))
# Password-protected vids.io embeds return 403 errors w/o --video-password or session cookie
if ((urlh.status != 403 and meta_description != 'Sorry, we couldnt find that page')
or VidsIoIE.suitable(embed_url)):
entries.append(self.url_result(smuggle_url(embed_url, headers)))
if urlh := self._request_webpage(
embed_url, video_id, 'Checking embed URL', headers=self._HTTP_HEADERS,
fatal=False, errnote=False, expected_status=(403, 429), # Ignore Vimeo 429's
):
# Password-protected vids.io embeds return 403 errors w/o --video-password or session cookie
if VidsIoIE.suitable(embed_url) or urlh.status != 403:
entries.append(entry)
post_file = traverse_obj(attributes, ('post_file', {dict}))
if post_file:
@ -381,13 +452,27 @@ class PatreonIE(PatreonBaseIE):
})
elif name == 'video' or determine_ext(post_file.get('url')) == 'm3u8':
formats, subtitles = self._extract_m3u8_formats_and_subtitles(
post_file['url'], video_id, headers=headers)
post_file['url'], video_id, headers=self._HTTP_HEADERS)
for f in formats:
f['http_headers'] = self._HTTP_HEADERS
entries.append({
'id': video_id,
'formats': formats,
'subtitles': subtitles,
'http_headers': headers,
})
if media_id := traverse_obj(post_file, ('media_id', {int}, {str_or_none})):
seen_media_ids.add(media_id)
for media_id in traverse_obj(attributes, (
'content', {find_elements(attr='data-media-id', value=r'\d+', regex=True, html=True)},
..., {extract_attributes}, 'data-media-id',
)):
# Inlined media may be duplicates of what was extracted above
if media_id in seen_media_ids:
continue
if media := self._extract_from_media_api(media_id):
entries.append(media)
seen_media_ids.add(media_id)
can_view_post = traverse_obj(attributes, 'current_user_can_view')
comments = None

View File

@ -453,6 +453,23 @@ class PBSIE(InfoExtractor):
'url': 'https://player.pbs.org/portalplayer/3004638221/?uid=',
'only_matching': True,
},
{
# Next.js v13+, see https://github.com/yt-dlp/yt-dlp/issues/13299
'url': 'https://www.pbs.org/video/caregiving',
'info_dict': {
'id': '3101776876',
'ext': 'mp4',
'title': 'Caregiving - Caregiving',
'description': 'A documentary revealing Americas caregiving crisis through intimate stories and expert insight.',
'display_id': 'caregiving',
'duration': 6783,
'thumbnail': 'https://image.pbs.org/video-assets/BSrSkcc-asset-mezzanine-16x9-nlcxQts.jpg',
'chapters': [],
},
'params': {
'skip_download': True,
},
},
]
_ERRORS = {
101: 'We\'re sorry, but this video is not yet available.',
@ -506,6 +523,7 @@ class PBSIE(InfoExtractor):
r"(?s)window\.PBS\.playerConfig\s*=\s*{.*?id\s*:\s*'([0-9]+)',",
r'<div[^>]+\bdata-cove-id=["\'](\d+)"', # http://www.pbs.org/wgbh/roadshow/watch/episode/2105-indianapolis-hour-2/
r'<iframe[^>]+\bsrc=["\'](?:https?:)?//video\.pbs\.org/widget/partnerplayer/(\d+)', # https://www.pbs.org/wgbh/masterpiece/episodes/victoria-s2-e1/
r'\\"videoTPMediaId\\":\\\"(\d+)\\"', # Next.js v13, e.g. https://www.pbs.org/video/caregiving
r'\bhttps?://player\.pbs\.org/[\w-]+player/(\d+)', # last pattern to avoid false positives
]

View File

@ -1,40 +1,167 @@
import functools
from .common import InfoExtractor
from ..utils import (
InAdvancePagedList,
int_or_none,
join_nonempty,
orderedSet,
str_or_none,
strftime_or_none,
unified_timestamp,
url_or_none,
)
from ..utils.traversal import (
require,
traverse_obj,
)
class VolejTVIE(InfoExtractor):
_VALID_URL = r'https?://volej\.tv/video/(?P<id>\d+)'
class VolejTVBaseIE(InfoExtractor):
    """Base for the volej.tv extractors: API helper plus a quality-to-height table."""

    # Maps a quality value (as a string) to the corresponding frame height
    TBR_HEIGHT_MAPPING = {
        '6000': 1080,
        '2400': 720,
        '1500': 480,
        '800': 360,
    }

    def _call_api(self, endpoint, display_id, query=None):
        """Download and return the JSON response of the given API endpoint."""
        api_url = f'https://api-volejtv-prod.apps.okd4.devopsie.cloud/api/{endpoint}'
        return self._download_json(api_url, display_id, query=query)
class VolejTVIE(VolejTVBaseIE):
IE_NAME = 'volejtv:match'
_VALID_URL = r'https?://volej\.tv/match/(?P<id>\d+)'
_TESTS = [{
'url': 'https://volej.tv/video/725742/',
'url': 'https://volej.tv/match/270579',
'info_dict': {
'id': '725742',
'id': '270579',
'ext': 'mp4',
'description': 'Zápas VK Královo Pole vs VK Prostějov 10.12.2022 v 19:00 na Volej.TV',
'thumbnail': 'https://volej.tv/images/og/16/17186/og.png',
'title': 'VK Královo Pole vs VK Prostějov',
'title': 'SWE-CZE (2024-06-16)',
'categories': ['ženy'],
'series': 'ZLATÁ EVROPSKÁ VOLEJBALOVÁ LIGA',
'season': '2023-2024',
'timestamp': 1718553600,
'upload_date': '20240616',
},
}, {
'url': 'https://volej.tv/video/725605/',
'url': 'https://volej.tv/match/487520',
'info_dict': {
'id': '725605',
'id': '487520',
'ext': 'mp4',
'thumbnail': 'https://volej.tv/images/og/15/17185/og.png',
'title': 'VK Lvi Praha vs VK Euro Sitex Příbram',
'description': 'Zápas VK Lvi Praha vs VK Euro Sitex Příbram 11.12.2022 v 19:00 na Volej.TV',
'thumbnail': r're:https://.+\.(png|jpeg)',
'title': 'FRA-CZE (2024-09-06)',
'categories': ['mládež'],
'series': 'Mistrovství Evropy do 20 let',
'season': '2024-2025',
'timestamp': 1725627600,
'upload_date': '20240906',
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
json_data = self._search_json(
r'<\s*!\[CDATA[^=]+=', webpage, 'CDATA', video_id)
formats, subtitle = self._extract_m3u8_formats_and_subtitles(
json_data['urls']['hls'], video_id)
return {
json_data = self._call_api(f'match/{video_id}', video_id)
formats = []
for video in traverse_obj(json_data, ('videos', 0, 'qualities', lambda _, v: url_or_none(v['cloud_front_path']))):
formats.append(traverse_obj(video, {
'url': 'cloud_front_path',
'tbr': ('quality', {int_or_none}),
'format_id': ('id', {str_or_none}),
'height': ('quality', {self.TBR_HEIGHT_MAPPING.get}),
}))
data = {
'id': video_id,
'title': self._html_search_meta(['og:title', 'twitter:title'], webpage),
'thumbnail': self._html_search_meta(['og:image', 'twitter:image'], webpage),
'description': self._html_search_meta(['description', 'og:description', 'twitter:description'], webpage),
**traverse_obj(json_data, {
'series': ('competition_name', {str}),
'season': ('season', {str}),
'timestamp': ('match_time', {unified_timestamp}),
'categories': ('category', ('title'), {str}, filter, all, filter),
'thumbnail': ('poster', {url_or_none}),
}),
'formats': formats,
'subtitles': subtitle,
}
teams = orderedSet(traverse_obj(json_data, ('teams', ..., 'shortcut', {str})))
if len(teams) > 2 and 'FIN' in teams:
teams.remove('FIN')
data['title'] = join_nonempty(
join_nonempty(*teams, delim='-'),
strftime_or_none(data.get('timestamp'), '(%Y-%m-%d)'),
delim=' ')
return data
class VolejTVPlaylistBaseIE(VolejTVBaseIE):
    """Subclasses must set _API_FILTER, _PAGE_SIZE"""

    def _get_page(self, playlist_id, page):
        """Fetch one page of matches; `page` is zero-based, the API query sends page + 1."""
        endpoint = f'match/{self._API_FILTER}/{playlist_id}'
        query = {'page': page + 1, 'take': self._PAGE_SIZE, 'order': 'DESC'}
        return self._call_api(endpoint, playlist_id, query=query)

    def _entries(self, playlist_id, first_page_data, page):
        """Yield a url_result for each match on `page`, reusing the prefetched first page."""
        if page == 0:
            page_data = first_page_data
        else:
            page_data = self._get_page(playlist_id, page)
        yield from (
            self.url_result(f'https://volej.tv/match/{match_id}', VolejTVIE)
            for match_id in traverse_obj(page_data, ('data', ..., 'id'))
        )
class VolejTVClubPlaylistIE(VolejTVPlaylistBaseIE):
    """Playlist of all matches listed for a club page (volej.tv/klub/<id>)."""

    IE_NAME = 'volejtv:club'
    _VALID_URL = r'https?://volej\.tv/klub/(?P<id>\d+)'
    _API_FILTER = 'by-team-id-paginated'
    _PAGE_SIZE = 6
    _TESTS = [{
        'url': 'https://volej.tv/klub/1173',
        'info_dict': {
            'id': '1173',
            'title': 'VK Jihostroj České Budějovice',
        },
        'playlist_mincount': 30,
    }]

    def _real_extract(self, url):
        playlist_id = self._match_id(url)
        title = self._call_api(f'team/show/{playlist_id}', playlist_id)['title']

        # The first page is fetched eagerly because it carries the page count
        first_page_data = self._get_page(playlist_id, 0)
        total_pages = traverse_obj(
            first_page_data, ('meta', 'pageCount', {int}, {require('page count')}))

        fetch_page = functools.partial(self._entries, playlist_id, first_page_data)
        entries = InAdvancePagedList(fetch_page, total_pages, self._PAGE_SIZE)
        return self.playlist_result(entries, playlist_id, title)
class VolejTVCategoryPlaylistIE(VolejTVPlaylistBaseIE):
    """Playlist of all matches listed for a category page (volej.tv/kategorie/<slug>)."""

    IE_NAME = 'volejtv:category'
    # The slug stops at '/', '?' or '#', so a query string or fragment never leaks into the id
    _VALID_URL = r'https?://volej\.tv/kategorie/(?P<id>[^/?#]+)'
    _TESTS = [{
        'url': 'https://volej.tv/kategorie/chance-cesky-pohar',
        'info_dict': {
            'id': 'chance-cesky-pohar',
            'title': 'Chance Český pohár',
        },
        'playlist_mincount': 30,
    }]
    _API_FILTER = 'by-category-id-paginated'
    _PAGE_SIZE = 10

    def _get_category(self, playlist_id):
        """Resolve a category slug to its ``(id, title)`` pair.

        Only categories with a truthy slug, id and title are considered.
        Raises ExtractorError (via `require`) when no category matches the
        slug, instead of returning None and failing at the caller's unpacking.
        """
        categories = self._call_api('category', playlist_id)
        category = traverse_obj(categories, (
            lambda _, v: v['slug'] == playlist_id and v['id'] and v['title'],
            any, {require('category')}))
        return category['id'], category['title']

    def _real_extract(self, url):
        playlist_id = self._match_id(url)
        # The paginated API is keyed by the numeric category id, not by the URL slug
        category_id, title = self._get_category(playlist_id)
        first_page_data = self._get_page(category_id, 0)
        total_pages = traverse_obj(
            first_page_data, ('meta', 'pageCount', {int}, {require('page count')}))
        return self.playlist_result(InAdvancePagedList(
            functools.partial(self._entries, category_id, first_page_data),
            total_pages, self._PAGE_SIZE), playlist_id, title)

View File

@ -76,7 +76,7 @@ class WatIE(InfoExtractor):
if error_code == 'GEOBLOCKED':
self.raise_geo_restricted(error_desc, video_info.get('geoList'))
elif error_code == 'DELIVERY_ERROR':
if traverse_obj(video_data, ('delivery', 'code')) == 500:
if traverse_obj(video_data, ('delivery', 'code')) in (403, 500):
self.report_drm(video_id)
error_desc = join_nonempty(
error_desc, traverse_obj(video_data, ('delivery', 'error', {str})), delim=': ')