Compare commits

...

7 Commits

Author SHA1 Message Date
bashonly
1d1358d09f
[ie] Add browser impersonation support to more extractors (#16029)
Closes #7001, Closes #7444, Closes #16004
Authored by: bashonly
2026-02-21 19:24:05 +00:00
blauerdorf
1fe0bf23aa
[ie/spankbang] Fix playlist title extraction (#14132)
Closes #14131
Authored by: blauerdorf
2026-02-21 18:57:20 +00:00
blauerdorf
f05e1cd1f1
[ie/spankbang] Support browser impersonation (#14130)
Closes #14129
Authored by: blauerdorf
2026-02-21 18:51:52 +00:00
bashonly
46d5b6f2b7
[ie/learningonscreen] Fix extractor (#16028)
Closes #15934
Authored by: bashonly, 0xvd
2026-02-21 18:27:33 +00:00
LordMZTE
166356d1a1
[ie/opencast] Support oc-p.uni-jena.de URLs (#16026)
Closes #16023
Authored by: LordMZTE
2026-02-21 18:01:34 +00:00
Sipherdrakon
2485653859
[ie/aenetworks] Fix extractor (#14959)
Closes #14578
Authored by: Sipherdrakon
2026-02-21 17:46:59 +00:00
bashonly
f532a91cef
[ie/soundcloud] Support browser impersonation (#16020)
Closes #15660
Authored by: bashonly
2026-02-21 14:50:22 +00:00
8 changed files with 112 additions and 34 deletions

View File

@ -5,10 +5,12 @@ from ..utils import (
ExtractorError, ExtractorError,
GeoRestrictedError, GeoRestrictedError,
int_or_none, int_or_none,
make_archive_id,
remove_start, remove_start,
traverse_obj,
update_url_query, update_url_query,
url_or_none,
) )
from ..utils.traversal import traverse_obj
class AENetworksBaseIE(ThePlatformIE): # XXX: Do not subclass from concrete IE class AENetworksBaseIE(ThePlatformIE): # XXX: Do not subclass from concrete IE
@ -29,6 +31,19 @@ class AENetworksBaseIE(ThePlatformIE): # XXX: Do not subclass from concrete IE
'historyvault.com': (None, 'historyvault', None), 'historyvault.com': (None, 'historyvault', None),
'biography.com': (None, 'biography', None), 'biography.com': (None, 'biography', None),
} }
_GRAPHQL_QUERY = '''
query getUserVideo($videoId: ID!) {
video(id: $videoId) {
title
publicUrl
programId
tvSeasonNumber
tvSeasonEpisodeNumber
series {
title
}
}
}'''
def _extract_aen_smil(self, smil_url, video_id, auth=None): def _extract_aen_smil(self, smil_url, video_id, auth=None):
query = { query = {
@ -73,19 +88,39 @@ class AENetworksBaseIE(ThePlatformIE): # XXX: Do not subclass from concrete IE
def _extract_aetn_info(self, domain, filter_key, filter_value, url): def _extract_aetn_info(self, domain, filter_key, filter_value, url):
requestor_id, brand, software_statement = self._DOMAIN_MAP[domain] requestor_id, brand, software_statement = self._DOMAIN_MAP[domain]
if filter_key == 'canonical':
webpage = self._download_webpage(url, filter_value)
graphql_video_id = self._search_regex(
r'<meta\b[^>]+\bcontent="[^"]*\btpid/(\d+)"', webpage,
'id') or self._html_search_meta('videoId', webpage, 'GraphQL video ID', fatal=True)
else:
graphql_video_id = filter_value
result = self._download_json( result = self._download_json(
f'https://feeds.video.aetnd.com/api/v2/{brand}/videos', 'https://yoga.appsvcs.aetnd.com/', graphql_video_id,
filter_value, query={f'filter[{filter_key}]': filter_value}) query={
result = traverse_obj( 'brand': brand,
result, ('results', 'mode': 'live',
lambda k, v: k == 0 and v[filter_key] == filter_value), 'platform': 'web',
get_all=False) },
if not result: data=json.dumps({
'operationName': 'getUserVideo',
'variables': {
'videoId': graphql_video_id,
},
'query': self._GRAPHQL_QUERY,
}).encode(),
headers={
'Content-Type': 'application/json',
})
result = traverse_obj(result, ('data', 'video', {dict}))
media_url = traverse_obj(result, ('publicUrl', {url_or_none}))
if not media_url:
raise ExtractorError('Show not found in A&E feed (too new?)', expected=True, raise ExtractorError('Show not found in A&E feed (too new?)', expected=True,
video_id=remove_start(filter_value, '/')) video_id=remove_start(filter_value, '/'))
title = result['title'] title = result['title']
video_id = result['id'] video_id = result['programId']
media_url = result['publicUrl']
theplatform_metadata = self._download_theplatform_metadata(self._search_regex( theplatform_metadata = self._download_theplatform_metadata(self._search_regex(
r'https?://link\.theplatform\.com/s/([^?]+)', media_url, 'theplatform_path'), video_id) r'https?://link\.theplatform\.com/s/([^?]+)', media_url, 'theplatform_path'), video_id)
info = self._parse_theplatform_metadata(theplatform_metadata) info = self._parse_theplatform_metadata(theplatform_metadata)
@ -100,9 +135,13 @@ class AENetworksBaseIE(ThePlatformIE): # XXX: Do not subclass from concrete IE
info.update(self._extract_aen_smil(media_url, video_id, auth)) info.update(self._extract_aen_smil(media_url, video_id, auth))
info.update({ info.update({
'title': title, 'title': title,
'series': result.get('seriesName'), 'display_id': graphql_video_id,
'season_number': int_or_none(result.get('tvSeasonNumber')), '_old_archive_ids': [make_archive_id(self, graphql_video_id)],
'episode_number': int_or_none(result.get('tvSeasonEpisodeNumber')), **traverse_obj(result, {
'series': ('series', 'title', {str}),
'season_number': ('tvSeasonNumber', {int_or_none}),
'episode_number': ('tvSeasonEpisodeNumber', {int_or_none}),
}),
}) })
return info return info
@ -116,7 +155,7 @@ class AENetworksIE(AENetworksBaseIE):
(?:shows/[^/?#]+/)?videos/[^/?#]+ (?:shows/[^/?#]+/)?videos/[^/?#]+
)''' )'''
_TESTS = [{ _TESTS = [{
'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1', 'url': 'https://www.history.com/shows/mountain-men/season-1/episode-1',
'info_dict': { 'info_dict': {
'id': '22253814', 'id': '22253814',
'ext': 'mp4', 'ext': 'mp4',
@ -139,11 +178,11 @@ class AENetworksIE(AENetworksBaseIE):
}, },
'params': {'skip_download': 'm3u8'}, 'params': {'skip_download': 'm3u8'},
'add_ie': ['ThePlatform'], 'add_ie': ['ThePlatform'],
'skip': 'Geo-restricted - This content is not available in your location.', 'skip': 'This content requires a valid, unexpired auth token',
}, { }, {
'url': 'http://www.aetv.com/shows/duck-dynasty/season-9/episode-1', 'url': 'https://www.aetv.com/shows/duck-dynasty/season-9/episode-1',
'info_dict': { 'info_dict': {
'id': '600587331957', 'id': '147486',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Inlawful Entry', 'title': 'Inlawful Entry',
'description': 'md5:57c12115a2b384d883fe64ca50529e08', 'description': 'md5:57c12115a2b384d883fe64ca50529e08',
@ -160,6 +199,8 @@ class AENetworksIE(AENetworksBaseIE):
'season_number': 9, 'season_number': 9,
'series': 'Duck Dynasty', 'series': 'Duck Dynasty',
'age_limit': 0, 'age_limit': 0,
'display_id': '600587331957',
'_old_archive_ids': ['aenetworks 600587331957'],
}, },
'params': {'skip_download': 'm3u8'}, 'params': {'skip_download': 'm3u8'},
'add_ie': ['ThePlatform'], 'add_ie': ['ThePlatform'],
@ -186,6 +227,7 @@ class AENetworksIE(AENetworksBaseIE):
}, },
'params': {'skip_download': 'm3u8'}, 'params': {'skip_download': 'm3u8'},
'add_ie': ['ThePlatform'], 'add_ie': ['ThePlatform'],
'skip': '404 Not Found',
}, { }, {
'url': 'https://www.aetv.com/specials/hunting-jonbenets-killer-the-untold-story', 'url': 'https://www.aetv.com/specials/hunting-jonbenets-killer-the-untold-story',
'info_dict': { 'info_dict': {
@ -209,6 +251,7 @@ class AENetworksIE(AENetworksBaseIE):
}, },
'params': {'skip_download': 'm3u8'}, 'params': {'skip_download': 'm3u8'},
'add_ie': ['ThePlatform'], 'add_ie': ['ThePlatform'],
'skip': 'This content requires a valid, unexpired auth token',
}, { }, {
'url': 'http://www.fyi.tv/shows/tiny-house-nation/season-1/episode-8', 'url': 'http://www.fyi.tv/shows/tiny-house-nation/season-1/episode-8',
'only_matching': True, 'only_matching': True,
@ -259,7 +302,7 @@ class AENetworksListBaseIE(AENetworksBaseIE):
domain, slug = self._match_valid_url(url).groups() domain, slug = self._match_valid_url(url).groups()
_, brand, _ = self._DOMAIN_MAP[domain] _, brand, _ = self._DOMAIN_MAP[domain]
playlist = self._call_api(self._RESOURCE, slug, brand, self._FIELDS) playlist = self._call_api(self._RESOURCE, slug, brand, self._FIELDS)
base_url = f'http://watch.{domain}' base_url = f'https://watch.{domain}'
entries = [] entries = []
for item in (playlist.get(self._ITEMS_KEY) or []): for item in (playlist.get(self._ITEMS_KEY) or []):

View File

@ -29,7 +29,7 @@ class LearningOnScreenIE(InfoExtractor):
}] }]
def _real_initialize(self): def _real_initialize(self):
if not self._get_cookies('https://learningonscreen.ac.uk/').get('PHPSESSID-BOB-LIVE'): if not self._get_cookies('https://learningonscreen.ac.uk/').get('PHPSESSID-LOS-LIVE'):
self.raise_login_required(method='session_cookies') self.raise_login_required(method='session_cookies')
def _real_extract(self, url): def _real_extract(self, url):

View File

@ -25,7 +25,7 @@ class MixcloudBaseIE(InfoExtractor):
%s %s
} }
}''' % (lookup_key, username, f', slug: "{slug}"' if slug else '', object_fields), # noqa: UP031 }''' % (lookup_key, username, f', slug: "{slug}"' if slug else '', object_fields), # noqa: UP031
})['data'][lookup_key] }, impersonate=True)['data'][lookup_key]
class MixcloudIE(MixcloudBaseIE): class MixcloudIE(MixcloudBaseIE):

View File

@ -33,7 +33,8 @@ class OpencastBaseIE(InfoExtractor):
vid\.igb\.illinois\.edu| vid\.igb\.illinois\.edu|
cursosabertos\.c3sl\.ufpr\.br| cursosabertos\.c3sl\.ufpr\.br|
mcmedia\.missioncollege\.org| mcmedia\.missioncollege\.org|
clases\.odon\.edu\.uy clases\.odon\.edu\.uy|
oc-p\.uni-jena\.de
)''' )'''
_UUID_RE = r'[\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12}' _UUID_RE = r'[\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12}'
@ -106,7 +107,7 @@ class OpencastBaseIE(InfoExtractor):
class OpencastIE(OpencastBaseIE): class OpencastIE(OpencastBaseIE):
_VALID_URL = rf'''(?x) _VALID_URL = rf'''(?x)
https?://(?P<host>{OpencastBaseIE._INSTANCES_RE})/paella/ui/watch\.html\? https?://(?P<host>{OpencastBaseIE._INSTANCES_RE})/paella[0-9]*/ui/watch\.html\?
(?:[^#]+&)?id=(?P<id>{OpencastBaseIE._UUID_RE})''' (?:[^#]+&)?id=(?P<id>{OpencastBaseIE._UUID_RE})'''
_API_BASE = 'https://%s/search/episode.json?id=%s' _API_BASE = 'https://%s/search/episode.json?id=%s'
@ -131,8 +132,12 @@ class OpencastIE(OpencastBaseIE):
def _real_extract(self, url): def _real_extract(self, url):
host, video_id = self._match_valid_url(url).group('host', 'id') host, video_id = self._match_valid_url(url).group('host', 'id')
return self._parse_mediapackage( response = self._call_api(host, video_id)
self._call_api(host, video_id)['search-results']['result']['mediapackage']) package = traverse_obj(response, (
('search-results', 'result'),
('result', ...), # Path needed for oc-p.uni-jena.de
'mediapackage', {dict}, any)) or {}
return self._parse_mediapackage(package)
class OpencastPlaylistIE(OpencastBaseIE): class OpencastPlaylistIE(OpencastBaseIE):

View File

@ -6,6 +6,7 @@ import re
from .common import InfoExtractor, SearchInfoExtractor from .common import InfoExtractor, SearchInfoExtractor
from ..networking import HEADRequest from ..networking import HEADRequest
from ..networking.exceptions import HTTPError from ..networking.exceptions import HTTPError
from ..networking.impersonate import ImpersonateTarget
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
float_or_none, float_or_none,
@ -833,6 +834,30 @@ class SoundcloudPagedPlaylistBaseIE(SoundcloudBaseIE):
'entries': self._entries(base_url, playlist_id), 'entries': self._entries(base_url, playlist_id),
} }
@functools.cached_property
def _browser_impersonate_target(self):
available_targets = self._downloader._get_available_impersonate_targets()
if not available_targets:
# impersonate=True gives a generic warning when no impersonation targets are available
return True
# Any browser target older than chrome-116 is 403'd by Datadome
MIN_SUPPORTED_TARGET = ImpersonateTarget('chrome', '116', 'windows', '10')
version_as_float = lambda x: float(x.version) if x.version else 0
# Always try to use the newest Chrome target available
filtered = sorted([
target[0] for target in available_targets
if target[0].client == 'chrome' and target[0].os in ('windows', 'macos')
], key=version_as_float)
if not filtered or version_as_float(filtered[-1]) < version_as_float(MIN_SUPPORTED_TARGET):
# All available targets are inadequate or newest available Chrome target is too old, so
# warn the user to upgrade their dependency to a version with the minimum supported target
return MIN_SUPPORTED_TARGET
return filtered[-1]
def _entries(self, url, playlist_id): def _entries(self, url, playlist_id):
# Per the SoundCloud documentation, the maximum limit for a linked partitioning query is 200. # Per the SoundCloud documentation, the maximum limit for a linked partitioning query is 200.
# https://developers.soundcloud.com/blog/offset-pagination-deprecated # https://developers.soundcloud.com/blog/offset-pagination-deprecated
@ -847,7 +872,9 @@ class SoundcloudPagedPlaylistBaseIE(SoundcloudBaseIE):
try: try:
response = self._call_api( response = self._call_api(
url, playlist_id, query=query, headers=self._HEADERS, url, playlist_id, query=query, headers=self._HEADERS,
note=f'Downloading track page {i + 1}') note=f'Downloading track page {i + 1}',
# See: https://github.com/yt-dlp/yt-dlp/issues/15660
impersonate=self._browser_impersonate_target)
break break
except ExtractorError as e: except ExtractorError as e:
# Downloading page may result in intermittent 502 HTTP error # Downloading page may result in intermittent 502 HTTP error

View File

@ -3,6 +3,7 @@ import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
clean_html,
determine_ext, determine_ext,
merge_dicts, merge_dicts,
parse_duration, parse_duration,
@ -12,6 +13,7 @@ from ..utils import (
urlencode_postdata, urlencode_postdata,
urljoin, urljoin,
) )
from ..utils.traversal import find_element, traverse_obj, trim_str
class SpankBangIE(InfoExtractor): class SpankBangIE(InfoExtractor):
@ -122,7 +124,7 @@ class SpankBangIE(InfoExtractor):
}), headers={ }), headers={
'Referer': url, 'Referer': url,
'X-Requested-With': 'XMLHttpRequest', 'X-Requested-With': 'XMLHttpRequest',
}) }, impersonate=True)
for format_id, format_url in stream.items(): for format_id, format_url in stream.items():
if format_url and isinstance(format_url, list): if format_url and isinstance(format_url, list):
@ -178,9 +180,9 @@ class SpankBangPlaylistIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
mobj = self._match_valid_url(url) mobj = self._match_valid_url(url)
playlist_id = mobj.group('id') playlist_id = mobj.group('id')
country = self.get_param('geo_bypass_country') or 'US'
webpage = self._download_webpage( self._set_cookie('.spankbang.com', 'country', country.upper())
url, playlist_id, headers={'Cookie': 'country=US; mobile=on'}) webpage = self._download_webpage(url, playlist_id, impersonate=True)
entries = [self.url_result( entries = [self.url_result(
urljoin(url, mobj.group('path')), urljoin(url, mobj.group('path')),
@ -189,8 +191,8 @@ class SpankBangPlaylistIE(InfoExtractor):
r'<a[^>]+\bhref=(["\'])(?P<path>/?[\da-z]+-(?P<id>[\da-z]+)/playlist/[^"\'](?:(?!\1).)*)\1', r'<a[^>]+\bhref=(["\'])(?P<path>/?[\da-z]+-(?P<id>[\da-z]+)/playlist/[^"\'](?:(?!\1).)*)\1',
webpage)] webpage)]
title = self._html_search_regex( title = traverse_obj(webpage, (
r'<em>([^<]+)</em>\s+playlist\s*<', webpage, 'playlist title', {find_element(tag='h1', attr='data-testid', value='playlist-title')},
fatal=False) {clean_html}, {trim_str(end=' Playlist')}))
return self.playlist_result(entries, playlist_id, title) return self.playlist_result(entries, playlist_id, title)

View File

@ -51,7 +51,8 @@ class TruthIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
status = self._download_json(f'https://truthsocial.com/api/v1/statuses/{video_id}', video_id) status = self._download_json(
f'https://truthsocial.com/api/v1/statuses/{video_id}', video_id, impersonate=True)
uploader_id = strip_or_none(traverse_obj(status, ('account', 'username'))) uploader_id = strip_or_none(traverse_obj(status, ('account', 'username')))
return { return {
'id': video_id, 'id': video_id,

View File

@ -268,7 +268,7 @@ class XHamsterIE(InfoExtractor):
display_id = mobj.group('display_id') or mobj.group('display_id_2') display_id = mobj.group('display_id') or mobj.group('display_id_2')
desktop_url = re.sub(r'^(https?://(?:.+?\.)?)m\.', r'\1', url) desktop_url = re.sub(r'^(https?://(?:.+?\.)?)m\.', r'\1', url)
webpage, urlh = self._download_webpage_handle(desktop_url, video_id) webpage, urlh = self._download_webpage_handle(desktop_url, video_id, impersonate=True)
error = self._html_search_regex( error = self._html_search_regex(
r'<div[^>]+id=["\']videoClosed["\'][^>]*>(.+?)</div>', r'<div[^>]+id=["\']videoClosed["\'][^>]*>(.+?)</div>',