Compare commits

...

7 Commits

Author SHA1 Message Date
bashonly
1d1358d09f
[ie] Add browser impersonation support to more extractors (#16029)
Closes #7001, Closes #7444, Closes #16004
Authored by: bashonly
2026-02-21 19:24:05 +00:00
blauerdorf
1fe0bf23aa
[ie/spankbang] Fix playlist title extraction (#14132)
Closes #14131
Authored by: blauerdorf
2026-02-21 18:57:20 +00:00
blauerdorf
f05e1cd1f1
[ie/spankbang] Support browser impersonation (#14130)
Closes #14129
Authored by: blauerdorf
2026-02-21 18:51:52 +00:00
bashonly
46d5b6f2b7
[ie/learningonscreen] Fix extractor (#16028)
Closes #15934
Authored by: bashonly, 0xvd
2026-02-21 18:27:33 +00:00
LordMZTE
166356d1a1
[ie/opencast] Support oc-p.uni-jena.de URLs (#16026)
Closes #16023
Authored by: LordMZTE
2026-02-21 18:01:34 +00:00
Sipherdrakon
2485653859
[ie/aenetworks] Fix extractor (#14959)
Closes #14578
Authored by: Sipherdrakon
2026-02-21 17:46:59 +00:00
bashonly
f532a91cef
[ie/soundcloud] Support browser impersonation (#16020)
Closes #15660
Authored by: bashonly
2026-02-21 14:50:22 +00:00
8 changed files with 112 additions and 34 deletions

View File

@ -5,10 +5,12 @@ from ..utils import (
ExtractorError,
GeoRestrictedError,
int_or_none,
make_archive_id,
remove_start,
traverse_obj,
update_url_query,
url_or_none,
)
from ..utils.traversal import traverse_obj
class AENetworksBaseIE(ThePlatformIE): # XXX: Do not subclass from concrete IE
@ -29,6 +31,19 @@ class AENetworksBaseIE(ThePlatformIE): # XXX: Do not subclass from concrete IE
'historyvault.com': (None, 'historyvault', None),
'biography.com': (None, 'biography', None),
}
_GRAPHQL_QUERY = '''
query getUserVideo($videoId: ID!) {
video(id: $videoId) {
title
publicUrl
programId
tvSeasonNumber
tvSeasonEpisodeNumber
series {
title
}
}
}'''
def _extract_aen_smil(self, smil_url, video_id, auth=None):
query = {
@ -73,19 +88,39 @@ class AENetworksBaseIE(ThePlatformIE): # XXX: Do not subclass from concrete IE
def _extract_aetn_info(self, domain, filter_key, filter_value, url):
requestor_id, brand, software_statement = self._DOMAIN_MAP[domain]
if filter_key == 'canonical':
webpage = self._download_webpage(url, filter_value)
graphql_video_id = self._search_regex(
r'<meta\b[^>]+\bcontent="[^"]*\btpid/(\d+)"', webpage,
'id') or self._html_search_meta('videoId', webpage, 'GraphQL video ID', fatal=True)
else:
graphql_video_id = filter_value
result = self._download_json(
f'https://feeds.video.aetnd.com/api/v2/{brand}/videos',
filter_value, query={f'filter[{filter_key}]': filter_value})
result = traverse_obj(
result, ('results',
lambda k, v: k == 0 and v[filter_key] == filter_value),
get_all=False)
if not result:
'https://yoga.appsvcs.aetnd.com/', graphql_video_id,
query={
'brand': brand,
'mode': 'live',
'platform': 'web',
},
data=json.dumps({
'operationName': 'getUserVideo',
'variables': {
'videoId': graphql_video_id,
},
'query': self._GRAPHQL_QUERY,
}).encode(),
headers={
'Content-Type': 'application/json',
})
result = traverse_obj(result, ('data', 'video', {dict}))
media_url = traverse_obj(result, ('publicUrl', {url_or_none}))
if not media_url:
raise ExtractorError('Show not found in A&E feed (too new?)', expected=True,
video_id=remove_start(filter_value, '/'))
title = result['title']
video_id = result['id']
media_url = result['publicUrl']
video_id = result['programId']
theplatform_metadata = self._download_theplatform_metadata(self._search_regex(
r'https?://link\.theplatform\.com/s/([^?]+)', media_url, 'theplatform_path'), video_id)
info = self._parse_theplatform_metadata(theplatform_metadata)
@ -100,9 +135,13 @@ class AENetworksBaseIE(ThePlatformIE): # XXX: Do not subclass from concrete IE
info.update(self._extract_aen_smil(media_url, video_id, auth))
info.update({
'title': title,
'series': result.get('seriesName'),
'season_number': int_or_none(result.get('tvSeasonNumber')),
'episode_number': int_or_none(result.get('tvSeasonEpisodeNumber')),
'display_id': graphql_video_id,
'_old_archive_ids': [make_archive_id(self, graphql_video_id)],
**traverse_obj(result, {
'series': ('series', 'title', {str}),
'season_number': ('tvSeasonNumber', {int_or_none}),
'episode_number': ('tvSeasonEpisodeNumber', {int_or_none}),
}),
})
return info
@ -116,7 +155,7 @@ class AENetworksIE(AENetworksBaseIE):
(?:shows/[^/?#]+/)?videos/[^/?#]+
)'''
_TESTS = [{
'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1',
'url': 'https://www.history.com/shows/mountain-men/season-1/episode-1',
'info_dict': {
'id': '22253814',
'ext': 'mp4',
@ -139,11 +178,11 @@ class AENetworksIE(AENetworksBaseIE):
},
'params': {'skip_download': 'm3u8'},
'add_ie': ['ThePlatform'],
'skip': 'Geo-restricted - This content is not available in your location.',
'skip': 'This content requires a valid, unexpired auth token',
}, {
'url': 'http://www.aetv.com/shows/duck-dynasty/season-9/episode-1',
'url': 'https://www.aetv.com/shows/duck-dynasty/season-9/episode-1',
'info_dict': {
'id': '600587331957',
'id': '147486',
'ext': 'mp4',
'title': 'Inlawful Entry',
'description': 'md5:57c12115a2b384d883fe64ca50529e08',
@ -160,6 +199,8 @@ class AENetworksIE(AENetworksBaseIE):
'season_number': 9,
'series': 'Duck Dynasty',
'age_limit': 0,
'display_id': '600587331957',
'_old_archive_ids': ['aenetworks 600587331957'],
},
'params': {'skip_download': 'm3u8'},
'add_ie': ['ThePlatform'],
@ -186,6 +227,7 @@ class AENetworksIE(AENetworksBaseIE):
},
'params': {'skip_download': 'm3u8'},
'add_ie': ['ThePlatform'],
'skip': '404 Not Found',
}, {
'url': 'https://www.aetv.com/specials/hunting-jonbenets-killer-the-untold-story',
'info_dict': {
@ -209,6 +251,7 @@ class AENetworksIE(AENetworksBaseIE):
},
'params': {'skip_download': 'm3u8'},
'add_ie': ['ThePlatform'],
'skip': 'This content requires a valid, unexpired auth token',
}, {
'url': 'http://www.fyi.tv/shows/tiny-house-nation/season-1/episode-8',
'only_matching': True,
@ -259,7 +302,7 @@ class AENetworksListBaseIE(AENetworksBaseIE):
domain, slug = self._match_valid_url(url).groups()
_, brand, _ = self._DOMAIN_MAP[domain]
playlist = self._call_api(self._RESOURCE, slug, brand, self._FIELDS)
base_url = f'http://watch.{domain}'
base_url = f'https://watch.{domain}'
entries = []
for item in (playlist.get(self._ITEMS_KEY) or []):

View File

@ -29,7 +29,7 @@ class LearningOnScreenIE(InfoExtractor):
}]
def _real_initialize(self):
if not self._get_cookies('https://learningonscreen.ac.uk/').get('PHPSESSID-BOB-LIVE'):
if not self._get_cookies('https://learningonscreen.ac.uk/').get('PHPSESSID-LOS-LIVE'):
self.raise_login_required(method='session_cookies')
def _real_extract(self, url):

View File

@ -25,7 +25,7 @@ class MixcloudBaseIE(InfoExtractor):
%s
}
}''' % (lookup_key, username, f', slug: "{slug}"' if slug else '', object_fields), # noqa: UP031
})['data'][lookup_key]
}, impersonate=True)['data'][lookup_key]
class MixcloudIE(MixcloudBaseIE):

View File

@ -33,7 +33,8 @@ class OpencastBaseIE(InfoExtractor):
vid\.igb\.illinois\.edu|
cursosabertos\.c3sl\.ufpr\.br|
mcmedia\.missioncollege\.org|
clases\.odon\.edu\.uy
clases\.odon\.edu\.uy|
oc-p\.uni-jena\.de
)'''
_UUID_RE = r'[\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12}'
@ -106,7 +107,7 @@ class OpencastBaseIE(InfoExtractor):
class OpencastIE(OpencastBaseIE):
_VALID_URL = rf'''(?x)
https?://(?P<host>{OpencastBaseIE._INSTANCES_RE})/paella/ui/watch\.html\?
https?://(?P<host>{OpencastBaseIE._INSTANCES_RE})/paella[0-9]*/ui/watch\.html\?
(?:[^#]+&)?id=(?P<id>{OpencastBaseIE._UUID_RE})'''
_API_BASE = 'https://%s/search/episode.json?id=%s'
@ -131,8 +132,12 @@ class OpencastIE(OpencastBaseIE):
def _real_extract(self, url):
host, video_id = self._match_valid_url(url).group('host', 'id')
return self._parse_mediapackage(
self._call_api(host, video_id)['search-results']['result']['mediapackage'])
response = self._call_api(host, video_id)
package = traverse_obj(response, (
('search-results', 'result'),
('result', ...), # Path needed for oc-p.uni-jena.de
'mediapackage', {dict}, any)) or {}
return self._parse_mediapackage(package)
class OpencastPlaylistIE(OpencastBaseIE):

View File

@ -6,6 +6,7 @@ import re
from .common import InfoExtractor, SearchInfoExtractor
from ..networking import HEADRequest
from ..networking.exceptions import HTTPError
from ..networking.impersonate import ImpersonateTarget
from ..utils import (
ExtractorError,
float_or_none,
@ -833,6 +834,30 @@ class SoundcloudPagedPlaylistBaseIE(SoundcloudBaseIE):
'entries': self._entries(base_url, playlist_id),
}
@functools.cached_property
def _browser_impersonate_target(self):
    """Choose the value passed as ``impersonate=`` for paged API requests.

    Returns ``True`` when the downloader reports no impersonation targets at
    all, the newest available Chrome target on Windows/macOS when it is at
    least chrome-116, and otherwise the minimum supported target itself.
    """
    targets = self._downloader._get_available_impersonate_targets()
    if not targets:
        # impersonate=True gives a generic warning when no impersonation targets are available
        return True

    def numeric_version(target):
        # Targets without a version rank lowest
        if target.version:
            return float(target.version)
        return 0

    # Any browser target older than chrome-116 is 403'd by Datadome
    minimum = ImpersonateTarget('chrome', '116', 'windows', '10')

    # Each entry's first element is the target; keep only desktop Chrome ones
    candidates = []
    for entry in targets:
        candidate = entry[0]
        if candidate.client != 'chrome':
            continue
        if candidate.os not in ('windows', 'macos'):
            continue
        candidates.append(candidate)
    # Ascending by version, so the newest Chrome target ends up last
    candidates.sort(key=numeric_version)

    if not candidates:
        # Nothing adequate is available, so request the minimum supported target
        # in order to warn the user to upgrade their dependency
        return minimum

    newest = candidates[-1]
    if numeric_version(newest) < numeric_version(minimum):
        # Newest available Chrome target is too old; same upgrade warning applies
        return minimum
    return newest
def _entries(self, url, playlist_id):
# Per the SoundCloud documentation, the maximum limit for a linked partitioning query is 200.
# https://developers.soundcloud.com/blog/offset-pagination-deprecated
@ -847,7 +872,9 @@ class SoundcloudPagedPlaylistBaseIE(SoundcloudBaseIE):
try:
response = self._call_api(
url, playlist_id, query=query, headers=self._HEADERS,
note=f'Downloading track page {i + 1}')
note=f'Downloading track page {i + 1}',
# See: https://github.com/yt-dlp/yt-dlp/issues/15660
impersonate=self._browser_impersonate_target)
break
except ExtractorError as e:
# Downloading page may result in intermittent 502 HTTP error

View File

@ -3,6 +3,7 @@ import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
clean_html,
determine_ext,
merge_dicts,
parse_duration,
@ -12,6 +13,7 @@ from ..utils import (
urlencode_postdata,
urljoin,
)
from ..utils.traversal import find_element, traverse_obj, trim_str
class SpankBangIE(InfoExtractor):
@ -122,7 +124,7 @@ class SpankBangIE(InfoExtractor):
}), headers={
'Referer': url,
'X-Requested-With': 'XMLHttpRequest',
})
}, impersonate=True)
for format_id, format_url in stream.items():
if format_url and isinstance(format_url, list):
@ -178,9 +180,9 @@ class SpankBangPlaylistIE(InfoExtractor):
def _real_extract(self, url):
mobj = self._match_valid_url(url)
playlist_id = mobj.group('id')
webpage = self._download_webpage(
url, playlist_id, headers={'Cookie': 'country=US; mobile=on'})
country = self.get_param('geo_bypass_country') or 'US'
self._set_cookie('.spankbang.com', 'country', country.upper())
webpage = self._download_webpage(url, playlist_id, impersonate=True)
entries = [self.url_result(
urljoin(url, mobj.group('path')),
@ -189,8 +191,8 @@ class SpankBangPlaylistIE(InfoExtractor):
r'<a[^>]+\bhref=(["\'])(?P<path>/?[\da-z]+-(?P<id>[\da-z]+)/playlist/[^"\'](?:(?!\1).)*)\1',
webpage)]
title = self._html_search_regex(
r'<em>([^<]+)</em>\s+playlist\s*<', webpage, 'playlist title',
fatal=False)
title = traverse_obj(webpage, (
{find_element(tag='h1', attr='data-testid', value='playlist-title')},
{clean_html}, {trim_str(end=' Playlist')}))
return self.playlist_result(entries, playlist_id, title)

View File

@ -51,7 +51,8 @@ class TruthIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
status = self._download_json(f'https://truthsocial.com/api/v1/statuses/{video_id}', video_id)
status = self._download_json(
f'https://truthsocial.com/api/v1/statuses/{video_id}', video_id, impersonate=True)
uploader_id = strip_or_none(traverse_obj(status, ('account', 'username')))
return {
'id': video_id,

View File

@ -268,7 +268,7 @@ class XHamsterIE(InfoExtractor):
display_id = mobj.group('display_id') or mobj.group('display_id_2')
desktop_url = re.sub(r'^(https?://(?:.+?\.)?)m\.', r'\1', url)
webpage, urlh = self._download_webpage_handle(desktop_url, video_id)
webpage, urlh = self._download_webpage_handle(desktop_url, video_id, impersonate=True)
error = self._html_search_regex(
r'<div[^>]+id=["\']videoClosed["\'][^>]*>(.+?)</div>',