Compare commits

...

5 Commits

Author SHA1 Message Date
doe1080
7794374de8
[ie/twitter:broadcast] Support events URLs (#13248)
Closes #12989
Authored by: doe1080
2025-05-23 19:25:56 +00:00
bashonly
538eb30567
[ie/podchaser] Fix extractor (#13271)
Closes #13269
Authored by: bashonly
2025-05-23 17:42:24 +00:00
doe1080
f8051e3a61
[ie/toutiao] Add extractor (#13246)
Closes #12125
Authored by: doe1080
2025-05-23 17:29:55 +00:00
bashonly
52f9729c9a
[ie/twitcasting] Fix password-protected livestream support (#13097)
Closes #13096
Authored by: bashonly
2025-05-23 12:58:53 +00:00
bashonly
1a8a03ea8d
[ie/patreon] Fix referer header used for embeds (#13276)
Fix e0d6c0822930f6e63f574d46d946a58b73ecd10c

Closes #13263
Authored by: bashonly
2025-05-23 12:53:36 +00:00
6 changed files with 222 additions and 32 deletions

View File

@ -2147,6 +2147,7 @@ from .toggle import (
from .toggo import ToggoIE from .toggo import ToggoIE
from .tonline import TOnlineIE from .tonline import TOnlineIE
from .toongoggles import ToonGogglesIE from .toongoggles import ToonGogglesIE
from .toutiao import ToutiaoIE
from .toutv import TouTvIE from .toutv import TouTvIE
from .toypics import ( from .toypics import (
ToypicsIE, ToypicsIE,

View File

@ -340,8 +340,9 @@ class PatreonIE(PatreonBaseIE):
'channel_follower_count': ('attributes', 'patron_count', {int_or_none}), 'channel_follower_count': ('attributes', 'patron_count', {int_or_none}),
})) }))
# all-lowercase 'referer' so we can smuggle it to Generic, SproutVideo, Vimeo # Must be all-lowercase 'referer' so we can smuggle it to Generic, SproutVideo, and Vimeo.
headers = {'referer': url} # patreon.com URLs redirect to www.patreon.com; this matters when requesting mux.com m3u8s
headers = {'referer': 'https://www.patreon.com/'}
# handle Vimeo embeds # handle Vimeo embeds
if traverse_obj(attributes, ('embed', 'provider')) == 'Vimeo': if traverse_obj(attributes, ('embed', 'provider')) == 'Vimeo':
@ -352,7 +353,7 @@ class PatreonIE(PatreonBaseIE):
v_url, video_id, 'Checking Vimeo embed URL', headers=headers, v_url, video_id, 'Checking Vimeo embed URL', headers=headers,
fatal=False, errnote=False, expected_status=429): # 429 is TLS fingerprint rejection fatal=False, errnote=False, expected_status=429): # 429 is TLS fingerprint rejection
entries.append(self.url_result( entries.append(self.url_result(
VimeoIE._smuggle_referrer(v_url, 'https://patreon.com/'), VimeoIE._smuggle_referrer(v_url, headers['referer']),
VimeoIE, url_transparent=True)) VimeoIE, url_transparent=True))
embed_url = traverse_obj(attributes, ('embed', 'url', {url_or_none})) embed_url = traverse_obj(attributes, ('embed', 'url', {url_or_none}))

View File

@ -5,11 +5,13 @@ from .common import InfoExtractor
from ..utils import ( from ..utils import (
OnDemandPagedList, OnDemandPagedList,
float_or_none, float_or_none,
int_or_none,
orderedSet,
str_or_none, str_or_none,
str_to_int,
traverse_obj,
unified_timestamp, unified_timestamp,
url_or_none,
) )
from ..utils.traversal import require, traverse_obj
class PodchaserIE(InfoExtractor): class PodchaserIE(InfoExtractor):
@ -21,24 +23,25 @@ class PodchaserIE(InfoExtractor):
'id': '104365585', 'id': '104365585',
'title': 'Ep. 285 freeze me off', 'title': 'Ep. 285 freeze me off',
'description': 'cam ahn', 'description': 'cam ahn',
'thumbnail': r're:^https?://.*\.jpg$', 'thumbnail': r're:https?://.+/.+\.jpg',
'ext': 'mp3', 'ext': 'mp3',
'categories': ['Comedy'], 'categories': ['Comedy', 'News', 'Politics', 'Arts'],
'tags': ['comedy', 'dark humor'], 'tags': ['comedy', 'dark humor'],
'series': 'Cum Town', 'series': 'The Adam Friedland Show Podcast',
'duration': 3708, 'duration': 3708,
'timestamp': 1636531259, 'timestamp': 1636531259,
'upload_date': '20211110', 'upload_date': '20211110',
'average_rating': 4.0, 'average_rating': 4.0,
'series_id': '36924',
}, },
}, { }, {
'url': 'https://www.podchaser.com/podcasts/the-bone-zone-28853', 'url': 'https://www.podchaser.com/podcasts/the-bone-zone-28853',
'info_dict': { 'info_dict': {
'id': '28853', 'id': '28853',
'title': 'The Bone Zone', 'title': 'The Bone Zone',
'description': 'Podcast by The Bone Zone', 'description': r're:The official home of the Bone Zone podcast.+',
}, },
'playlist_count': 275, 'playlist_mincount': 275,
}, { }, {
'url': 'https://www.podchaser.com/podcasts/sean-carrolls-mindscape-scienc-699349/episodes', 'url': 'https://www.podchaser.com/podcasts/sean-carrolls-mindscape-scienc-699349/episodes',
'info_dict': { 'info_dict': {
@ -51,19 +54,33 @@ class PodchaserIE(InfoExtractor):
@staticmethod @staticmethod
def _parse_episode(episode, podcast): def _parse_episode(episode, podcast):
return { info = traverse_obj(episode, {
'id': str(episode.get('id')), 'id': ('id', {int}, {str_or_none}, {require('episode ID')}),
'title': episode.get('title'), 'title': ('title', {str}),
'description': episode.get('description'), 'description': ('description', {str}),
'url': episode.get('audio_url'), 'url': ('audio_url', {url_or_none}),
'thumbnail': episode.get('image_url'), 'thumbnail': ('image_url', {url_or_none}),
'duration': str_to_int(episode.get('length')), 'duration': ('length', {int_or_none}),
'timestamp': unified_timestamp(episode.get('air_date')), 'timestamp': ('air_date', {unified_timestamp}),
'average_rating': float_or_none(episode.get('rating')), 'average_rating': ('rating', {float_or_none}),
'categories': list(set(traverse_obj(podcast, (('summary', None), 'categories', ..., 'text')))), })
'tags': traverse_obj(podcast, ('tags', ..., 'text')), info.update(traverse_obj(podcast, {
'series': podcast.get('title'), 'series': ('title', {str}),
} 'series_id': ('id', {int}, {str_or_none}),
'categories': (('summary', None), 'categories', ..., 'text', {str}, filter, all, {orderedSet}),
'tags': ('tags', ..., 'text', {str}),
}))
info['vcodec'] = 'none'
if info.get('series_id'):
podcast_slug = traverse_obj(podcast, ('slug', {str})) or 'podcast'
episode_slug = traverse_obj(episode, ('slug', {str})) or 'episode'
info['webpage_url'] = '/'.join((
'https://www.podchaser.com/podcasts',
'-'.join((podcast_slug[:30].rstrip('-'), info['series_id'])),
'-'.join((episode_slug[:30].rstrip('-'), info['id']))))
return info
def _call_api(self, path, *args, **kwargs): def _call_api(self, path, *args, **kwargs):
return self._download_json(f'https://api.podchaser.com/{path}', *args, **kwargs) return self._download_json(f'https://api.podchaser.com/{path}', *args, **kwargs)
@ -93,5 +110,5 @@ class PodchaserIE(InfoExtractor):
OnDemandPagedList(functools.partial(self._fetch_page, podcast_id, podcast), self._PAGE_SIZE), OnDemandPagedList(functools.partial(self._fetch_page, podcast_id, podcast), self._PAGE_SIZE),
str_or_none(podcast.get('id')), podcast.get('title'), podcast.get('description')) str_or_none(podcast.get('id')), podcast.get('title'), podcast.get('description'))
episode = self._call_api(f'episodes/{episode_id}', episode_id) episode = self._call_api(f'podcasts/{podcast_id}/episodes/{episode_id}/player_ids', episode_id)
return self._parse_episode(episode, podcast) return self._parse_episode(episode, podcast)

121
yt_dlp/extractor/toutiao.py Normal file
View File

@ -0,0 +1,121 @@
import json
import urllib.parse
from .common import InfoExtractor
from ..utils import (
float_or_none,
int_or_none,
str_or_none,
try_call,
url_or_none,
)
from ..utils.traversal import find_element, traverse_obj
class ToutiaoIE(InfoExtractor):
IE_NAME = 'toutiao'
IE_DESC = '今日头条'
_VALID_URL = r'https?://www\.toutiao\.com/video/(?P<id>\d+)/?(?:[?#]|$)'
_TESTS = [{
'url': 'https://www.toutiao.com/video/7505382061495176511/',
'info_dict': {
'id': '7505382061495176511',
'ext': 'mp4',
'title': '新疆多地现不明飞行物,目击者称和月亮一样亮,几秒内突然加速消失,气象部门回应',
'comment_count': int,
'duration': 9.753,
'like_count': int,
'release_date': '20250517',
'release_timestamp': 1747483344,
'thumbnail': r're:https?://p\d+-sign\.toutiaoimg\.com/.+$',
'uploader': '极目新闻',
'uploader_id': 'MS4wLjABAAAAeateBb9Su8I3MJOZozmvyzWktmba5LMlliRDz1KffnM',
'view_count': int,
},
}, {
'url': 'https://www.toutiao.com/video/7479446610359878153/',
'info_dict': {
'id': '7479446610359878153',
'ext': 'mp4',
'title': '小伙竟然利用两块磁铁制作成磁力减震器,简直太有创意了!',
'comment_count': int,
'duration': 118.374,
'like_count': int,
'release_date': '20250308',
'release_timestamp': 1741444368,
'thumbnail': r're:https?://p\d+-sign\.toutiaoimg\.com/.+$',
'uploader': '小莉创意发明',
'uploader_id': 'MS4wLjABAAAA4f7d4mwtApALtHIiq-QM20dwXqe32NUz0DeWF7wbHKw',
'view_count': int,
},
}]
def _real_initialize(self):
if self._get_cookies('https://www.toutiao.com').get('ttwid'):
return
urlh = self._request_webpage(
'https://ttwid.bytedance.com/ttwid/union/register/', None,
'Fetching ttwid', 'Unable to fetch ttwid', headers={
'Content-Type': 'application/json',
}, data=json.dumps({
'aid': 24,
'needFid': False,
'region': 'cn',
'service': 'www.toutiao.com',
'union': True,
}).encode(),
)
if ttwid := try_call(lambda: self._get_cookies(urlh.url)['ttwid'].value):
self._set_cookie('.toutiao.com', 'ttwid', ttwid)
return
self.raise_login_required()
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
video_data = traverse_obj(webpage, (
{find_element(tag='script', id='RENDER_DATA')},
{urllib.parse.unquote}, {json.loads}, 'data', 'initialVideo',
))
formats = []
for video in traverse_obj(video_data, (
'videoPlayInfo', 'video_list', lambda _, v: v['main_url'],
)):
formats.append({
'url': video['main_url'],
**traverse_obj(video, ('video_meta', {
'acodec': ('audio_profile', {str}),
'asr': ('audio_sample_rate', {int_or_none}),
'audio_channels': ('audio_channels', {float_or_none}, {int_or_none}),
'ext': ('vtype', {str}),
'filesize': ('size', {int_or_none}),
'format_id': ('definition', {str}),
'fps': ('fps', {int_or_none}),
'height': ('vheight', {int_or_none}),
'tbr': ('real_bitrate', {float_or_none(scale=1000)}),
'vcodec': ('codec_type', {str}),
'width': ('vwidth', {int_or_none}),
})),
})
return {
'id': video_id,
'formats': formats,
**traverse_obj(video_data, {
'comment_count': ('commentCount', {int_or_none}),
'duration': ('videoPlayInfo', 'video_duration', {float_or_none}),
'like_count': ('repinCount', {int_or_none}),
'release_timestamp': ('publishTime', {int_or_none}),
'thumbnail': (('poster', 'coverUrl'), {url_or_none}, any),
'title': ('title', {str}),
'uploader': ('userInfo', 'name', {str}),
'uploader_id': ('userInfo', 'userId', {str_or_none}),
'view_count': ('playCount', {int_or_none}),
'webpage_url': ('detailUrl', {url_or_none}),
}),
}

View File

@ -1,4 +1,5 @@
import base64 import base64
import hashlib
import itertools import itertools
import re import re
@ -16,6 +17,7 @@ from ..utils import (
str_to_int, str_to_int,
try_get, try_get,
unified_timestamp, unified_timestamp,
update_url_query,
url_or_none, url_or_none,
urlencode_postdata, urlencode_postdata,
urljoin, urljoin,
@ -171,6 +173,10 @@ class TwitCastingIE(InfoExtractor):
'player': 'pc_web', 'player': 'pc_web',
}) })
password_params = {
'word': hashlib.md5(video_password.encode()).hexdigest(),
} if video_password else None
formats = [] formats = []
# low: 640x360, medium: 1280x720, high: 1920x1080 # low: 640x360, medium: 1280x720, high: 1920x1080
qq = qualities(['low', 'medium', 'high']) qq = qualities(['low', 'medium', 'high'])
@ -178,7 +184,7 @@ class TwitCastingIE(InfoExtractor):
'tc-hls', 'streams', {dict.items}, lambda _, v: url_or_none(v[1]), 'tc-hls', 'streams', {dict.items}, lambda _, v: url_or_none(v[1]),
)): )):
formats.append({ formats.append({
'url': m3u8_url, 'url': update_url_query(m3u8_url, password_params),
'format_id': f'hls-{quality}', 'format_id': f'hls-{quality}',
'ext': 'mp4', 'ext': 'mp4',
'quality': qq(quality), 'quality': qq(quality),
@ -192,7 +198,7 @@ class TwitCastingIE(InfoExtractor):
'llfmp4', 'streams', {dict.items}, lambda _, v: url_or_none(v[1]), 'llfmp4', 'streams', {dict.items}, lambda _, v: url_or_none(v[1]),
)): )):
formats.append({ formats.append({
'url': ws_url, 'url': update_url_query(ws_url, password_params),
'format_id': f'ws-{mode}', 'format_id': f'ws-{mode}',
'ext': 'mp4', 'ext': 'mp4',
'quality': qq(mode), 'quality': qq(mode),

View File

@ -20,7 +20,6 @@ from ..utils import (
remove_end, remove_end,
str_or_none, str_or_none,
strip_or_none, strip_or_none,
traverse_obj,
truncate_string, truncate_string,
try_call, try_call,
try_get, try_get,
@ -29,6 +28,7 @@ from ..utils import (
url_or_none, url_or_none,
xpath_text, xpath_text,
) )
from ..utils.traversal import require, traverse_obj
class TwitterBaseIE(InfoExtractor): class TwitterBaseIE(InfoExtractor):
@ -1596,8 +1596,8 @@ class TwitterAmplifyIE(TwitterBaseIE):
class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE): class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE):
IE_NAME = 'twitter:broadcast' IE_NAME = 'twitter:broadcast'
_VALID_URL = TwitterBaseIE._BASE_REGEX + r'i/broadcasts/(?P<id>[0-9a-zA-Z]{13})'
_VALID_URL = TwitterBaseIE._BASE_REGEX + r'i/(?P<type>broadcasts|events)/(?P<id>\w+)'
_TESTS = [{ _TESTS = [{
# untitled Periscope video # untitled Periscope video
'url': 'https://twitter.com/i/broadcasts/1yNGaQLWpejGj', 'url': 'https://twitter.com/i/broadcasts/1yNGaQLWpejGj',
@ -1605,6 +1605,7 @@ class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE):
'id': '1yNGaQLWpejGj', 'id': '1yNGaQLWpejGj',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Andrea May Sahouri - Periscope Broadcast', 'title': 'Andrea May Sahouri - Periscope Broadcast',
'display_id': '1yNGaQLWpejGj',
'uploader': 'Andrea May Sahouri', 'uploader': 'Andrea May Sahouri',
'uploader_id': 'andreamsahouri', 'uploader_id': 'andreamsahouri',
'uploader_url': 'https://twitter.com/andreamsahouri', 'uploader_url': 'https://twitter.com/andreamsahouri',
@ -1612,6 +1613,8 @@ class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE):
'upload_date': '20200601', 'upload_date': '20200601',
'thumbnail': r're:^https?://[^?#]+\.jpg\?token=', 'thumbnail': r're:^https?://[^?#]+\.jpg\?token=',
'view_count': int, 'view_count': int,
'concurrent_view_count': int,
'live_status': 'was_live',
}, },
}, { }, {
'url': 'https://twitter.com/i/broadcasts/1ZkKzeyrPbaxv', 'url': 'https://twitter.com/i/broadcasts/1ZkKzeyrPbaxv',
@ -1619,6 +1622,7 @@ class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE):
'id': '1ZkKzeyrPbaxv', 'id': '1ZkKzeyrPbaxv',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Starship | SN10 | High-Altitude Flight Test', 'title': 'Starship | SN10 | High-Altitude Flight Test',
'display_id': '1ZkKzeyrPbaxv',
'uploader': 'SpaceX', 'uploader': 'SpaceX',
'uploader_id': 'SpaceX', 'uploader_id': 'SpaceX',
'uploader_url': 'https://twitter.com/SpaceX', 'uploader_url': 'https://twitter.com/SpaceX',
@ -1626,6 +1630,8 @@ class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE):
'upload_date': '20210303', 'upload_date': '20210303',
'thumbnail': r're:^https?://[^?#]+\.jpg\?token=', 'thumbnail': r're:^https?://[^?#]+\.jpg\?token=',
'view_count': int, 'view_count': int,
'concurrent_view_count': int,
'live_status': 'was_live',
}, },
}, { }, {
'url': 'https://twitter.com/i/broadcasts/1OyKAVQrgzwGb', 'url': 'https://twitter.com/i/broadcasts/1OyKAVQrgzwGb',
@ -1633,6 +1639,7 @@ class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE):
'id': '1OyKAVQrgzwGb', 'id': '1OyKAVQrgzwGb',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Starship Flight Test', 'title': 'Starship Flight Test',
'display_id': '1OyKAVQrgzwGb',
'uploader': 'SpaceX', 'uploader': 'SpaceX',
'uploader_id': 'SpaceX', 'uploader_id': 'SpaceX',
'uploader_url': 'https://twitter.com/SpaceX', 'uploader_url': 'https://twitter.com/SpaceX',
@ -1640,21 +1647,58 @@ class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE):
'upload_date': '20230420', 'upload_date': '20230420',
'thumbnail': r're:^https?://[^?#]+\.jpg\?token=', 'thumbnail': r're:^https?://[^?#]+\.jpg\?token=',
'view_count': int, 'view_count': int,
'concurrent_view_count': int,
'live_status': 'was_live',
},
}, {
'url': 'https://x.com/i/events/1910629646300762112',
'info_dict': {
'id': '1LyxBWDRNqyKN',
'ext': 'mp4',
'title': '#ガンニバル ウォッチパーティー',
'concurrent_view_count': int,
'display_id': '1910629646300762112',
'live_status': 'was_live',
'release_date': '20250423',
'release_timestamp': 1745409000,
'tags': ['ガンニバル'],
'thumbnail': r're:https?://[^?#]+\.jpg\?token=',
'timestamp': 1745403328,
'upload_date': '20250423',
'uploader': 'ディズニープラス公式',
'uploader_id': 'DisneyPlusJP',
'uploader_url': 'https://twitter.com/DisneyPlusJP',
'view_count': int,
}, },
}] }]
def _real_extract(self, url): def _real_extract(self, url):
broadcast_id = self._match_id(url) broadcast_type, display_id = self._match_valid_url(url).group('type', 'id')
if broadcast_type == 'events':
timeline = self._call_api(
f'live_event/1/{display_id}/timeline.json', display_id)
broadcast_id = traverse_obj(timeline, (
'twitter_objects', 'broadcasts', ..., ('id', 'broadcast_id'),
{str}, any, {require('broadcast ID')}))
else:
broadcast_id = display_id
broadcast = self._call_api( broadcast = self._call_api(
'broadcasts/show.json', broadcast_id, 'broadcasts/show.json', broadcast_id,
{'ids': broadcast_id})['broadcasts'][broadcast_id] {'ids': broadcast_id})['broadcasts'][broadcast_id]
if not broadcast: if not broadcast:
raise ExtractorError('Broadcast no longer exists', expected=True) raise ExtractorError('Broadcast no longer exists', expected=True)
info = self._parse_broadcast_data(broadcast, broadcast_id) info = self._parse_broadcast_data(broadcast, broadcast_id)
info['title'] = broadcast.get('status') or info.get('title') info.update({
info['uploader_id'] = broadcast.get('twitter_username') or info.get('uploader_id') 'display_id': display_id,
info['uploader_url'] = format_field(broadcast, 'twitter_username', 'https://twitter.com/%s', default=None) 'title': broadcast.get('status') or info.get('title'),
'uploader_id': broadcast.get('twitter_username') or info.get('uploader_id'),
'uploader_url': format_field(
broadcast, 'twitter_username', 'https://twitter.com/%s', default=None),
})
if info['live_status'] == 'is_upcoming': if info['live_status'] == 'is_upcoming':
self.raise_no_formats('This live broadcast has not yet started', expected=True)
return info return info
media_key = broadcast['media_key'] media_key = broadcast['media_key']