mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2026-04-26 10:36:08 +00:00
Compare commits
5 Commits
fcd47d2db3
...
2a7e048a60
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2a7e048a60 | ||
|
|
a6ba714005 | ||
|
|
ce9a3591f8 | ||
|
|
d22436e5dc | ||
|
|
abf29e3e72 |
@ -1859,8 +1859,9 @@ The following extractors use this feature:
|
||||
* `player_js_variant`: The player javascript variant to use for n/sig deciphering. The known variants are: `main`, `tcc`, `tce`, `es5`, `es6`, `tv`, `tv_es6`, `phone`, `tablet`. The default is `main`, and the others are for debugging purposes. You can use `actual` to go with what is prescribed by the site
|
||||
* `player_js_version`: The player javascript version to use for n/sig deciphering, in the format of `signature_timestamp@hash` (e.g. `20348@0004de42`). The default is to use what is prescribed by the site, and can be selected with `actual`
|
||||
* `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side)
|
||||
* `max_comments`: Limit the amount of comments to gather. Comma-separated list of integers representing `max-comments,max-parents,max-replies,max-replies-per-thread`. Default is `all,all,all,all`
|
||||
* E.g. `all,all,1000,10` will get a maximum of 1000 replies total, with up to 10 replies per thread. `1000,all,100` will get a maximum of 1000 comments, with a maximum of 100 replies total
|
||||
* `max_comments`: Limit the number of comments to gather. Comma-separated list of integers (or `all` for no limit) representing `max-comments,max-parents,max-replies,max-replies-per-thread,max-depth`. Default is `all,all,all,all,all`
|
||||
* A `max-depth` value of `1` will discard all replies, regardless of the `max-replies` or `max-replies-per-thread` values given
|
||||
* E.g. `all,all,1000,10,2` will get a maximum of 1000 replies total, with up to 10 replies per thread, and only 2 levels of depth (i.e. top-level comments plus their immediate replies). `1000,all,100` will get a maximum of 1000 comments, with a maximum of 100 replies total
|
||||
* `formats`: Change the types of formats to return. `dashy` (convert HTTP to DASH), `duplicate` (identical content but different URLs or protocol; includes `dashy`), `incomplete` (cannot be downloaded completely - live dash and post-live m3u8), `missing_pot` (include formats that require a PO Token but are missing one)
|
||||
* `innertube_host`: Innertube API host to use for all API requests; e.g. `studio.youtube.com`, `youtubei.googleapis.com`. Note that cookies exported from one subdomain will not work on others
|
||||
* `innertube_key`: Innertube API key to use for all API requests. By default, no API key is used
|
||||
|
||||
@ -4,8 +4,6 @@ import urllib.parse
|
||||
|
||||
from .common import InfoExtractor
|
||||
from ..compat import compat_etree_fromstring
|
||||
from ..networking import Request
|
||||
from ..networking.exceptions import network_exceptions
|
||||
from ..utils import (
|
||||
ExtractorError,
|
||||
clean_html,
|
||||
@ -64,9 +62,6 @@ class FacebookIE(InfoExtractor):
|
||||
class=(?P<q1>[\'"])[^\'"]*\bfb-(?:video|post)\b[^\'"]*(?P=q1)[^>]+
|
||||
data-href=(?P<q2>[\'"])(?P<url>(?:https?:)?//(?:www\.)?facebook.com/.+?)(?P=q2)''',
|
||||
]
|
||||
_LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1'
|
||||
_CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1'
|
||||
_NETRC_MACHINE = 'facebook'
|
||||
IE_NAME = 'facebook'
|
||||
|
||||
_VIDEO_PAGE_TEMPLATE = 'https://www.facebook.com/video/video.php?v=%s'
|
||||
@ -469,65 +464,6 @@ class FacebookIE(InfoExtractor):
|
||||
'graphURI': '/api/graphql/',
|
||||
}
|
||||
|
||||
def _perform_login(self, username, password):
    """Log in to Facebook with the given username and password.

    Downloads the login page to read the hidden `lsd` and `lgnrnd` form
    values, submits the login form, and - when the response carries both an
    `fb_dtsg` and an `h` token - submits the checkpoint form with
    `name_action_selected` set to `dont_save`.

    A rejected login with an extractable error message raises ExtractorError;
    every other failure (including network errors) is only reported as a warning.
    """
    page_request = Request(self._LOGIN_URL)
    self._set_cookie('facebook.com', 'locale', 'en_US')
    page_html = self._download_webpage(
        page_request, None,
        note='Downloading login page',
        errnote='Unable to download login page')

    # Hidden inputs of the login form that are sent back with the credentials
    lsd = self._search_regex(
        r'<input type="hidden" name="lsd" value="([^"]*)"', page_html, 'lsd')
    lgnrnd = self._search_regex(
        r'name="lgnrnd" value="([^"]*?)"', page_html, 'lgnrnd')

    login_request = Request(self._LOGIN_URL, urlencode_postdata({
        'email': username,
        'pass': password,
        'lsd': lsd,
        'lgnrnd': lgnrnd,
        'next': 'http://facebook.com/home.php',
        'default_persistent': '0',
        'legacy_return': '1',
        'timezone': '-60',
        'trynum': '1',
    }))
    login_request.headers['Content-Type'] = 'application/x-www-form-urlencoded'

    try:
        result_html = self._download_webpage(
            login_request, None,
            note='Logging in', errnote='unable to fetch login page')

        if re.search(r'<form(.*)name="login"(.*)</form>', result_html):
            # The response still contains the login form: the attempt did not succeed
            reason = self._html_search_regex(
                r'(?s)<div[^>]+class=(["\']).*?login_error_box.*?\1[^>]*><div[^>]*>.*?</div><div[^>]*>(?P<error>.+?)</div>',
                result_html, 'login error', default=None, group='error')
            if not reason:
                self.report_warning('unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
                return
            raise ExtractorError(f'Unable to login: {reason}', expected=True)

        fb_dtsg = self._search_regex(
            r'name="fb_dtsg" value="(.+?)"', result_html, 'fb_dtsg', default=None)
        h = self._search_regex(
            r'name="h"\s+(?:\w+="[^"]+"\s+)*?value="([^"]+)"', result_html, 'h', default=None)

        # Without both tokens there is no checkpoint form to submit
        if not (fb_dtsg and h):
            return

        confirm_request = Request(self._CHECKPOINT_URL, urlencode_postdata({
            'fb_dtsg': fb_dtsg,
            'h': h,
            'name_action_selected': 'dont_save',
        }))
        confirm_request.headers['Content-Type'] = 'application/x-www-form-urlencoded'
        confirm_html = self._download_webpage(
            confirm_request, None, note='Confirming login')
        if re.search(r'id="checkpointSubmitButton"', confirm_html):
            self.report_warning('Unable to confirm login, you have to login in your browser and authorize the login.')
    except network_exceptions as err:
        self.report_warning(f'unable to log in: {err}')
|
||||
|
||||
def _extract_from_url(self, url, video_id):
|
||||
webpage = self._download_webpage(
|
||||
url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id)
|
||||
|
||||
@ -32,67 +32,11 @@ from ..utils.traversal import require, traverse_obj
|
||||
|
||||
|
||||
class TwitterBaseIE(InfoExtractor):
|
||||
_NETRC_MACHINE = 'twitter'
|
||||
_API_BASE = 'https://api.x.com/1.1/'
|
||||
_GRAPHQL_API_BASE = 'https://x.com/i/api/graphql/'
|
||||
_BASE_REGEX = r'https?://(?:(?:www|m(?:obile)?)\.)?(?:(?:twitter|x)\.com|twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid\.onion)/'
|
||||
_AUTH = 'AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
|
||||
_LEGACY_AUTH = 'AAAAAAAAAAAAAAAAAAAAAIK1zgAAAAAA2tUWuhGZ2JceoId5GwYWU5GspY4%3DUq7gzFoCZs1QfwGoVdvSac3IniczZEYXIcDyumCauIXpcAPorE'
|
||||
_flow_token = None
|
||||
|
||||
_LOGIN_INIT_DATA = json.dumps({
|
||||
'input_flow_data': {
|
||||
'flow_context': {
|
||||
'debug_overrides': {},
|
||||
'start_location': {
|
||||
'location': 'unknown',
|
||||
},
|
||||
},
|
||||
},
|
||||
'subtask_versions': {
|
||||
'action_list': 2,
|
||||
'alert_dialog': 1,
|
||||
'app_download_cta': 1,
|
||||
'check_logged_in_account': 1,
|
||||
'choice_selection': 3,
|
||||
'contacts_live_sync_permission_prompt': 0,
|
||||
'cta': 7,
|
||||
'email_verification': 2,
|
||||
'end_flow': 1,
|
||||
'enter_date': 1,
|
||||
'enter_email': 2,
|
||||
'enter_password': 5,
|
||||
'enter_phone': 2,
|
||||
'enter_recaptcha': 1,
|
||||
'enter_text': 5,
|
||||
'enter_username': 2,
|
||||
'generic_urt': 3,
|
||||
'in_app_notification': 1,
|
||||
'interest_picker': 3,
|
||||
'js_instrumentation': 1,
|
||||
'menu_dialog': 1,
|
||||
'notifications_permission_prompt': 2,
|
||||
'open_account': 2,
|
||||
'open_home_timeline': 1,
|
||||
'open_link': 1,
|
||||
'phone_verification': 4,
|
||||
'privacy_options': 1,
|
||||
'security_key': 3,
|
||||
'select_avatar': 4,
|
||||
'select_banner': 2,
|
||||
'settings_list': 7,
|
||||
'show_code': 1,
|
||||
'sign_up': 2,
|
||||
'sign_up_review': 4,
|
||||
'tweet_selection_urt': 1,
|
||||
'update_users': 1,
|
||||
'upload_media': 1,
|
||||
'user_recommendations_list': 4,
|
||||
'user_recommendations_urt': 1,
|
||||
'wait_spinner': 3,
|
||||
'web_modal': 1,
|
||||
},
|
||||
}, separators=(',', ':')).encode()
|
||||
|
||||
def _extract_variant_formats(self, variant, video_id):
|
||||
variant_url = variant.get('url')
|
||||
@ -172,135 +116,6 @@ class TwitterBaseIE(InfoExtractor):
|
||||
'x-csrf-token': try_call(lambda: self._get_cookies(self._API_BASE)['ct0'].value),
|
||||
})
|
||||
|
||||
def _call_login_api(self, note, headers, query={}, data=None):
    """Request `onboarding/task.json` for one step of the login flow.

    Stores the response's `flow_token` on the instance and returns the first
    string `subtask_id` found under `subtasks`.

    Raises ExtractorError if the response carries an error message, if its
    `status` is not 'success', or if no subtask ID is present.
    """
    flow_response = self._download_json(
        f'{self._API_BASE}onboarding/task.json', None, note,
        # NOTE(review): presumably 400 is tolerated so the API's own error message can be read below - confirm
        headers=headers, query=query, data=data, expected_status=400)

    api_error = traverse_obj(flow_response, ('errors', 0, 'message', {str}))
    if api_error:
        raise ExtractorError(f'Login failed, Twitter API says: {api_error}', expected=True)
    if traverse_obj(flow_response, 'status') != 'success':
        raise ExtractorError('Login was unsuccessful')

    next_subtask_id = traverse_obj(
        flow_response, ('subtasks', ..., 'subtask_id', {str}), get_all=False)
    if not next_subtask_id:
        raise ExtractorError('Twitter API did not return next login subtask')

    # The token is read back when the next step's request body is built
    self._flow_token = flow_response['flow_token']
    return next_subtask_id
|
||||
|
||||
def _perform_login(self, username, password):
    """Walk the login flow subtask by subtask until the session is logged in.

    Does nothing when already logged in. Otherwise requests the initial flow
    token and then answers whichever subtask the API returns next (JS
    instrumentation, username, alternate identifier, password, account
    duplication check, 2FA token, confirmation code) until `is_logged_in`
    becomes true, after which the login is reported.

    Raises ExtractorError for an unrecognized subtask or when the flow reports
    success without the session being logged in; captcha and denied-login
    subtasks are handed to `raise_login_required`.
    """
    if self.is_logged_in:
        return

    guest_token = self._fetch_guest_token(None)
    headers = {
        **self._set_base_headers(),
        'content-type': 'application/json',
        'x-guest-token': guest_token,
        'x-twitter-client-language': 'en',
        'x-twitter-active-user': 'yes',
        'Referer': 'https://x.com/',
        'Origin': 'https://x.com',
    }

    def flow_payload(*inputs):
        # Request body for one step: the current flow token plus the subtask answers
        return json.dumps({
            'flow_token': self._flow_token,
            'subtask_inputs': inputs,
        }, separators=(',', ':')).encode()

    def text_answer(subtask_id, text):
        # Answer shape shared by the subtasks that take a single line of text
        return {
            'subtask_id': subtask_id,
            'enter_text': {
                'text': text,
                'link': 'next_link',
            },
        }

    next_subtask = self._call_login_api(
        'Downloading flow token', headers, query={'flow_name': 'login'}, data=self._LOGIN_INIT_DATA)

    while not self.is_logged_in:
        if next_subtask == 'LoginJsInstrumentationSubtask':
            note = 'Submitting JS instrumentation response'
            answer = {
                'subtask_id': next_subtask,
                'js_instrumentation': {
                    'response': '{}',
                    'link': 'next_link',
                },
            }

        elif next_subtask == 'LoginEnterUserIdentifierSSO':
            note = 'Submitting username'
            answer = {
                'subtask_id': next_subtask,
                'settings_list': {
                    'setting_responses': [{
                        'key': 'user_identifier',
                        'response_data': {
                            'text_data': {
                                'result': username,
                            },
                        },
                    }],
                    'link': 'next_link',
                },
            }

        elif next_subtask == 'LoginEnterAlternateIdentifierSubtask':
            note = 'Submitting alternate identifier'
            answer = text_answer(next_subtask, self._get_tfa_info(
                'one of username, phone number or email that was not used as --username'))

        elif next_subtask == 'LoginEnterPassword':
            note = 'Submitting password'
            answer = {
                'subtask_id': next_subtask,
                'enter_password': {
                    'password': password,
                    'link': 'next_link',
                },
            }

        elif next_subtask == 'AccountDuplicationCheck':
            note = 'Submitting account duplication check'
            answer = {
                'subtask_id': next_subtask,
                'check_logged_in_account': {
                    'link': 'AccountDuplicationCheck_false',
                },
            }

        elif next_subtask == 'LoginTwoFactorAuthChallenge':
            note = 'Submitting 2FA token'
            answer = text_answer(
                next_subtask, self._get_tfa_info('two-factor authentication token'))

        elif next_subtask == 'LoginAcid':
            note = 'Submitting confirmation code'
            answer = text_answer(
                next_subtask, self._get_tfa_info('confirmation code sent to your email or phone'))

        elif next_subtask == 'ArkoseLogin':
            self.raise_login_required('Twitter is requiring captcha for this login attempt', method='cookies')
            # Only reached if the call above returns: nothing to submit, re-check the loop condition
            continue

        elif next_subtask == 'DenyLoginSubtask':
            self.raise_login_required('Twitter rejected this login attempt as suspicious', method='cookies')
            # Only reached if the call above returns: nothing to submit, re-check the loop condition
            continue

        elif next_subtask == 'LoginSuccessSubtask':
            # The flow claims success, yet the loop condition says we are still not logged in
            raise ExtractorError('Twitter API did not grant auth token cookie')

        else:
            raise ExtractorError(f'Unrecognized subtask ID "{next_subtask}"')

        next_subtask = self._call_login_api(note, headers, data=flow_payload(answer))

    self.report_login()
|
||||
|
||||
def _call_api(self, path, video_id, query={}, graphql=False):
|
||||
headers = self._set_base_headers(legacy=not graphql and self._selected_api == 'legacy')
|
||||
headers.update({
|
||||
@ -416,6 +231,7 @@ class TwitterCardIE(InfoExtractor):
|
||||
'live_status': 'not_live',
|
||||
},
|
||||
'add_ie': ['Youtube'],
|
||||
'skip': 'The page does not exist',
|
||||
},
|
||||
{
|
||||
'url': 'https://twitter.com/i/videos/tweet/705235433198714880',
|
||||
@ -617,6 +433,7 @@ class TwitterIE(TwitterBaseIE):
|
||||
'comment_count': int,
|
||||
'_old_archive_ids': ['twitter 852138619213144067'],
|
||||
},
|
||||
'skip': 'Suspended',
|
||||
}, {
|
||||
'url': 'https://twitter.com/i/web/status/910031516746514432',
|
||||
'info_dict': {
|
||||
@ -763,10 +580,10 @@ class TwitterIE(TwitterBaseIE):
|
||||
'url': 'https://twitter.com/UltimaShadowX/status/1577719286659006464',
|
||||
'info_dict': {
|
||||
'id': '1577719286659006464',
|
||||
'title': 'Ultima - Test',
|
||||
'title': r're:Ultima.* - Test$',
|
||||
'description': 'Test https://t.co/Y3KEZD7Dad',
|
||||
'channel_id': '168922496',
|
||||
'uploader': 'Ultima',
|
||||
'uploader': r're:Ultima.*',
|
||||
'uploader_id': 'UltimaShadowX',
|
||||
'uploader_url': 'https://twitter.com/UltimaShadowX',
|
||||
'upload_date': '20221005',
|
||||
@ -895,11 +712,12 @@ class TwitterIE(TwitterBaseIE):
|
||||
'uploader': r're:Monique Camarra.+?',
|
||||
'uploader_id': 'MoniqueCamarra',
|
||||
'live_status': 'was_live',
|
||||
'release_timestamp': 1658417414,
|
||||
'release_timestamp': 1658417305,
|
||||
'description': r're:Twitter Space participated by Sergej Sumlenny.+',
|
||||
'timestamp': 1658407771,
|
||||
'release_date': '20220721',
|
||||
'upload_date': '20220721',
|
||||
'thumbnail': 'https://pbs.twimg.com/profile_images/1920514378006188033/xQs6J_yI_400x400.jpg',
|
||||
},
|
||||
'add_ie': ['TwitterSpaces'],
|
||||
'params': {'skip_download': 'm3u8'},
|
||||
@ -1010,10 +828,10 @@ class TwitterIE(TwitterBaseIE):
|
||||
'description': 'This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525 https://t.co/cNsA0MoOml',
|
||||
'thumbnail': 'https://pbs.twimg.com/ext_tw_video_thumb/1600009362759733248/pu/img/XVhFQivj75H_YxxV.jpg?name=orig',
|
||||
'age_limit': 0,
|
||||
'uploader': 'Boy Called Mün',
|
||||
'uploader': 'D U N I Y A',
|
||||
'repost_count': int,
|
||||
'upload_date': '20221206',
|
||||
'title': 'Boy Called Mün - This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525',
|
||||
'title': 'D U N I Y A - This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525',
|
||||
'comment_count': int,
|
||||
'like_count': int,
|
||||
'tags': [],
|
||||
@ -1068,6 +886,7 @@ class TwitterIE(TwitterBaseIE):
|
||||
'comment_count': int,
|
||||
'_old_archive_ids': ['twitter 1695424220702888009'],
|
||||
},
|
||||
'skip': 'Suspended',
|
||||
}, {
|
||||
# retweeted_status w/ legacy API
|
||||
'url': 'https://twitter.com/playstrumpcard/status/1695424220702888009',
|
||||
@ -1092,6 +911,7 @@ class TwitterIE(TwitterBaseIE):
|
||||
'_old_archive_ids': ['twitter 1695424220702888009'],
|
||||
},
|
||||
'params': {'extractor_args': {'twitter': {'api': ['legacy']}}},
|
||||
'skip': 'Suspended',
|
||||
}, {
|
||||
# Broadcast embedded in tweet
|
||||
'url': 'https://twitter.com/JessicaDobsonWX/status/1731121063248175384',
|
||||
@ -1135,7 +955,6 @@ class TwitterIE(TwitterBaseIE):
|
||||
}, {
|
||||
# "stale tweet" with typename "TweetWithVisibilityResults"
|
||||
'url': 'https://twitter.com/RobertKennedyJr/status/1724884212803834154',
|
||||
'md5': '511377ff8dfa7545307084dca4dce319',
|
||||
'info_dict': {
|
||||
'id': '1724883339285544960',
|
||||
'ext': 'mp4',
|
||||
@ -1182,6 +1001,30 @@ class TwitterIE(TwitterBaseIE):
|
||||
'age_limit': 0,
|
||||
'_old_archive_ids': ['twitter 1790637656616943991'],
|
||||
},
|
||||
}, {
|
||||
# unified_card with 2 items of type video and photo
|
||||
'url': 'https://x.com/TopHeroes_/status/2001950365332455490',
|
||||
'info_dict': {
|
||||
'id': '2001841416071450628',
|
||||
'ext': 'mp4',
|
||||
'display_id': '2001950365332455490',
|
||||
'title': 'Top Heroes - Forgot to close My heroes solo level up in my phone ✨Unlock the fog,...',
|
||||
'description': r're:Forgot to close My heroes solo level up in my phone ✨Unlock the fog.+',
|
||||
'uploader': 'Top Heroes',
|
||||
'uploader_id': 'TopHeroes_',
|
||||
'uploader_url': 'https://twitter.com/TopHeroes_',
|
||||
'channel_id': '1737324725620326400',
|
||||
'comment_count': int,
|
||||
'like_count': int,
|
||||
'repost_count': int,
|
||||
'age_limit': 0,
|
||||
'duration': 30.278,
|
||||
'thumbnail': 'https://pbs.twimg.com/amplify_video_thumb/2001841416071450628/img/hpy5KpJh4pO17b65.jpg?name=orig',
|
||||
'tags': [],
|
||||
'timestamp': 1766137136,
|
||||
'upload_date': '20251219',
|
||||
'_old_archive_ids': ['twitter 2001950365332455490'],
|
||||
},
|
||||
}, {
|
||||
# onion route
|
||||
'url': 'https://twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid.onion/TwitterBlue/status/1484226494708662273',
|
||||
@ -1422,14 +1265,14 @@ class TwitterIE(TwitterBaseIE):
|
||||
if not card:
|
||||
return
|
||||
|
||||
self.write_debug(f'Extracting from card info: {card.get("url")}')
|
||||
card_name = card['name'].split(':')[-1]
|
||||
self.write_debug(f'Extracting from {card_name} card info: {card.get("url")}')
|
||||
binding_values = card['binding_values']
|
||||
|
||||
def get_binding_value(k):
    # Look up binding `k` and read the field named after its own type
    # ('<type>_value', lowercased); try_get supplies the fallback when that lookup fails
    binding = binding_values.get(k) or {}
    return try_get(binding, lambda b: b[b['type'].lower() + '_value'])
|
||||
|
||||
card_name = card['name'].split(':')[-1]
|
||||
if card_name == 'player':
|
||||
yield {
|
||||
'_type': 'url',
|
||||
@ -1461,7 +1304,7 @@ class TwitterIE(TwitterBaseIE):
|
||||
elif card_name == 'unified_card':
|
||||
unified_card = self._parse_json(get_binding_value('unified_card'), twid)
|
||||
yield from map(extract_from_video_info, traverse_obj(
|
||||
unified_card, ('media_entities', ...), expected_type=dict))
|
||||
unified_card, ('media_entities', lambda _, v: v['type'] == 'video')))
|
||||
# amplify, promo_video_website, promo_video_convo, appplayer,
|
||||
# video_direct_message, poll2choice_video, poll3choice_video,
|
||||
# poll4choice_video, ...
|
||||
|
||||
@ -1065,7 +1065,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
|
||||
return next_continuation
|
||||
|
||||
return traverse_obj(renderer, (
|
||||
('contents', 'items', 'rows'), ..., 'continuationItemRenderer',
|
||||
('contents', 'items', 'rows', 'subThreads'), ..., 'continuationItemRenderer',
|
||||
('continuationEndpoint', ('button', 'buttonRenderer', 'command')),
|
||||
), get_all=False, expected_type=cls._extract_continuation_ep_data)
|
||||
|
||||
|
||||
@ -1660,6 +1660,41 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||
'live_status': 'not_live',
|
||||
},
|
||||
'params': {'skip_download': True},
|
||||
}, {
|
||||
# Threaded comments with 4 levels of depth
|
||||
'url': 'https://www.youtube.com/watch?v=f6HNySwZV4c',
|
||||
'info_dict': {
|
||||
'id': 'f6HNySwZV4c',
|
||||
'ext': 'mp4',
|
||||
'title': 'dlptestvideo2',
|
||||
'description': '',
|
||||
'media_type': 'video',
|
||||
'uploader': 'cole-dlp-test-acc',
|
||||
'uploader_id': '@coletdjnz',
|
||||
'uploader_url': 'https://www.youtube.com/@coletdjnz',
|
||||
'channel': 'cole-dlp-test-acc',
|
||||
'channel_id': 'UCiu-3thuViMebBjw_5nWYrA',
|
||||
'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA',
|
||||
'view_count': int,
|
||||
'like_count': int,
|
||||
'age_limit': 0,
|
||||
'duration': 5,
|
||||
'thumbnail': 'https://i.ytimg.com/vi/f6HNySwZV4c/maxresdefault.jpg',
|
||||
'categories': ['People & Blogs'],
|
||||
'tags': [],
|
||||
'timestamp': 1709856007,
|
||||
'upload_date': '20240308',
|
||||
'release_timestamp': 1709856007,
|
||||
'release_date': '20240308',
|
||||
'playable_in_embed': True,
|
||||
'availability': 'public',
|
||||
'live_status': 'not_live',
|
||||
'comment_count': 15,
|
||||
},
|
||||
'params': {
|
||||
'skip_download': True,
|
||||
'getcomments': True,
|
||||
},
|
||||
}]
|
||||
_WEBPAGE_TESTS = [{
|
||||
# <object>
|
||||
@ -2437,6 +2472,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||
def extract_thread(contents, entity_payloads):
|
||||
if not parent:
|
||||
tracker['current_page_thread'] = 0
|
||||
|
||||
if max_depth < tracker['current_depth']:
|
||||
return
|
||||
|
||||
for content in contents:
|
||||
if not parent and tracker['total_parent_comments'] >= max_parents:
|
||||
yield
|
||||
@ -2480,6 +2519,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||
'Detected YouTube comments looping. Stopping comment extraction '
|
||||
f'{"for this thread" if parent else ""} as we probably cannot get any more.')
|
||||
yield
|
||||
break # Safeguard for recursive call in subthreads code path below
|
||||
else:
|
||||
tracker['seen_comment_ids'].add(comment['id'])
|
||||
|
||||
@ -2492,12 +2532,28 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||
comment_thread_renderer, lambda x: x['replies']['commentRepliesRenderer'], dict)
|
||||
|
||||
if comment_replies_renderer:
|
||||
subthreads = traverse_obj(comment_replies_renderer, (
|
||||
'subThreads', lambda _, v: v['commentThreadRenderer']))
|
||||
# Recursively extract from `commentThreadRenderer`s in `subThreads`
|
||||
if subthreads:
|
||||
tracker['current_depth'] += 1
|
||||
for entry in extract_thread(subthreads, entity_payloads):
|
||||
if entry:
|
||||
yield entry
|
||||
tracker['current_depth'] -= 1
|
||||
# All of the subThreads' `continuationItemRenderer`s were within the nested
|
||||
# `commentThreadRenderer`s and are now exhausted, so avoid unnecessary recursion below
|
||||
continue
|
||||
|
||||
tracker['current_page_thread'] += 1
|
||||
tracker['current_depth'] += 1
|
||||
# Recursively extract from `continuationItemRenderer`s in `subThreads`
|
||||
comment_entries_iter = self._comment_entries(
|
||||
comment_replies_renderer, ytcfg, video_id,
|
||||
parent=comment.get('id'), tracker=tracker)
|
||||
parent=comment_id, tracker=tracker)
|
||||
yield from itertools.islice(comment_entries_iter, min(
|
||||
max_replies_per_thread, max(0, max_replies - tracker['total_reply_comments'])))
|
||||
tracker['current_depth'] -= 1
|
||||
|
||||
# Keeps track of counts across recursive calls
|
||||
if not tracker:
|
||||
@ -2509,19 +2565,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||
'total_reply_comments': 0,
|
||||
'seen_comment_ids': set(),
|
||||
'pinned_comment_ids': set(),
|
||||
'current_depth': 1,
|
||||
}
|
||||
|
||||
# TODO: Deprecated
|
||||
# YouTube comments have a max depth of 2
|
||||
max_depth = int_or_none(get_single_config_arg('max_comment_depth'))
|
||||
if max_depth:
|
||||
self._downloader.deprecated_feature('[youtube] max_comment_depth extractor argument is deprecated. '
|
||||
'Set max replies in the max-comments extractor argument instead')
|
||||
if max_depth == 1 and parent:
|
||||
return
|
||||
_max_comments, max_parents, max_replies, max_replies_per_thread, max_depth, *_ = (
|
||||
int_or_none(p, default=sys.maxsize) for p in self._configuration_arg('max_comments') + [''] * 5)
|
||||
|
||||
_max_comments, max_parents, max_replies, max_replies_per_thread, *_ = (
|
||||
int_or_none(p, default=sys.maxsize) for p in self._configuration_arg('max_comments') + [''] * 4)
|
||||
if max_depth < tracker['current_depth']:
|
||||
return
|
||||
|
||||
continuation = self._extract_continuation(root_continuation_data)
|
||||
|
||||
@ -2550,6 +2601,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||
note_prefix = ' Downloading comment API JSON reply thread %d %s' % (
|
||||
tracker['current_page_thread'], comment_prog_str)
|
||||
else:
|
||||
# TODO: `parent` is only truthy in this code path with YT's legacy (non-threaded) comment view
|
||||
note_prefix = '{}Downloading comment{} API JSON page {} {}'.format(
|
||||
' ' if parent else '', ' replies' if parent else '',
|
||||
page_num, comment_prog_str)
|
||||
@ -2566,6 +2618,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||
ep='next', ytcfg=ytcfg, headers=headers, note=note_prefix,
|
||||
check_get_keys=check_get_keys)
|
||||
except ExtractorError as e:
|
||||
# TODO: This code path is not reached since eb5bdbfa70126c7d5355cc0954b63720522e462c
|
||||
# Ignore incomplete data error for replies if retries didn't work.
|
||||
# This is to allow any other parent comments and comment threads to be downloaded.
|
||||
# See: https://github.com/yt-dlp/yt-dlp/issues/4669
|
||||
@ -3307,6 +3360,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||
def process_https_formats():
|
||||
proto = 'https'
|
||||
https_fmts = []
|
||||
skip_player_js = 'js' in self._configuration_arg('player_skip')
|
||||
|
||||
for fmt_stream in streaming_formats:
|
||||
if fmt_stream.get('targetDurationSec'):
|
||||
continue
|
||||
@ -3344,13 +3399,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||
sc = urllib.parse.parse_qs(fmt_stream.get('signatureCipher'))
|
||||
fmt_url = url_or_none(try_get(sc, lambda x: x['url'][0]))
|
||||
encrypted_sig = try_get(sc, lambda x: x['s'][0])
|
||||
if not all((sc, fmt_url, player_url, encrypted_sig)):
|
||||
msg = f'Some {client_name} client https formats have been skipped as they are missing a url. '
|
||||
if not all((sc, fmt_url, skip_player_js or player_url, encrypted_sig)):
|
||||
msg = f'Some {client_name} client https formats have been skipped as they are missing a URL. '
|
||||
if client_name in ('web', 'web_safari'):
|
||||
msg += 'YouTube is forcing SABR streaming for this client. '
|
||||
else:
|
||||
msg += (
|
||||
f'YouTube may have enabled the SABR-only or Server-Side Ad Placement experiment for '
|
||||
f'YouTube may have enabled the SABR-only streaming experiment for '
|
||||
f'{"your account" if self.is_authenticated else "the current session"}. '
|
||||
)
|
||||
msg += 'See https://github.com/yt-dlp/yt-dlp/issues/12482 for more details'
|
||||
@ -3366,6 +3421,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||
# signature
|
||||
# Attempt to load sig spec from cache
|
||||
if encrypted_sig:
|
||||
if skip_player_js:
|
||||
continue
|
||||
spec_cache_id = self._sig_spec_cache_id(player_url, len(encrypted_sig))
|
||||
spec = self._load_sig_spec_from_cache(spec_cache_id)
|
||||
if spec:
|
||||
@ -3379,6 +3436,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||
# n challenge
|
||||
query = parse_qs(fmt_url)
|
||||
if query.get('n'):
|
||||
if skip_player_js:
|
||||
continue
|
||||
n_challenge = query['n'][0]
|
||||
if n_challenge in self._player_cache:
|
||||
fmt_url = update_url_query(fmt_url, {'n': self._player_cache[n_challenge]})
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user