Compare commits

...

5 Commits

Author SHA1 Message Date
bashonly
2a7e048a60
[ie/facebook] Remove broken login support (#15434)
Authored by: bashonly
2025-12-30 00:48:11 +00:00
bashonly
a6ba714005
[ie/twitter] Remove broken login support (#15432)
Closes #12616
Authored by: bashonly
2025-12-30 00:22:33 +00:00
bashonly
ce9a3591f8
[ie/twitter] Do not extract non-video posts from unified_cards (#15431)
Closes #15402
Authored by: bashonly
2025-12-30 00:20:44 +00:00
bashonly
d22436e5dc
[ie/youtube] Support comment subthreads (#15419)
* Support newly rolled out comment "subthreads"
* Fix comments extraction: all replies were being missed
* Add a `max-depth` element to the `max_comments` extractor-arg
* Fully remove the deprecated `max_comment_depth` extractor-arg

Closes #15303
Authored by: bashonly
2025-12-29 21:46:29 +00:00
bashonly
abf29e3e72
[ie/youtube] Fix player_skip=js extractor-arg (#15428)
Authored by: bashonly
2025-12-29 21:41:48 +00:00
5 changed files with 114 additions and 275 deletions

View File

@ -1859,8 +1859,9 @@ The following extractors use this feature:
* `player_js_variant`: The player javascript variant to use for n/sig deciphering. The known variants are: `main`, `tcc`, `tce`, `es5`, `es6`, `tv`, `tv_es6`, `phone`, `tablet`. The default is `main`, and the others are for debugging purposes. You can use `actual` to go with what is prescribed by the site
* `player_js_version`: The player javascript version to use for n/sig deciphering, in the format of `signature_timestamp@hash` (e.g. `20348@0004de42`). The default is to use what is prescribed by the site, and can be selected with `actual`
* `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side)
* `max_comments`: Limit the amount of comments to gather. Comma-separated list of integers representing `max-comments,max-parents,max-replies,max-replies-per-thread`. Default is `all,all,all,all`
* E.g. `all,all,1000,10` will get a maximum of 1000 replies total, with up to 10 replies per thread. `1000,all,100` will get a maximum of 1000 comments, with a maximum of 100 replies total
* `max_comments`: Limit the amount of comments to gather. Comma-separated list of integers representing `max-comments,max-parents,max-replies,max-replies-per-thread,max-depth`. Default is `all,all,all,all,all`
* A `max-depth` value of `1` will discard all replies, regardless of the `max-replies` or `max-replies-per-thread` values given
* E.g. `all,all,1000,10,2` will get a maximum of 1000 replies total, with up to 10 replies per thread, and only 2 levels of depth (i.e. top-level comments plus their immediate replies). `1000,all,100` will get a maximum of 1000 comments, with a maximum of 100 replies total
* `formats`: Change the types of formats to return. `dashy` (convert HTTP to DASH), `duplicate` (identical content but different URLs or protocol; includes `dashy`), `incomplete` (cannot be downloaded completely - live dash and post-live m3u8), `missing_pot` (include formats that require a PO Token but are missing one)
* `innertube_host`: Innertube API host to use for all API requests; e.g. `studio.youtube.com`, `youtubei.googleapis.com`. Note that cookies exported from one subdomain will not work on others
* `innertube_key`: Innertube API key to use for all API requests. By default, no API key is used

View File

@ -4,8 +4,6 @@ import urllib.parse
from .common import InfoExtractor
from ..compat import compat_etree_fromstring
from ..networking import Request
from ..networking.exceptions import network_exceptions
from ..utils import (
ExtractorError,
clean_html,
@ -64,9 +62,6 @@ class FacebookIE(InfoExtractor):
class=(?P<q1>[\'"])[^\'"]*\bfb-(?:video|post)\b[^\'"]*(?P=q1)[^>]+
data-href=(?P<q2>[\'"])(?P<url>(?:https?:)?//(?:www\.)?facebook.com/.+?)(?P=q2)''',
]
_LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1'
_CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1'
_NETRC_MACHINE = 'facebook'
IE_NAME = 'facebook'
_VIDEO_PAGE_TEMPLATE = 'https://www.facebook.com/video/video.php?v=%s'
@ -469,65 +464,6 @@ class FacebookIE(InfoExtractor):
'graphURI': '/api/graphql/',
}
def _perform_login(self, username, password):
    # Log in to Facebook. The login page is fetched first to harvest the
    # hidden form tokens ('lsd' and 'lgnrnd') that must accompany the
    # credentials. Failures are reported as warnings (extraction continues
    # anonymously), except an explicit site error which is raised.
    login_page_req = Request(self._LOGIN_URL)
    # Force an English locale so the regexes below match the served markup
    self._set_cookie('facebook.com', 'locale', 'en_US')
    login_page = self._download_webpage(login_page_req, None,
                                        note='Downloading login page',
                                        errnote='Unable to download login page')
    lsd = self._search_regex(
        r'<input type="hidden" name="lsd" value="([^"]*)"',
        login_page, 'lsd')
    lgnrnd = self._search_regex(r'name="lgnrnd" value="([^"]*?)"', login_page, 'lgnrnd')

    login_form = {
        'email': username,
        'pass': password,
        'lsd': lsd,
        'lgnrnd': lgnrnd,
        'next': 'http://facebook.com/home.php',
        'default_persistent': '0',
        'legacy_return': '1',
        'timezone': '-60',
        'trynum': '1',
    }
    request = Request(self._LOGIN_URL, urlencode_postdata(login_form))
    request.headers['Content-Type'] = 'application/x-www-form-urlencoded'
    try:
        login_results = self._download_webpage(request, None,
                                               note='Logging in', errnote='unable to fetch login page')
        # A login <form> in the response means authentication did not succeed
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            error = self._html_search_regex(
                r'(?s)<div[^>]+class=(["\']).*?login_error_box.*?\1[^>]*><div[^>]*>.*?</div><div[^>]*>(?P<error>.+?)</div>',
                login_results, 'login error', default=None, group='error')
            if error:
                raise ExtractorError(f'Unable to login: {error}', expected=True)
            self.report_warning('unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
            return

        # Handle the "save device" checkpoint page, if presented; both tokens
        # are required to submit the confirmation form
        fb_dtsg = self._search_regex(
            r'name="fb_dtsg" value="(.+?)"', login_results, 'fb_dtsg', default=None)
        h = self._search_regex(
            r'name="h"\s+(?:\w+="[^"]+"\s+)*?value="([^"]+)"', login_results, 'h', default=None)
        if not fb_dtsg or not h:
            # No checkpoint tokens present; assume the login completed
            return

        check_form = {
            'fb_dtsg': fb_dtsg,
            'h': h,
            'name_action_selected': 'dont_save',
        }
        check_req = Request(self._CHECKPOINT_URL, urlencode_postdata(check_form))
        check_req.headers['Content-Type'] = 'application/x-www-form-urlencoded'
        check_response = self._download_webpage(check_req, None,
                                                note='Confirming login')
        # The submit button still being present means the checkpoint was not cleared
        if re.search(r'id="checkpointSubmitButton"', check_response) is not None:
            self.report_warning('Unable to confirm login, you have to login in your browser and authorize the login.')
    except network_exceptions as err:
        # Network-level failures are non-fatal: warn and continue without auth
        self.report_warning(f'unable to log in: {err}')
        return
def _extract_from_url(self, url, video_id):
webpage = self._download_webpage(
url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id)

View File

@ -32,67 +32,11 @@ from ..utils.traversal import require, traverse_obj
class TwitterBaseIE(InfoExtractor):
_NETRC_MACHINE = 'twitter'
_API_BASE = 'https://api.x.com/1.1/'
_GRAPHQL_API_BASE = 'https://x.com/i/api/graphql/'
_BASE_REGEX = r'https?://(?:(?:www|m(?:obile)?)\.)?(?:(?:twitter|x)\.com|twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid\.onion)/'
_AUTH = 'AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
_LEGACY_AUTH = 'AAAAAAAAAAAAAAAAAAAAAIK1zgAAAAAA2tUWuhGZ2JceoId5GwYWU5GspY4%3DUq7gzFoCZs1QfwGoVdvSac3IniczZEYXIcDyumCauIXpcAPorE'
_flow_token = None
_LOGIN_INIT_DATA = json.dumps({
'input_flow_data': {
'flow_context': {
'debug_overrides': {},
'start_location': {
'location': 'unknown',
},
},
},
'subtask_versions': {
'action_list': 2,
'alert_dialog': 1,
'app_download_cta': 1,
'check_logged_in_account': 1,
'choice_selection': 3,
'contacts_live_sync_permission_prompt': 0,
'cta': 7,
'email_verification': 2,
'end_flow': 1,
'enter_date': 1,
'enter_email': 2,
'enter_password': 5,
'enter_phone': 2,
'enter_recaptcha': 1,
'enter_text': 5,
'enter_username': 2,
'generic_urt': 3,
'in_app_notification': 1,
'interest_picker': 3,
'js_instrumentation': 1,
'menu_dialog': 1,
'notifications_permission_prompt': 2,
'open_account': 2,
'open_home_timeline': 1,
'open_link': 1,
'phone_verification': 4,
'privacy_options': 1,
'security_key': 3,
'select_avatar': 4,
'select_banner': 2,
'settings_list': 7,
'show_code': 1,
'sign_up': 2,
'sign_up_review': 4,
'tweet_selection_urt': 1,
'update_users': 1,
'upload_media': 1,
'user_recommendations_list': 4,
'user_recommendations_urt': 1,
'wait_spinner': 3,
'web_modal': 1,
},
}, separators=(',', ':')).encode()
def _extract_variant_formats(self, variant, video_id):
variant_url = variant.get('url')
@ -172,135 +116,6 @@ class TwitterBaseIE(InfoExtractor):
'x-csrf-token': try_call(lambda: self._get_cookies(self._API_BASE)['ct0'].value),
})
def _call_login_api(self, note, headers, query={}, data=None):
    """Perform one step of the onboarding login flow.

    Posts to the ``onboarding/task.json`` endpoint, validates the response,
    stores the returned flow token on the instance for the next step, and
    returns the ID of the next subtask to complete.

    Raises ExtractorError when the API reports an error, a non-success
    status, or no follow-up subtask.
    """
    resp = self._download_json(
        f'{self._API_BASE}onboarding/task.json', None, note,
        headers=headers, query=query, data=data, expected_status=400)

    api_error = traverse_obj(resp, ('errors', 0, 'message', {str}))
    if api_error:
        raise ExtractorError(f'Login failed, Twitter API says: {api_error}', expected=True)
    if traverse_obj(resp, 'status') != 'success':
        raise ExtractorError('Login was unsuccessful')

    next_subtask = traverse_obj(
        resp, ('subtasks', ..., 'subtask_id', {str}), get_all=False)
    if not next_subtask:
        raise ExtractorError('Twitter API did not return next login subtask')

    self._flow_token = resp['flow_token']
    return next_subtask
def _perform_login(self, username, password):
    # Drive Twitter's multi-step "onboarding" login flow: each API call
    # returns the ID of the next subtask to complete, looping until an auth
    # token cookie is granted (detected via self.is_logged_in).
    if self.is_logged_in:
        return

    # The flow is started as a guest; a guest token is mandatory
    guest_token = self._fetch_guest_token(None)
    headers = {
        **self._set_base_headers(),
        'content-type': 'application/json',
        'x-guest-token': guest_token,
        'x-twitter-client-language': 'en',
        'x-twitter-active-user': 'yes',
        'Referer': 'https://x.com/',
        'Origin': 'https://x.com',
    }

    def build_login_json(*subtask_inputs):
        # Serialize subtask responses together with the current flow token
        return json.dumps({
            'flow_token': self._flow_token,
            'subtask_inputs': subtask_inputs,
        }, separators=(',', ':')).encode()

    def input_dict(subtask_id, text):
        # Payload shape for simple free-text ("enter_text") subtasks
        return {
            'subtask_id': subtask_id,
            'enter_text': {
                'text': text,
                'link': 'next_link',
            },
        }

    # Kick off the flow; _call_login_api stores the flow token internally
    next_subtask = self._call_login_api(
        'Downloading flow token', headers, query={'flow_name': 'login'}, data=self._LOGIN_INIT_DATA)

    while not self.is_logged_in:
        if next_subtask == 'LoginJsInstrumentationSubtask':
            # An empty JSON object is accepted as the instrumentation response
            next_subtask = self._call_login_api(
                'Submitting JS instrumentation response', headers, data=build_login_json({
                    'subtask_id': next_subtask,
                    'js_instrumentation': {
                        'response': '{}',
                        'link': 'next_link',
                    },
                }))

        elif next_subtask == 'LoginEnterUserIdentifierSSO':
            next_subtask = self._call_login_api(
                'Submitting username', headers, data=build_login_json({
                    'subtask_id': next_subtask,
                    'settings_list': {
                        'setting_responses': [{
                            'key': 'user_identifier',
                            'response_data': {
                                'text_data': {
                                    'result': username,
                                },
                            },
                        }],
                        'link': 'next_link',
                    },
                }))

        elif next_subtask == 'LoginEnterAlternateIdentifierSubtask':
            # Twitter asks for a second identifier when it finds the login suspicious
            next_subtask = self._call_login_api(
                'Submitting alternate identifier', headers,
                data=build_login_json(input_dict(next_subtask, self._get_tfa_info(
                    'one of username, phone number or email that was not used as --username'))))

        elif next_subtask == 'LoginEnterPassword':
            next_subtask = self._call_login_api(
                'Submitting password', headers, data=build_login_json({
                    'subtask_id': next_subtask,
                    'enter_password': {
                        'password': password,
                        'link': 'next_link',
                    },
                }))

        elif next_subtask == 'AccountDuplicationCheck':
            next_subtask = self._call_login_api(
                'Submitting account duplication check', headers, data=build_login_json({
                    'subtask_id': next_subtask,
                    'check_logged_in_account': {
                        'link': 'AccountDuplicationCheck_false',
                    },
                }))

        elif next_subtask == 'LoginTwoFactorAuthChallenge':
            # Interactive prompt for the user's 2FA token
            next_subtask = self._call_login_api(
                'Submitting 2FA token', headers, data=build_login_json(input_dict(
                    next_subtask, self._get_tfa_info('two-factor authentication token'))))

        elif next_subtask == 'LoginAcid':
            # Interactive prompt for the emailed/texted confirmation code
            next_subtask = self._call_login_api(
                'Submitting confirmation code', headers, data=build_login_json(input_dict(
                    next_subtask, self._get_tfa_info('confirmation code sent to your email or phone'))))

        elif next_subtask == 'ArkoseLogin':
            # Captcha challenges cannot be solved here; direct the user to cookies
            self.raise_login_required('Twitter is requiring captcha for this login attempt', method='cookies')

        elif next_subtask == 'DenyLoginSubtask':
            self.raise_login_required('Twitter rejected this login attempt as suspicious', method='cookies')

        elif next_subtask == 'LoginSuccessSubtask':
            # Success subtask without is_logged_in becoming true means the
            # expected auth token cookie was never set
            raise ExtractorError('Twitter API did not grant auth token cookie')

        else:
            raise ExtractorError(f'Unrecognized subtask ID "{next_subtask}"')

    self.report_login()
def _call_api(self, path, video_id, query={}, graphql=False):
headers = self._set_base_headers(legacy=not graphql and self._selected_api == 'legacy')
headers.update({
@ -416,6 +231,7 @@ class TwitterCardIE(InfoExtractor):
'live_status': 'not_live',
},
'add_ie': ['Youtube'],
'skip': 'The page does not exist',
},
{
'url': 'https://twitter.com/i/videos/tweet/705235433198714880',
@ -617,6 +433,7 @@ class TwitterIE(TwitterBaseIE):
'comment_count': int,
'_old_archive_ids': ['twitter 852138619213144067'],
},
'skip': 'Suspended',
}, {
'url': 'https://twitter.com/i/web/status/910031516746514432',
'info_dict': {
@ -763,10 +580,10 @@ class TwitterIE(TwitterBaseIE):
'url': 'https://twitter.com/UltimaShadowX/status/1577719286659006464',
'info_dict': {
'id': '1577719286659006464',
'title': 'Ultima - Test',
'title': r're:Ultima.* - Test$',
'description': 'Test https://t.co/Y3KEZD7Dad',
'channel_id': '168922496',
'uploader': 'Ultima',
'uploader': r're:Ultima.*',
'uploader_id': 'UltimaShadowX',
'uploader_url': 'https://twitter.com/UltimaShadowX',
'upload_date': '20221005',
@ -895,11 +712,12 @@ class TwitterIE(TwitterBaseIE):
'uploader': r're:Monique Camarra.+?',
'uploader_id': 'MoniqueCamarra',
'live_status': 'was_live',
'release_timestamp': 1658417414,
'release_timestamp': 1658417305,
'description': r're:Twitter Space participated by Sergej Sumlenny.+',
'timestamp': 1658407771,
'release_date': '20220721',
'upload_date': '20220721',
'thumbnail': 'https://pbs.twimg.com/profile_images/1920514378006188033/xQs6J_yI_400x400.jpg',
},
'add_ie': ['TwitterSpaces'],
'params': {'skip_download': 'm3u8'},
@ -1010,10 +828,10 @@ class TwitterIE(TwitterBaseIE):
'description': 'This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525 https://t.co/cNsA0MoOml',
'thumbnail': 'https://pbs.twimg.com/ext_tw_video_thumb/1600009362759733248/pu/img/XVhFQivj75H_YxxV.jpg?name=orig',
'age_limit': 0,
'uploader': 'Boy Called Mün',
'uploader': 'D U N I Y A',
'repost_count': int,
'upload_date': '20221206',
'title': 'Boy Called Mün - This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525',
'title': 'D U N I Y A - This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525',
'comment_count': int,
'like_count': int,
'tags': [],
@ -1068,6 +886,7 @@ class TwitterIE(TwitterBaseIE):
'comment_count': int,
'_old_archive_ids': ['twitter 1695424220702888009'],
},
'skip': 'Suspended',
}, {
# retweeted_status w/ legacy API
'url': 'https://twitter.com/playstrumpcard/status/1695424220702888009',
@ -1092,6 +911,7 @@ class TwitterIE(TwitterBaseIE):
'_old_archive_ids': ['twitter 1695424220702888009'],
},
'params': {'extractor_args': {'twitter': {'api': ['legacy']}}},
'skip': 'Suspended',
}, {
# Broadcast embedded in tweet
'url': 'https://twitter.com/JessicaDobsonWX/status/1731121063248175384',
@ -1135,7 +955,6 @@ class TwitterIE(TwitterBaseIE):
}, {
# "stale tweet" with typename "TweetWithVisibilityResults"
'url': 'https://twitter.com/RobertKennedyJr/status/1724884212803834154',
'md5': '511377ff8dfa7545307084dca4dce319',
'info_dict': {
'id': '1724883339285544960',
'ext': 'mp4',
@ -1182,6 +1001,30 @@ class TwitterIE(TwitterBaseIE):
'age_limit': 0,
'_old_archive_ids': ['twitter 1790637656616943991'],
},
}, {
# unified_card with 2 items of type video and photo
'url': 'https://x.com/TopHeroes_/status/2001950365332455490',
'info_dict': {
'id': '2001841416071450628',
'ext': 'mp4',
'display_id': '2001950365332455490',
'title': 'Top Heroes - Forgot to close My heroes solo level up in my phone ✨Unlock the fog,...',
'description': r're:Forgot to close My heroes solo level up in my phone ✨Unlock the fog.+',
'uploader': 'Top Heroes',
'uploader_id': 'TopHeroes_',
'uploader_url': 'https://twitter.com/TopHeroes_',
'channel_id': '1737324725620326400',
'comment_count': int,
'like_count': int,
'repost_count': int,
'age_limit': 0,
'duration': 30.278,
'thumbnail': 'https://pbs.twimg.com/amplify_video_thumb/2001841416071450628/img/hpy5KpJh4pO17b65.jpg?name=orig',
'tags': [],
'timestamp': 1766137136,
'upload_date': '20251219',
'_old_archive_ids': ['twitter 2001950365332455490'],
},
}, {
# onion route
'url': 'https://twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid.onion/TwitterBlue/status/1484226494708662273',
@ -1422,14 +1265,14 @@ class TwitterIE(TwitterBaseIE):
if not card:
return
self.write_debug(f'Extracting from card info: {card.get("url")}')
card_name = card['name'].split(':')[-1]
self.write_debug(f'Extracting from {card_name} card info: {card.get("url")}')
binding_values = card['binding_values']
def get_binding_value(k):
o = binding_values.get(k) or {}
return try_get(o, lambda x: x[x['type'].lower() + '_value'])
card_name = card['name'].split(':')[-1]
if card_name == 'player':
yield {
'_type': 'url',
@ -1461,7 +1304,7 @@ class TwitterIE(TwitterBaseIE):
elif card_name == 'unified_card':
unified_card = self._parse_json(get_binding_value('unified_card'), twid)
yield from map(extract_from_video_info, traverse_obj(
unified_card, ('media_entities', ...), expected_type=dict))
unified_card, ('media_entities', lambda _, v: v['type'] == 'video')))
# amplify, promo_video_website, promo_video_convo, appplayer,
# video_direct_message, poll2choice_video, poll3choice_video,
# poll4choice_video, ...

View File

@ -1065,7 +1065,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
return next_continuation
return traverse_obj(renderer, (
('contents', 'items', 'rows'), ..., 'continuationItemRenderer',
('contents', 'items', 'rows', 'subThreads'), ..., 'continuationItemRenderer',
('continuationEndpoint', ('button', 'buttonRenderer', 'command')),
), get_all=False, expected_type=cls._extract_continuation_ep_data)

View File

@ -1660,6 +1660,41 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'live_status': 'not_live',
},
'params': {'skip_download': True},
}, {
# Threaded comments with 4 levels of depth
'url': 'https://www.youtube.com/watch?v=f6HNySwZV4c',
'info_dict': {
'id': 'f6HNySwZV4c',
'ext': 'mp4',
'title': 'dlptestvideo2',
'description': '',
'media_type': 'video',
'uploader': 'cole-dlp-test-acc',
'uploader_id': '@coletdjnz',
'uploader_url': 'https://www.youtube.com/@coletdjnz',
'channel': 'cole-dlp-test-acc',
'channel_id': 'UCiu-3thuViMebBjw_5nWYrA',
'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA',
'view_count': int,
'like_count': int,
'age_limit': 0,
'duration': 5,
'thumbnail': 'https://i.ytimg.com/vi/f6HNySwZV4c/maxresdefault.jpg',
'categories': ['People & Blogs'],
'tags': [],
'timestamp': 1709856007,
'upload_date': '20240308',
'release_timestamp': 1709856007,
'release_date': '20240308',
'playable_in_embed': True,
'availability': 'public',
'live_status': 'not_live',
'comment_count': 15,
},
'params': {
'skip_download': True,
'getcomments': True,
},
}]
_WEBPAGE_TESTS = [{
# <object>
@ -2437,6 +2472,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def extract_thread(contents, entity_payloads):
if not parent:
tracker['current_page_thread'] = 0
if max_depth < tracker['current_depth']:
return
for content in contents:
if not parent and tracker['total_parent_comments'] >= max_parents:
yield
@ -2480,6 +2519,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'Detected YouTube comments looping. Stopping comment extraction '
f'{"for this thread" if parent else ""} as we probably cannot get any more.')
yield
break # Safeguard for recursive call in subthreads code path below
else:
tracker['seen_comment_ids'].add(comment['id'])
@ -2492,12 +2532,28 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
comment_thread_renderer, lambda x: x['replies']['commentRepliesRenderer'], dict)
if comment_replies_renderer:
subthreads = traverse_obj(comment_replies_renderer, (
'subThreads', lambda _, v: v['commentThreadRenderer']))
# Recursively extract from `commentThreadRenderer`s in `subThreads`
if subthreads:
tracker['current_depth'] += 1
for entry in extract_thread(subthreads, entity_payloads):
if entry:
yield entry
tracker['current_depth'] -= 1
# All of the subThreads' `continuationItemRenderer`s were within the nested
# `commentThreadRenderer`s and are now exhausted, so avoid unnecessary recursion below
continue
tracker['current_page_thread'] += 1
tracker['current_depth'] += 1
# Recursively extract from `continuationItemRenderer`s in `subThreads`
comment_entries_iter = self._comment_entries(
comment_replies_renderer, ytcfg, video_id,
parent=comment.get('id'), tracker=tracker)
parent=comment_id, tracker=tracker)
yield from itertools.islice(comment_entries_iter, min(
max_replies_per_thread, max(0, max_replies - tracker['total_reply_comments'])))
tracker['current_depth'] -= 1
# Keeps track of counts across recursive calls
if not tracker:
@ -2509,19 +2565,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'total_reply_comments': 0,
'seen_comment_ids': set(),
'pinned_comment_ids': set(),
'current_depth': 1,
}
# TODO: Deprecated
# YouTube comments have a max depth of 2
max_depth = int_or_none(get_single_config_arg('max_comment_depth'))
if max_depth:
self._downloader.deprecated_feature('[youtube] max_comment_depth extractor argument is deprecated. '
'Set max replies in the max-comments extractor argument instead')
if max_depth == 1 and parent:
return
_max_comments, max_parents, max_replies, max_replies_per_thread, max_depth, *_ = (
int_or_none(p, default=sys.maxsize) for p in self._configuration_arg('max_comments') + [''] * 5)
_max_comments, max_parents, max_replies, max_replies_per_thread, *_ = (
int_or_none(p, default=sys.maxsize) for p in self._configuration_arg('max_comments') + [''] * 4)
if max_depth < tracker['current_depth']:
return
continuation = self._extract_continuation(root_continuation_data)
@ -2550,6 +2601,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
note_prefix = ' Downloading comment API JSON reply thread %d %s' % (
tracker['current_page_thread'], comment_prog_str)
else:
# TODO: `parent` is only truthy in this code path with YT's legacy (non-threaded) comment view
note_prefix = '{}Downloading comment{} API JSON page {} {}'.format(
' ' if parent else '', ' replies' if parent else '',
page_num, comment_prog_str)
@ -2566,6 +2618,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
ep='next', ytcfg=ytcfg, headers=headers, note=note_prefix,
check_get_keys=check_get_keys)
except ExtractorError as e:
# TODO: This code path is not reached since eb5bdbfa70126c7d5355cc0954b63720522e462c
# Ignore incomplete data error for replies if retries didn't work.
# This is to allow any other parent comments and comment threads to be downloaded.
# See: https://github.com/yt-dlp/yt-dlp/issues/4669
@ -3307,6 +3360,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def process_https_formats():
proto = 'https'
https_fmts = []
skip_player_js = 'js' in self._configuration_arg('player_skip')
for fmt_stream in streaming_formats:
if fmt_stream.get('targetDurationSec'):
continue
@ -3344,13 +3399,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
sc = urllib.parse.parse_qs(fmt_stream.get('signatureCipher'))
fmt_url = url_or_none(try_get(sc, lambda x: x['url'][0]))
encrypted_sig = try_get(sc, lambda x: x['s'][0])
if not all((sc, fmt_url, player_url, encrypted_sig)):
msg = f'Some {client_name} client https formats have been skipped as they are missing a url. '
if not all((sc, fmt_url, skip_player_js or player_url, encrypted_sig)):
msg = f'Some {client_name} client https formats have been skipped as they are missing a URL. '
if client_name in ('web', 'web_safari'):
msg += 'YouTube is forcing SABR streaming for this client. '
else:
msg += (
f'YouTube may have enabled the SABR-only or Server-Side Ad Placement experiment for '
f'YouTube may have enabled the SABR-only streaming experiment for '
f'{"your account" if self.is_authenticated else "the current session"}. '
)
msg += 'See https://github.com/yt-dlp/yt-dlp/issues/12482 for more details'
@ -3366,6 +3421,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# signature
# Attempt to load sig spec from cache
if encrypted_sig:
if skip_player_js:
continue
spec_cache_id = self._sig_spec_cache_id(player_url, len(encrypted_sig))
spec = self._load_sig_spec_from_cache(spec_cache_id)
if spec:
@ -3379,6 +3436,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# n challenge
query = parse_qs(fmt_url)
if query.get('n'):
if skip_player_js:
continue
n_challenge = query['n'][0]
if n_challenge in self._player_cache:
fmt_url = update_url_query(fmt_url, {'n': self._player_cache[n_challenge]})