From 8b8e3e3cb4d3ba0dedf7b1fd00ce68f07da7e588 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Wed, 1 Jul 2026 16:46:52 -0500 Subject: [PATCH] [ie/instagram] Add fallback for when impersonation is unavailable (#17113) Fix f49b551a0c4c25358d2afaeda4ee63989d2d56ab Authored by: bashonly --- yt_dlp/extractor/instagram.py | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/yt_dlp/extractor/instagram.py b/yt_dlp/extractor/instagram.py index 9e1fc59c52..880b78f5d3 100644 --- a/yt_dlp/extractor/instagram.py +++ b/yt_dlp/extractor/instagram.py @@ -380,6 +380,8 @@ class InstagramIE(InstagramBaseIE): 'only_matching': True, }] + _SJS_RE = re.compile(r'<script[^>]+\bdata-sjs>(\{.+?\})</script>') + _lsd_token = None @classmethod @@ -397,11 +399,9 @@ class InstagramIE(InstagramBaseIE): if self._is_logged_in: return if not self._lsd_token: - webpage = self._download_webpage( - self._BASE_URL, None, 'Setting up session', - impersonate=True, require_impersonation=True) + webpage = self._download_webpage(self._BASE_URL, None, 'Setting up session', impersonate=True) eqmc = self._search_json( - r'<script[^>]* id="__eqmc"[^>]*>', webpage, 'eqmc JSON', None, default={}) + r'<script[^>]*\bid="__eqmc"[^>]*>', webpage, 'eqmc JSON', None, default={}) self._lsd_token = ( traverse_obj(eqmc, ('l', {str})) or self._search_regex(r'\["LSD",\[\],\{"token":"([^"]+)"', webpage, 'LSD token')) @@ -419,7 +419,7 @@ class InstagramIE(InstagramBaseIE): api_check = self._download_json( f'{self._API_BASE_URL}/web/get_ruling_for_content/', video_id, 'Checking post accessibility', errnote=False, fatal=False, - impersonate=True, require_impersonation=True, headers=self._api_headers, + impersonate=True, headers=self._api_headers, query={'content_type': 'MEDIA', 'target_id': media_id}) or {} csrf_token = self._get_cookies('https://www.instagram.com').get('csrftoken') @@ -432,7 +432,7 @@ class InstagramIE(InstagramBaseIE): response = self._download_json( 
'https://www.instagram.com/api/graphql', video_id, - impersonate=True, require_impersonation=True, + fatal=False, impersonate=True, headers=filter_dict({ **self._api_headers, 'X-FB-Friendly-Name': 'PolarisLoggedOutDesktopWWWPostRootContentQuery', @@ -441,10 +441,6 @@ class InstagramIE(InstagramBaseIE): 'X-Requested-With': 'XMLHttpRequest', 'Referer': url, }), data=urlencode_postdata({ - 'av': '0', - '__d': 'www', - '__user': '0', - 'dpr': '1', 'lsd': self._lsd_token, 'fb_api_caller_class': 'RelayModern', 'fb_api_req_friendly_name': 'PolarisLoggedOutDesktopWWWPostRootContentQuery', @@ -455,6 +451,7 @@ class InstagramIE(InstagramBaseIE): media = traverse_obj(response, ('data', 'xig_polaris_media', {dict})) product_info = traverse_obj(media, ('if_not_gated_logged_out', {dict})) + if not product_info: error = join_nonempty('title', 'description', delim=': ', from_dict=api_check) if 'Restricted Video' in error: @@ -466,6 +463,23 @@ class InstagramIE(InstagramBaseIE): # Only raise after getting empty response; sometimes "long"-shortcode posts are public self.raise_login_required( 'This content is only available for registered users who follow this account') + + webpage, urlh = self._download_webpage_handle( + f'https://www.instagram.com/p/{video_id}', video_id) + if urlh.url.startswith(self._LOGIN_URL): + self.raise_login_required( + 'The webpage request was redirected to the login page. ' + 'You have exceeded the rate-limit for accessing posts anonymously') + + media = traverse_obj(webpage, ( + {self._SJS_RE.findall}, ..., {json.loads}, + 'require', ..., ..., ..., '__bbox', 'require', + lambda _, v: v[0] == 'RelayPrefetchedStreamCache', ..., + lambda _, v: v['__bbox']['result']['data']['xig_polaris_media'], + '__bbox', 'result', 'data', 'xig_polaris_media', {dict}, any)) + product_info = traverse_obj(media, ('if_not_gated_logged_out', {dict})) + + if not product_info: raise ExtractorError( 'Instagram sent an empty media response. 
Check if this post is accessible in your ' f'browser without being logged-in. If it is not, then u{self._login_hint()[1:]}. '