From f569be4602c2a857087e495d5d7ed6060cd97abe Mon Sep 17 00:00:00 2001
From: bashonly <88596187+bashonly@users.noreply.github.com>
Date: Tue, 20 May 2025 13:25:27 -0500
Subject: [PATCH 01/40] [ie/niconico] Fix error handling (#13236)
Closes #11430
Authored by: bashonly
---
yt_dlp/extractor/niconico.py | 47 +++++++++++++++++++++++++-----------
1 file changed, 33 insertions(+), 14 deletions(-)
diff --git a/yt_dlp/extractor/niconico.py b/yt_dlp/extractor/niconico.py
index fc050c383b..0d0f7ceef0 100644
--- a/yt_dlp/extractor/niconico.py
+++ b/yt_dlp/extractor/niconico.py
@@ -32,7 +32,7 @@ from ..utils import (
urlencode_postdata,
urljoin,
)
-from ..utils.traversal import find_element, traverse_obj
+from ..utils.traversal import find_element, require, traverse_obj
class NiconicoBaseIE(InfoExtractor):
@@ -283,35 +283,54 @@ class NiconicoIE(NiconicoBaseIE):
lambda _, v: v['id'] == video_fmt['format_id'], 'qualityLevel', {int_or_none}, any)) or -1
yield video_fmt
+ def _extract_server_response(self, webpage, video_id, fatal=True):
+ try:
+ return traverse_obj(
+ self._parse_json(self._html_search_meta('server-response', webpage) or '', video_id),
+ ('data', 'response', {dict}, {require('server response')}))
+ except ExtractorError:
+ if not fatal:
+ return {}
+ raise
+
def _real_extract(self, url):
video_id = self._match_id(url)
try:
webpage, handle = self._download_webpage_handle(
- 'https://www.nicovideo.jp/watch/' + video_id, video_id)
+ f'https://www.nicovideo.jp/watch/{video_id}', video_id,
+ headers=self.geo_verification_headers())
if video_id.startswith('so'):
video_id = self._match_id(handle.url)
- api_data = traverse_obj(
- self._parse_json(self._html_search_meta('server-response', webpage) or '', video_id),
- ('data', 'response', {dict}))
- if not api_data:
- raise ExtractorError('Server response data not found')
+ api_data = self._extract_server_response(webpage, video_id)
except ExtractorError as e:
try:
api_data = self._download_json(
- f'https://www.nicovideo.jp/api/watch/v3/{video_id}?_frontendId=6&_frontendVersion=0&actionTrackId=AAAAAAAAAA_{round(time.time() * 1000)}', video_id,
- note='Downloading API JSON', errnote='Unable to fetch data')['data']
+ f'https://www.nicovideo.jp/api/watch/v3/{video_id}', video_id,
+ 'Downloading API JSON', 'Unable to fetch data', query={
+ '_frontendId': '6',
+ '_frontendVersion': '0',
+ 'actionTrackId': f'AAAAAAAAAA_{round(time.time() * 1000)}',
+ }, headers=self.geo_verification_headers())['data']
except ExtractorError:
if not isinstance(e.cause, HTTPError):
+ # Raise if original exception was from _parse_json or utils.traversal.require
raise
+ # The webpage server response has more detailed error info than the API response
webpage = e.cause.response.read().decode('utf-8', 'replace')
- error_msg = self._html_search_regex(
- r'(?s)<section\s+class="(?:(?:ErrorMessage|WatchExceptionPage-message)\s*)+">(.+?)</section>',
- webpage, 'error reason', default=None)
- if not error_msg:
+ reason_code = self._extract_server_response(
+ webpage, video_id, fatal=False).get('reasonCode')
+ if not reason_code:
raise
- raise ExtractorError(clean_html(error_msg), expected=True)
+ if reason_code in ('DOMESTIC_VIDEO', 'HIGH_RISK_COUNTRY_VIDEO'):
+ self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
+ elif reason_code == 'HIDDEN_VIDEO':
+ raise ExtractorError(
+ 'The viewing period of this video has expired', expected=True)
+ elif reason_code == 'DELETED_VIDEO':
+ raise ExtractorError('This video has been deleted', expected=True)
+ raise ExtractorError(f'Niconico says: {reason_code}')
availability = self._availability(**(traverse_obj(api_data, ('payment', 'video', {
'needs_premium': ('isPremium', {bool}),
From 545c1a5b6f2fe88722b41aef0e7485bf3be3f3f9 Mon Sep 17 00:00:00 2001
From: bashonly <88596187+bashonly@users.noreply.github.com>
Date: Tue, 20 May 2025 13:28:34 -0500
Subject: [PATCH 02/40] [ie/vimeo:event] Add extractor (#13216)
Closes #1608
Authored by: bashonly
---
yt_dlp/extractor/_extractors.py | 1 +
yt_dlp/extractor/vimeo.py | 428 ++++++++++++++++++++++++++++++--
2 files changed, 413 insertions(+), 16 deletions(-)
diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py
index e7dcb9853e..14a0068934 100644
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -2369,6 +2369,7 @@ from .vimeo import (
VHXEmbedIE,
VimeoAlbumIE,
VimeoChannelIE,
+ VimeoEventIE,
VimeoGroupsIE,
VimeoIE,
VimeoLikesIE,
diff --git a/yt_dlp/extractor/vimeo.py b/yt_dlp/extractor/vimeo.py
index fb9af7acf1..09497b699d 100644
--- a/yt_dlp/extractor/vimeo.py
+++ b/yt_dlp/extractor/vimeo.py
@@ -3,6 +3,7 @@ import functools
import itertools
import json
import re
+import time
import urllib.parse
from .common import InfoExtractor
@@ -13,10 +14,12 @@ from ..utils import (
OnDemandPagedList,
clean_html,
determine_ext,
+ filter_dict,
get_element_by_class,
int_or_none,
join_nonempty,
js_to_json,
+ jwt_decode_hs256,
merge_dicts,
parse_filesize,
parse_iso8601,
@@ -39,6 +42,9 @@ class VimeoBaseInfoExtractor(InfoExtractor):
_NETRC_MACHINE = 'vimeo'
_LOGIN_REQUIRED = False
_LOGIN_URL = 'https://vimeo.com/log_in'
+ _REFERER_HINT = (
+ 'Cannot download embed-only video without embedding URL. Please call yt-dlp '
+ 'with the URL of the page that embeds this video.')
_IOS_CLIENT_AUTH = 'MTMxNzViY2Y0NDE0YTQ5YzhjZTc0YmU0NjVjNDQxYzNkYWVjOWRlOTpHKzRvMmgzVUh4UkxjdU5FRW80cDNDbDhDWGR5dVJLNUJZZ055dHBHTTB4V1VzaG41bEx1a2hiN0NWYWNUcldSSW53dzRUdFRYZlJEZmFoTTArOTBUZkJHS3R4V2llYU04Qnl1bERSWWxUdXRidjNqR2J4SHFpVmtFSUcyRktuQw=='
_IOS_CLIENT_HEADERS = {
'Accept': 'application/vnd.vimeo.*+json; version=3.4.10',
@@ -47,6 +53,7 @@ class VimeoBaseInfoExtractor(InfoExtractor):
}
_IOS_OAUTH_CACHE_KEY = 'oauth-token-ios'
_ios_oauth_token = None
+ _viewer_info = None
@staticmethod
def _smuggle_referrer(url, referrer_url):
@@ -60,8 +67,21 @@ class VimeoBaseInfoExtractor(InfoExtractor):
headers['Referer'] = data['referer']
return url, data, headers
+ def _jwt_is_expired(self, token):
+ return jwt_decode_hs256(token)['exp'] - time.time() < 120
+
+ def _fetch_viewer_info(self, display_id=None, fatal=True):
+ if self._viewer_info and not self._jwt_is_expired(self._viewer_info['jwt']):
+ return self._viewer_info
+
+ self._viewer_info = self._download_json(
+ 'https://vimeo.com/_next/viewer', display_id, 'Downloading web token info',
+ 'Failed to download web token info', fatal=fatal, headers={'Accept': 'application/json'})
+
+ return self._viewer_info
+
def _perform_login(self, username, password):
- viewer = self._download_json('https://vimeo.com/_next/viewer', None, 'Downloading login token')
+ viewer = self._fetch_viewer_info()
data = {
'action': 'login',
'email': username,
@@ -96,11 +116,10 @@ class VimeoBaseInfoExtractor(InfoExtractor):
expected=True)
return password
- def _verify_video_password(self, video_id):
+ def _verify_video_password(self, video_id, path=None):
video_password = self._get_video_password()
- token = self._download_json(
- 'https://vimeo.com/_next/viewer', video_id, 'Downloading viewer info')['xsrft']
- url = f'https://vimeo.com/{video_id}'
+ token = self._fetch_viewer_info(video_id)['xsrft']
+ url = join_nonempty('https://vimeo.com', path, video_id, delim='/')
try:
self._request_webpage(
f'{url}/password', video_id,
@@ -117,6 +136,10 @@ class VimeoBaseInfoExtractor(InfoExtractor):
raise ExtractorError('Wrong password', expected=True)
raise
+ def _extract_config_url(self, webpage, **kwargs):
+ return self._html_search_regex(
+ r'\bdata-config-url="([^"]+)"', webpage, 'config URL', **kwargs)
+
def _extract_vimeo_config(self, webpage, video_id, *args, **kwargs):
vimeo_config = self._search_regex(
r'vimeo\.config\s*=\s*(?:({.+?})|_extend\([^,]+,\s+({.+?})\));',
@@ -164,6 +187,7 @@ class VimeoBaseInfoExtractor(InfoExtractor):
sep_pattern = r'/sep/video/'
for files_type in ('hls', 'dash'):
for cdn_name, cdn_data in (try_get(config_files, lambda x: x[files_type]['cdns']) or {}).items():
+ # TODO: Also extract 'avc_url'? Investigate if there are 'hevc_url', 'av1_url'?
manifest_url = cdn_data.get('url')
if not manifest_url:
continue
@@ -244,7 +268,10 @@ class VimeoBaseInfoExtractor(InfoExtractor):
'formats': formats,
'subtitles': subtitles,
'live_status': live_status,
- 'release_timestamp': traverse_obj(live_event, ('ingest', 'scheduled_start_time', {parse_iso8601})),
+ 'release_timestamp': traverse_obj(live_event, ('ingest', (
+ ('scheduled_start_time', {parse_iso8601}),
+ ('start_time', {int_or_none}),
+ ), any)),
# Note: Bitrates are completely broken. Single m3u8 may contain entries in kbps and bps
# at the same time without actual units specified.
'_format_sort_fields': ('quality', 'res', 'fps', 'hdr:12', 'source'),
@@ -353,7 +380,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
(?:
(?P<u>user)|
(?!(?:channels|album|showcase)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/)
- (?:.*?/)??
+ (?:(?!event/).*?/)??
(?P<q>
(?:
play_redirect_hls|
@@ -933,8 +960,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
r'vimeo\.com/(?:album|showcase)/([^/]+)', url, 'album id', default=None)
if not album_id:
return
- viewer = self._download_json(
- 'https://vimeo.com/_rv/viewer', album_id, fatal=False)
+ viewer = self._fetch_viewer_info(album_id, fatal=False)
if not viewer:
webpage = self._download_webpage(url, album_id)
viewer = self._parse_json(self._search_regex(
@@ -992,9 +1018,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
raise
errmsg = error.cause.response.read()
if b'Because of its privacy settings, this video cannot be played here' in errmsg:
- raise ExtractorError(
- 'Cannot download embed-only video without embedding URL. Please call yt-dlp '
- 'with the URL of the page that embeds this video.', expected=True)
+ raise ExtractorError(self._REFERER_HINT, expected=True)
# 403 == vimeo.com TLS fingerprint or DC IP block; 429 == player.vimeo.com TLS FP block
status = error.cause.status
dcip_msg = 'If you are using a data center IP or VPN/proxy, your IP may be blocked'
@@ -1039,8 +1063,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
channel_id = self._search_regex(
r'vimeo\.com/channels/([^/]+)', url, 'channel id', default=None)
if channel_id:
- config_url = self._html_search_regex(
- r'\bdata-config-url="([^"]+)"', webpage, 'config URL', default=None)
+ config_url = self._extract_config_url(webpage, default=None)
video_description = clean_html(get_element_by_class('description', webpage))
info_dict.update({
'channel_id': channel_id,
@@ -1333,8 +1356,7 @@ class VimeoAlbumIE(VimeoBaseInfoExtractor):
def _real_extract(self, url):
album_id = self._match_id(url)
- viewer = self._download_json(
- 'https://vimeo.com/_rv/viewer', album_id, fatal=False)
+ viewer = self._fetch_viewer_info(album_id, fatal=False)
if not viewer:
webpage = self._download_webpage(url, album_id)
viewer = self._parse_json(self._search_regex(
@@ -1626,3 +1648,377 @@ class VimeoProIE(VimeoBaseInfoExtractor):
return self.url_result(vimeo_url, VimeoIE, video_id, url_transparent=True,
description=description)
+
+
+class VimeoEventIE(VimeoBaseInfoExtractor):
+ IE_NAME = 'vimeo:event'
+ _VALID_URL = r'''(?x)
+ https?://(?:www\.)?vimeo\.com/event/(?P<id>\d+)(?:/
+ (?:
+ (?:embed/)?(?P<unlisted_hash>[\da-f]{10})|
+ videos/(?P<video_id>\d+)
+ )
+ )?'''
+ _EMBED_REGEX = [r'