Compare commits

..

3 Commits

Author SHA1 Message Date
doe1080
6ae3543d5a
[ie] _rta_search: Do not assume age_limit is 0 (#13985)
Authored by: doe1080
2025-08-16 04:28:58 +00:00
doe1080
770119bdd1
[ie] Extract avif storyboard formats from MPD manifests (#14016)
Authored by: doe1080
2025-08-16 03:32:21 +00:00
Arseniy D.
8e3f8065af
[ie/weibo] Fix extractors (#14012)
Closes #14012
Authored by: AzartX47, bashonly

Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com>
2025-08-16 03:07:35 +00:00
3 changed files with 43 additions and 32 deletions

View File

@@ -1527,11 +1527,11 @@ class InfoExtractor:
r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b', r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b',
] ]
age_limit = 0 age_limit = None
for marker in AGE_LIMIT_MARKERS: for marker in AGE_LIMIT_MARKERS:
mobj = re.search(marker, html) mobj = re.search(marker, html)
if mobj: if mobj:
age_limit = max(age_limit, int(traverse_obj(mobj, 1, default=18))) age_limit = max(age_limit or 0, int(traverse_obj(mobj, 1, default=18)))
return age_limit return age_limit
def _media_rating_search(self, html): def _media_rating_search(self, html):
@@ -2968,7 +2968,7 @@ class InfoExtractor:
else: else:
codecs = parse_codecs(codec_str) codecs = parse_codecs(codec_str)
if content_type not in ('video', 'audio', 'text'): if content_type not in ('video', 'audio', 'text'):
if mime_type == 'image/jpeg': if mime_type in ('image/avif', 'image/jpeg'):
content_type = mime_type content_type = mime_type
elif codecs.get('vcodec', 'none') != 'none': elif codecs.get('vcodec', 'none') != 'none':
content_type = 'video' content_type = 'video'
@@ -3028,14 +3028,14 @@ class InfoExtractor:
'manifest_url': mpd_url, 'manifest_url': mpd_url,
'filesize': filesize, 'filesize': filesize,
} }
elif content_type == 'image/jpeg': elif content_type in ('image/avif', 'image/jpeg'):
# See test case in VikiIE # See test case in VikiIE
# https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1 # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
f = { f = {
'format_id': format_id, 'format_id': format_id,
'ext': 'mhtml', 'ext': 'mhtml',
'manifest_url': mpd_url, 'manifest_url': mpd_url,
'format_note': 'DASH storyboards (jpeg)', 'format_note': f'DASH storyboards ({mimetype2ext(mime_type)})',
'acodec': 'none', 'acodec': 'none',
'vcodec': 'none', 'vcodec': 'none',
} }
@@ -3177,7 +3177,7 @@ class InfoExtractor:
'url': mpd_url or base_url, 'url': mpd_url or base_url,
'fragment_base_url': base_url, 'fragment_base_url': base_url,
'fragments': [], 'fragments': [],
'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml', 'protocol': 'mhtml' if mime_type in ('image/avif', 'image/jpeg') else 'http_dash_segments',
}) })
if 'initialization_url' in representation_ms_info: if 'initialization_url' in representation_ms_info:
initialization_url = representation_ms_info['initialization_url'] initialization_url = representation_ms_info['initialization_url']
@@ -3192,7 +3192,7 @@ class InfoExtractor:
else: else:
# Assuming direct URL to unfragmented media. # Assuming direct URL to unfragmented media.
f['url'] = base_url f['url'] = base_url
if content_type in ('video', 'audio', 'image/jpeg'): if content_type in ('video', 'audio', 'image/avif', 'image/jpeg'):
f['manifest_stream_number'] = stream_numbers[f['url']] f['manifest_stream_number'] = stream_numbers[f['url']]
stream_numbers[f['url']] += 1 stream_numbers[f['url']] += 1
period_entry['formats'].append(f) period_entry['formats'].append(f)

View File

@@ -121,7 +121,6 @@ class GenericIE(InfoExtractor):
'id': 'cauky-lidi-70-dil-babis-predstavil-pohadky-prymulanek-nebo-andrejovy-nove-saty-ac867', 'id': 'cauky-lidi-70-dil-babis-predstavil-pohadky-prymulanek-nebo-andrejovy-nove-saty-ac867',
'ext': 'mp4', 'ext': 'mp4',
'title': 'čauky lidi 70 finall', 'title': 'čauky lidi 70 finall',
'age_limit': 0,
'description': 'md5:47b2673a5b76780d9d329783e1fbf5aa', 'description': 'md5:47b2673a5b76780d9d329783e1fbf5aa',
'direct': True, 'direct': True,
'duration': 318.0, 'duration': 318.0,
@@ -244,7 +243,6 @@ class GenericIE(InfoExtractor):
'id': 'paris-d-moll', 'id': 'paris-d-moll',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Paris d-moll', 'title': 'Paris d-moll',
'age_limit': 0,
'description': 'md5:319e37ea5542293db37e1e13072fe330', 'description': 'md5:319e37ea5542293db37e1e13072fe330',
'thumbnail': r're:https?://www\.filmarkivet\.se/wp-content/uploads/.+\.jpg', 'thumbnail': r're:https?://www\.filmarkivet\.se/wp-content/uploads/.+\.jpg',
}, },
@@ -255,7 +253,6 @@ class GenericIE(InfoExtractor):
'info_dict': { 'info_dict': {
'id': '60413035', 'id': '60413035',
'title': 'Etter ett års planlegging, klaffet endelig alt: - Jeg måtte ta en liten dans', 'title': 'Etter ett års planlegging, klaffet endelig alt: - Jeg måtte ta en liten dans',
'age_limit': 0,
'description': 'md5:bbb4e12e42e78609a74fd421b93b1239', 'description': 'md5:bbb4e12e42e78609a74fd421b93b1239',
'thumbnail': r're:https?://www\.dagbladet\.no/images/.+', 'thumbnail': r're:https?://www\.dagbladet\.no/images/.+',
}, },
@@ -267,7 +264,6 @@ class GenericIE(InfoExtractor):
'info_dict': { 'info_dict': {
'id': 'single_clip', 'id': 'single_clip',
'title': 'Single Clip player examples', 'title': 'Single Clip player examples',
'age_limit': 0,
}, },
'playlist_count': 3, 'playlist_count': 3,
}, { }, {
@@ -324,7 +320,6 @@ class GenericIE(InfoExtractor):
'id': 'videos-1', 'id': 'videos-1',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Videos & Audio - King Machine (1)', 'title': 'Videos & Audio - King Machine (1)',
'age_limit': 0,
'description': 'Browse King Machine videos & audio for sweet media. Your eyes will thank you.', 'description': 'Browse King Machine videos & audio for sweet media. Your eyes will thank you.',
'thumbnail': r're:https?://media\.indiedb\.com/cache/images/.+\.jpg', 'thumbnail': r're:https?://media\.indiedb\.com/cache/images/.+\.jpg',
'_old_archive_ids': ['generic videos'], '_old_archive_ids': ['generic videos'],
@@ -363,7 +358,6 @@ class GenericIE(InfoExtractor):
'id': '21217', 'id': '21217',
'ext': 'mp4', 'ext': 'mp4',
'title': '40 ночей (2016) - BogMedia.org', 'title': '40 ночей (2016) - BogMedia.org',
'age_limit': 0,
'description': 'md5:4e6d7d622636eb7948275432eb256dc3', 'description': 'md5:4e6d7d622636eb7948275432eb256dc3',
'display_id': '40-nochey-2016', 'display_id': '40-nochey-2016',
'thumbnail': r're:https?://bogmedia\.org/contents/videos_screenshots/.+\.jpg', 'thumbnail': r're:https?://bogmedia\.org/contents/videos_screenshots/.+\.jpg',
@@ -378,7 +372,6 @@ class GenericIE(InfoExtractor):
'id': '18485', 'id': '18485',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Клип: Ленинград - ЗОЖ скачать, смотреть онлайн | Youix.com', 'title': 'Клип: Ленинград - ЗОЖ скачать, смотреть онлайн | Youix.com',
'age_limit': 0,
'display_id': 'leningrad-zoj', 'display_id': 'leningrad-zoj',
'thumbnail': r're:https?://youix\.com/contents/videos_screenshots/.+\.jpg', 'thumbnail': r're:https?://youix\.com/contents/videos_screenshots/.+\.jpg',
}, },
@@ -419,7 +412,6 @@ class GenericIE(InfoExtractor):
'id': '105', 'id': '105',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Kelis - 4th Of July / Embed Player', 'title': 'Kelis - 4th Of July / Embed Player',
'age_limit': 0,
'display_id': 'kelis-4th-of-july', 'display_id': 'kelis-4th-of-july',
'thumbnail': r're:https?://www\.kvs-demo\.com/contents/videos_screenshots/.+\.jpg', 'thumbnail': r're:https?://www\.kvs-demo\.com/contents/videos_screenshots/.+\.jpg',
}, },
@@ -430,9 +422,8 @@ class GenericIE(InfoExtractor):
'info_dict': { 'info_dict': {
'id': 'beltzlaw-1', 'id': 'beltzlaw-1',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Beltz Law Group | Dallas Traffic Ticket, Accident & Criminal Attorney (1)', 'title': str,
'age_limit': 0, 'description': str,
'description': 'md5:5bdf23fcb76801dc3b31e74cabf82147',
'thumbnail': r're:https?://beltzlaw\.com/wp-content/uploads/.+\.jpg', 'thumbnail': r're:https?://beltzlaw\.com/wp-content/uploads/.+\.jpg',
'timestamp': int, # varies 'timestamp': int, # varies
'upload_date': str, 'upload_date': str,
@@ -447,7 +438,6 @@ class GenericIE(InfoExtractor):
'id': 'cine-1', 'id': 'cine-1',
'ext': 'webm', 'ext': 'webm',
'title': 'CINE.AR (1)', 'title': 'CINE.AR (1)',
'age_limit': 0,
'description': 'md5:a4e58f9e2291c940e485f34251898c4a', 'description': 'md5:a4e58f9e2291c940e485f34251898c4a',
'thumbnail': r're:https?://cine\.ar/img/.+\.png', 'thumbnail': r're:https?://cine\.ar/img/.+\.png',
'_old_archive_ids': ['generic cine'], '_old_archive_ids': ['generic cine'],
@@ -461,7 +451,6 @@ class GenericIE(InfoExtractor):
'id': 'ipy2AcGL', 'id': 'ipy2AcGL',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Hoe een bladvlo dit verwoestende Japanse onkruid moet vernietigen', 'title': 'Hoe een bladvlo dit verwoestende Japanse onkruid moet vernietigen',
'age_limit': 0,
'description': 'md5:6a9d644bab0dc2dc06849c2505d8383d', 'description': 'md5:6a9d644bab0dc2dc06849c2505d8383d',
'duration': 111.0, 'duration': 111.0,
'thumbnail': r're:https?://images\.nu\.nl/.+\.jpg', 'thumbnail': r're:https?://images\.nu\.nl/.+\.jpg',
@@ -477,7 +466,6 @@ class GenericIE(InfoExtractor):
'id': 'porsche-911-gt3-rs-rij-impressie-2', 'id': 'porsche-911-gt3-rs-rij-impressie-2',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Test: Porsche 911 GT3 RS - AutoWeek', 'title': 'Test: Porsche 911 GT3 RS - AutoWeek',
'age_limit': 0,
'description': 'md5:a17b5bd84288448d8f11b838505718fc', 'description': 'md5:a17b5bd84288448d8f11b838505718fc',
'direct': True, 'direct': True,
'thumbnail': r're:https?://images\.autoweek\.nl/.+', 'thumbnail': r're:https?://images\.autoweek\.nl/.+',
@@ -493,7 +481,6 @@ class GenericIE(InfoExtractor):
'id': 'k6gl2kt2eq', 'id': 'k6gl2kt2eq',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Breezy HR\'s ATS helps you find & hire employees sooner', 'title': 'Breezy HR\'s ATS helps you find & hire employees sooner',
'age_limit': 0,
'average_rating': 4.5, 'average_rating': 4.5,
'description': 'md5:eee75fdd3044c538003f3be327ba01e1', 'description': 'md5:eee75fdd3044c538003f3be327ba01e1',
'duration': 60.1, 'duration': 60.1,
@@ -509,7 +496,6 @@ class GenericIE(InfoExtractor):
'id': 'videojs_hls_test', 'id': 'videojs_hls_test',
'ext': 'mp4', 'ext': 'mp4',
'title': 'video', 'title': 'video',
'age_limit': 0,
'duration': 1800, 'duration': 1800,
}, },
'params': {'skip_download': 'm3u8'}, 'params': {'skip_download': 'm3u8'},

View File

@@ -52,13 +52,16 @@ class WeiboBaseIE(InfoExtractor):
'_rand': random.random(), '_rand': random.random(),
}) })
def _weibo_download_json(self, url, video_id, *args, fatal=True, note='Downloading JSON metadata', **kwargs): def _weibo_download_json(self, url, video_id, note='Downloading JSON metadata', data=None, headers=None, query=None):
# XXX: Always fatal; _download_webpage_handle only returns False (not a tuple) on error headers = {
webpage, urlh = self._download_webpage_handle(url, video_id, *args, fatal=fatal, note=note, **kwargs) 'Referer': 'https://weibo.com/',
**(headers or {}),
}
webpage, urlh = self._download_webpage_handle(url, video_id, note=note, data=data, headers=headers, query=query)
if urllib.parse.urlparse(urlh.url).netloc == 'passport.weibo.com': if urllib.parse.urlparse(urlh.url).netloc == 'passport.weibo.com':
self._update_visitor_cookies(urlh.url, video_id) self._update_visitor_cookies(urlh.url, video_id)
webpage = self._download_webpage(url, video_id, *args, fatal=fatal, note=note, **kwargs) webpage = self._download_webpage(url, video_id, note=note, data=data, headers=headers, query=query)
return self._parse_json(webpage, video_id, fatal=fatal) return self._parse_json(webpage, video_id)
def _extract_formats(self, video_info): def _extract_formats(self, video_info):
media_info = traverse_obj(video_info, ('page_info', 'media_info')) media_info = traverse_obj(video_info, ('page_info', 'media_info'))
@@ -189,7 +192,8 @@ class WeiboIE(WeiboBaseIE):
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
meta = self._weibo_download_json(f'https://weibo.com/ajax/statuses/show?id={video_id}', video_id) meta = self._weibo_download_json(
'https://weibo.com/ajax/statuses/show', video_id, query={'id': video_id})
mix_media_info = traverse_obj(meta, ('mix_media_info', 'items', ...)) mix_media_info = traverse_obj(meta, ('mix_media_info', 'items', ...))
if not mix_media_info: if not mix_media_info:
return self._parse_video_info(meta) return self._parse_video_info(meta)
@@ -205,7 +209,7 @@ class WeiboIE(WeiboBaseIE):
class WeiboVideoIE(WeiboBaseIE): class WeiboVideoIE(WeiboBaseIE):
_VALID_URL = r'https?://(?:www\.)?weibo\.com/tv/show/(?P<id>\d+:\d+)' _VALID_URL = r'https?://(?:www\.)?weibo\.com/tv/show/(?P<id>\d+:(?:[\da-f]{32}|\d{16,}))'
_TESTS = [{ _TESTS = [{
'url': 'https://weibo.com/tv/show/1034:4797699866951785?from=old_pc_videoshow', 'url': 'https://weibo.com/tv/show/1034:4797699866951785?from=old_pc_videoshow',
'info_dict': { 'info_dict': {
@@ -227,6 +231,27 @@ class WeiboVideoIE(WeiboBaseIE):
'repost_count': int, 'repost_count': int,
'_old_archive_ids': ['weibomobile 4797700463137878'], '_old_archive_ids': ['weibomobile 4797700463137878'],
}, },
}, {
'url': 'https://weibo.com/tv/show/1034:633c288cc043d0ca7808030f1157da64',
'info_dict': {
'id': '4189191225395228',
'ext': 'mp4',
'display_id': 'FBqgOmDxO',
'title': '柴犬柴犬的秒拍视频',
'alt_title': '柴犬柴犬的秒拍视频',
'description': '午睡当然是要甜甜蜜蜜的啦![坏笑] Instagramshibainu.gaku http://t.cn/RHbmjzW \u200B\u200B\u200B',
'uploader': '柴犬柴犬',
'uploader_id': '5926682210',
'uploader_url': 'https://weibo.com/u/5926682210',
'view_count': int,
'like_count': int,
'repost_count': int,
'duration': 53,
'thumbnail': 'https://wx1.sinaimg.cn/large/006t5KMygy1fmu31fsqbej30hs0hstav.jpg',
'timestamp': 1514264429,
'upload_date': '20171226',
'_old_archive_ids': ['weibomobile 4189191225395228'],
},
}] }]
def _real_extract(self, url): def _real_extract(self, url):
@@ -234,8 +259,8 @@ class WeiboVideoIE(WeiboBaseIE):
post_data = f'data={{"Component_Play_Playinfo":{{"oid":"{video_id}"}}}}'.encode() post_data = f'data={{"Component_Play_Playinfo":{{"oid":"{video_id}"}}}}'.encode()
video_info = self._weibo_download_json( video_info = self._weibo_download_json(
f'https://weibo.com/tv/api/component?page=%2Ftv%2Fshow%2F{video_id.replace(":", "%3A")}', 'https://weibo.com/tv/api/component', video_id, data=post_data, headers={'Referer': url},
video_id, headers={'Referer': url}, data=post_data)['data']['Component_Play_Playinfo'] query={'page': f'/tv/show/{video_id}'})['data']['Component_Play_Playinfo']
return self.url_result(f'https://weibo.com/0/{video_info["mid"]}', WeiboIE) return self.url_result(f'https://weibo.com/0/{video_info["mid"]}', WeiboIE)