mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2026-06-12 22:04:42 +00:00
[ie/wikimedia] Rework extractor (#15413)
Closes #16411 Authored by: seproDev
This commit is contained in:
parent
aaa1c78956
commit
3d1f8a4a0d
@ -1,55 +1,133 @@
|
||||
import re
|
||||
import urllib.parse
|
||||
|
||||
from .common import InfoExtractor
|
||||
from ..utils import (
|
||||
clean_html,
|
||||
get_element_by_class,
|
||||
parse_qs,
|
||||
remove_start,
|
||||
unescapeHTML,
|
||||
urljoin,
|
||||
float_or_none,
|
||||
int_or_none,
|
||||
mimetype2ext,
|
||||
parse_codecs,
|
||||
parse_iso8601,
|
||||
str_or_none,
|
||||
url_or_none,
|
||||
)
|
||||
from ..utils.traversal import require, traverse_obj
|
||||
from ..version import __version__ as YT_DLP_VERSION
|
||||
|
||||
|
||||
class WikimediaIE(InfoExtractor):
    """Extractor for ``File:`` pages on commons.wikimedia.org.

    Formats, subtitles and metadata are all taken from a single MediaWiki API
    query (``prop=videoinfo``); the HTML file page itself is not downloaded.
    """

    IE_NAME = 'wikimedia.org'
    # The captured ID is the full page title after "File:", file extension included
    _VALID_URL = r'https?://commons\.wikimedia\.org/wiki/File:(?P<id>[^/#?]+)'
    _TESTS = [{
        'url': 'https://commons.wikimedia.org/wiki/File:Die_Temperaturkurve_der_Erde_(ZDF,_Terra_X)_720p_HD_50FPS.webm',
        'info_dict': {
            'ext': 'webm',
            'id': '83227919',
            'display_id': 'Die_Temperaturkurve_der_Erde_(ZDF,_Terra_X)_720p_HD_50FPS.webm',
            'title': 'Die Temperaturkurve der Erde (ZDF, Terra X) 720p HD 50FPS',
            'description': 'Climate change, Temperature in history of Earth, Video of Terra X.',
            'uploader': 'ZDF Terra X Redaktion',
            'duration': 45.327,
            'categories': 'count:16',
            'timestamp': 1597848846,
            'upload_date': '20200819',
            'license': 'Creative Commons Attribution 4.0',
            'subtitles': 'count:3',
        },
    }, {
        # url needs unquoting
        'url': 'https://commons.wikimedia.org/wiki/File:Two-toed_sloth_rail_walking_%E4%BA%8C%E8%B6%BE%E6%A8%B9%E7%8D%BA%E7%88%AC%E8%A1%8C_(HD).webm',
        'info_dict': {
            'ext': 'webm',
            'id': '165082300',
            'display_id': 'Two-toed_sloth_rail_walking_%E4%BA%8C%E8%B6%BE%E6%A8%B9%E7%8D%BA%E7%88%AC%E8%A1%8C_(HD).webm',
            'title': 'Two-toed sloth rail walking 二趾樹獺爬行 (HD)',
            'description': 'md5:3c32e4c7f6103dde4ecd9e9313b23526',
            'uploader': 'Tvpuppy',
            'duration': 25.688,
            'categories': 'count:8',
            'timestamp': 1747012249,
            'upload_date': '20250512',
            'license': 'Creative Commons Attribution 3.0',
        },
    }]

    # Sent with the API request and attached to every format and subtitle entry
    _HTTP_HEADERS = {
        # Faking a browser user-agent leads to being blocked with a 403.
        # Follow robot policy as per https://wikitech.wikimedia.org/wiki/Robot_policy
        'User-Agent': f'yt-dlp/{YT_DLP_VERSION} (https://github.com/yt-dlp/yt-dlp)',
    }

    @staticmethod
    def _parse_ext_and_codecs(s):
        """Turn a MIME string such as ``video/webm; codecs="vp9, opus"`` into format fields.

        Returns a dict holding ``ext`` (via ``mimetype2ext``) merged with the
        result of ``parse_codecs`` on the optional ``codecs="..."`` parameter,
        or an empty dict when ``s`` is empty or does not match.
        """
        if not s:
            return {}
        if mobj := re.match(r'(?P<mime>[^;]+)(?:;\s*codecs="(?P<codecs>[^"]+)")?', s):
            return {
                'ext': mimetype2ext(mobj.group('mime')),
                # The group is None when no codecs parameter is present
                **parse_codecs(mobj.group('codecs')),
            }
        return {}

    def _real_extract(self, url):
        """Build the info dict for a Commons file page from the ``videoinfo`` API.

        The original upload becomes the ``source`` format, each entry under
        ``derivatives`` becomes an additional format, and each entry under
        ``timedtext`` becomes a subtitle track.
        """
        display_id = self._match_id(url)
        # Keep only the first page object of the response
        api_response = traverse_obj(self._download_json(
            'https://commons.wikimedia.org/w/api.php', display_id, query={
                'action': 'query',
                'format': 'json',
                # The title taken from the URL may be percent-encoded; query with the decoded form
                'titles': f'File:{urllib.parse.unquote(display_id)}',
                'prop': 'videoinfo',
                'viprop': 'timestamp|user|url|size|derivatives|timedtext|extmetadata',
            }, headers=self._HTTP_HEADERS), ('query', 'pages', ..., {dict}, any))

        # NOTE(review): `require` is expected to abort extraction when there is
        # no usable `videoinfo` entry - confirm against utils.traversal
        video_info = traverse_obj(api_response, ('videoinfo', 0, {dict}, {require('video info')}))

        formats = []
        if url_or_none(video_info.get('url')):
            # Original upload; given an explicit quality so it ranks above the derivatives
            formats.append({
                'url': video_info['url'],
                'format_id': 'source',
                'quality': 10,
                **traverse_obj(video_info, {
                    'width': ('width', {int_or_none}),
                    'height': ('height', {int_or_none}),
                    'filesize': ('size', {int_or_none}),
                }),
                'http_headers': self._HTTP_HEADERS,
            })

        # Derivatives without a valid `src` URL are skipped by the filter
        for derivative in traverse_obj(video_info, ('derivatives', lambda _, v: url_or_none(v['src']))):
            formats.append({
                'url': derivative['src'],
                **traverse_obj(derivative, {
                    'width': ('width', {int_or_none}),
                    'height': ('height', {int_or_none}),
                    # `bandwidth` is divided by 1000 to obtain `tbr`
                    'tbr': ('bandwidth', {int_or_none(scale=1000)}),
                }),
                'http_headers': self._HTTP_HEADERS,
                **self._parse_ext_and_codecs(derivative.get('type')),
            })

        subtitles = {}
        for subtitle in traverse_obj(video_info, ('timedtext', lambda _, v: url_or_none(v['src']))):
            # Tracks without a language code are grouped under 'unk'
            lang = subtitle.get('srclang') or 'unk'
            subtitles.setdefault(lang, []).append({
                'url': subtitle['src'],
                'ext': mimetype2ext(subtitle.get('type')),
                'http_headers': self._HTTP_HEADERS,
            })

        return {
            'id': str_or_none(api_response['pageid']),
            'display_id': display_id,
            'formats': formats,
            'subtitles': subtitles,
            **traverse_obj(video_info, {
                'title': ('extmetadata', 'ObjectName', 'value', {str}),
                'timestamp': ('timestamp', {parse_iso8601}),
                'description': ('extmetadata', 'ImageDescription', 'value', {clean_html}),
                'uploader': ('user', {str}),
                'duration': ('duration', {float_or_none}),
                'license': ('extmetadata', 'UsageTerms', 'value', {str}),
                # Categories arrive as one '|'-separated string; split, strip and drop empties
                'categories': ('extmetadata', 'Categories', 'value', {lambda x: x.split('|')}, ..., {str.strip}, filter),
            }),
        }
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user