From 7cecd299e4a5ef1f0f044b2fedc26f17e41f15e3 Mon Sep 17 00:00:00 2001 From: sepro Date: Sun, 17 Nov 2024 13:32:12 +0100 Subject: [PATCH 1/8] [ie/chaturbate] Don't break embed detection (#11565) Bugfix for 720b3dc453c342bc2e8df7dbc0acaab4479de46c Authored by: seproDev --- yt_dlp/extractor/chaturbate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/chaturbate.py b/yt_dlp/extractor/chaturbate.py index aa70f26a1b..a40b7d39c7 100644 --- a/yt_dlp/extractor/chaturbate.py +++ b/yt_dlp/extractor/chaturbate.py @@ -79,7 +79,7 @@ class ChaturbateIE(InfoExtractor): 'formats': self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True), } - def _extract_from_webpage(self, video_id, tld): + def _extract_from_html(self, video_id, tld): webpage = self._download_webpage( f'https://chaturbate.{tld}/{video_id}/', video_id, headers=self.geo_verification_headers(), impersonate=True) @@ -151,4 +151,4 @@ class ChaturbateIE(InfoExtractor): def _real_extract(self, url): video_id, tld = self._match_valid_url(url).group('id', 'tld') - return self._extract_from_api(video_id, tld) or self._extract_from_webpage(video_id, tld) + return self._extract_from_api(video_id, tld) or self._extract_from_html(video_id, tld) From eb15fd5a32d8b35ef515f7a3d1158c03025648ff Mon Sep 17 00:00:00 2001 From: krichbanana <77071421+krichbanana@users.noreply.github.com> Date: Sun, 17 Nov 2024 09:12:26 -0500 Subject: [PATCH 2/8] [ie/kenh14] Add extractor (#3996) Closes #3937 Authored by: krichbanana, pzhlkj6612 Co-authored-by: Mozi <29089388+pzhlkj6612@users.noreply.github.com> --- yt_dlp/extractor/_extractors.py | 4 + yt_dlp/extractor/kenh14.py | 160 ++++++++++++++++++++++++++++++++ 2 files changed, 164 insertions(+) create mode 100644 yt_dlp/extractor/kenh14.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 25a233a2d6..af903f307b 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -946,6 +946,10 @@ from .kaltura import KalturaIE from .kankanews import KankaNewsIE from .karaoketv import KaraoketvIE from .kelbyone import KelbyOneIE +from .kenh14 import ( + Kenh14PlaylistIE, + Kenh14VideoIE, +) from .khanacademy import ( KhanAcademyIE, KhanAcademyUnitIE, diff --git a/yt_dlp/extractor/kenh14.py b/yt_dlp/extractor/kenh14.py new file mode 100644 index 0000000000..3c46020e8b --- /dev/null +++ b/yt_dlp/extractor/kenh14.py @@ -0,0 +1,160 @@ +from .common import InfoExtractor +from ..utils import ( + clean_html, + extract_attributes, + get_element_by_class, + get_element_html_by_attribute, + get_elements_html_by_class, + int_or_none, + parse_duration, + parse_iso8601, + remove_start, + strip_or_none, + unescapeHTML, + update_url, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class Kenh14VideoIE(InfoExtractor): + _VALID_URL = r'https?://video\.kenh14\.vn/(?:video/)?[\w-]+-(?P[0-9]+)\.chn' + _TESTS = [{ + 'url': 'https://video.kenh14.vn/video/mo-hop-iphone-14-pro-max-nguon-unbox-therapy-316173.chn', + 'md5': '1ed67f9c3a1e74acf15db69590cf6210', + 'info_dict': { + 'id': '316173', + 'ext': 'mp4', + 'title': 'Video mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)', + 'description': 'Video mở hộp iPhone 14 Pro MaxVideo mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)', + 'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$', + 'tags': [], + 'uploader': 'Unbox Therapy', + 'upload_date': '20220517', + 'view_count': int, + 'duration': 722.86, + 'timestamp': 1652764468, + }, + }, { + 'url': 'https://video.kenh14.vn/video-316174.chn', + 'md5': '2b41877d2afaf4a3f487ceda8e5c7cbd', + 'info_dict': { + 'id': '316174', + 'ext': 'mp4', + 'title': 'Khoảnh khắc VĐV nằm gục khóc sau chiến thắng: 7 năm trời Việt Nam mới có HCV kiếm chém nữ, chỉ có 8 tháng để khổ luyện trước khi lên sàn đấu', + 'description': 'md5:de86aa22e143e2b277bce8ec9c6f17dc', + 'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$', + 'tags': [], + 'upload_date': '20220517', + 'view_count': int, + 'duration': 70.04, + 'timestamp': 1652766021, + }, + }, { + 'url': 'https://video.kenh14.vn/0-344740.chn', + 'md5': 'b843495d5e728142c8870c09b46df2a9', + 'info_dict': { + 'id': '344740', + 'ext': 'mov', + 'title': 'Kỳ Duyên đầy căng thẳng trong buổi ra quân đi Miss Universe, nghi thức tuyên thuệ lần đầu xuất hiện gây nhiều tranh cãi', + 'description': 'md5:2a2dbb4a7397169fb21ee68f09160497', + 'thumbnail': r're:^https?://kenh14cdn\.com/.*\.jpg$', + 'tags': ['kỳ duyên', 'Kỳ Duyên tuyên thuệ', 'miss universe'], + 'uploader': 'Quang Vũ', + 'upload_date': '20241024', + 'view_count': int, + 'duration': 198.88, + 'timestamp': 1729741590, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + attrs = extract_attributes(get_element_html_by_attribute('type', 'VideoStream', webpage) or '') + direct_url = attrs['data-vid'] + + metadata = self._download_json( + 'https://api.kinghub.vn/video/api/v1/detailVideoByGet?FileName={}'.format( + remove_start(direct_url, 'kenh14cdn.com/')), video_id, fatal=False) + + formats = [{'url': f'https://{direct_url}', 'format_id': 'http', 'quality': 1}] + subtitles = {} + video_data = self._download_json( + f'https://{direct_url}.json', video_id, note='Downloading video data', fatal=False) + if hls_url := traverse_obj(video_data, ('hls', {url_or_none})): + fmts, subs = self._extract_m3u8_formats_and_subtitles( + hls_url, video_id, m3u8_id='hls', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + if dash_url := traverse_obj(video_data, ('mpd', {url_or_none})): + fmts, subs = self._extract_mpd_formats_and_subtitles( + dash_url, video_id, mpd_id='dash', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + return { + **traverse_obj(metadata, { + 'duration': ('duration', {parse_duration}), + 'uploader': ('author', {strip_or_none}), + 'timestamp': ('uploadtime', {parse_iso8601(delimiter=' ')}), + 'view_count': ('views', {int_or_none}), + }), + 'id': video_id, + 'title': ( + traverse_obj(metadata, ('title', {strip_or_none})) + or clean_html(self._og_search_title(webpage)) + or clean_html(get_element_by_class('vdbw-title', webpage))), + 'formats': formats, + 'subtitles': subtitles, + 'description': ( + clean_html(self._og_search_description(webpage)) + or clean_html(get_element_by_class('vdbw-sapo', webpage))), + 'thumbnail': (self._og_search_thumbnail(webpage) or attrs.get('data-thumb')), + 'tags': traverse_obj(self._html_search_meta('keywords', webpage), ( + {lambda x: x.split(';')}, ..., filter)), + } + + +class Kenh14PlaylistIE(InfoExtractor): + _VALID_URL = r'https?://video\.kenh14\.vn/playlist/[\w-]+-(?P[0-9]+)\.chn' + _TESTS = [{ + 'url': 'https://video.kenh14.vn/playlist/tran-tinh-naked-love-mua-2-71.chn', + 'info_dict': { + 'id': '71', + 'title': 'Trần Tình (Naked love) mùa 2', + 'description': 'md5:e9522339304956dea931722dd72eddb2', + 'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$', + }, + 'playlist_count': 9, + }, { + 'url': 'https://video.kenh14.vn/playlist/0-72.chn', + 'info_dict': { + 'id': '72', + 'title': 'Lau Lại Đầu Từ', + 'description': 'Cùng xem xưa và nay có gì khác biệt nhé!', + 'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$', + }, + 'playlist_count': 6, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + + category_detail = get_element_by_class('category-detail', webpage) or '' + embed_info = traverse_obj( + self._yield_json_ld(webpage, playlist_id), + (lambda _, v: v['name'] and v['alternateName'], any)) or {} + + return self.playlist_from_matches( + get_elements_html_by_class('video-item', webpage), playlist_id, + (clean_html(get_element_by_class('name', category_detail)) or unescapeHTML(embed_info.get('name'))), + getter=lambda x: 'https://video.kenh14.vn/video/video-{}.chn'.format(extract_attributes(x)['data-id']), + ie=Kenh14VideoIE, playlist_description=( + clean_html(get_element_by_class('description', category_detail)) + or unescapeHTML(embed_info.get('alternateName'))), + thumbnail=traverse_obj( + self._og_search_thumbnail(webpage), + ({url_or_none}, {update_url(query=None)}))) From 10fc719bc7f1eef469389c5219102266ef411f29 Mon Sep 17 00:00:00 2001 From: doe1080 <98906116+doe1080@users.noreply.github.com> Date: Mon, 18 Nov 2024 01:22:40 +0900 Subject: [PATCH 3/8] [cleanup] Remove dead extractors (#11566) - Removes MildomClipIE, MildomIE, MildomUserVodIE, MildomVodIE - Removes PokemonIE, PokemonWatchIE - Removes VeohIE, VeohUserIE Closes #3373, Closes #7059 Authored by: doe1080 --- yt_dlp/extractor/_extractors.py | 14 -- yt_dlp/extractor/mildom.py | 291 -------------------------------- yt_dlp/extractor/pokemon.py | 136 --------------- yt_dlp/extractor/veoh.py | 189 --------------------- 4 files changed, 630 deletions(-) delete mode 100644 yt_dlp/extractor/mildom.py delete mode 100644 yt_dlp/extractor/pokemon.py delete mode 100644 yt_dlp/extractor/veoh.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index af903f307b..0d849c1697 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1139,12 +1139,6 @@ from .microsoftembed import ( MicrosoftMediusIE, ) from .microsoftstream import MicrosoftStreamIE -from .mildom import ( - MildomClipIE, - MildomIE, - MildomUserVodIE, - MildomVodIE, -) from .minds import ( MindsChannelIE, MindsGroupIE, @@ -1563,10 +1557,6 @@ from .podbayfm import ( ) from .podchaser import PodchaserIE from .podomatic import PodomaticIE -from .pokemon import ( - PokemonIE, - PokemonWatchIE, -) from .pokergo import ( PokerGoCollectionIE, PokerGoIE, @@ -2288,10 +2278,6 @@ from .utreon import UtreonIE from .varzesh3 import Varzesh3IE from .vbox7 import Vbox7IE from .veo import VeoIE -from .veoh import ( - VeohIE, - VeohUserIE, -) from .vesti import VestiIE from .vevo import ( VevoIE, diff --git a/yt_dlp/extractor/mildom.py b/yt_dlp/extractor/mildom.py deleted file mode 100644 index 88a2b9e891..0000000000 --- a/yt_dlp/extractor/mildom.py +++ /dev/null @@ -1,291 +0,0 @@ -import functools -import json -import uuid - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - OnDemandPagedList, - determine_ext, - dict_get, - float_or_none, - traverse_obj, -) - - -class MildomBaseIE(InfoExtractor): - _GUEST_ID = None - - def _call_api(self, url, video_id, query=None, note='Downloading JSON metadata', body=None): - if not self._GUEST_ID: - self._GUEST_ID = f'pc-gp-{uuid.uuid4()}' - - content = self._download_json( - url, video_id, note=note, data=json.dumps(body).encode() if body else None, - headers={'Content-Type': 'application/json'} if body else {}, - query={ - '__guest_id': self._GUEST_ID, - '__platform': 'web', - **(query or {}), - }) - - if content['code'] != 0: - raise ExtractorError( - f'Mildom says: {content["message"]} (code {content["code"]})', - expected=True) - return content['body'] - - -class MildomIE(MildomBaseIE): - IE_NAME = 'mildom' - IE_DESC = 'Record ongoing live by specific user in Mildom' - _VALID_URL = r'https?://(?:(?:www|m)\.)mildom\.com/(?P\d+)' - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(f'https://www.mildom.com/{video_id}', video_id) - - enterstudio = self._call_api( - 'https://cloudac.mildom.com/nonolive/gappserv/live/enterstudio', video_id, - note='Downloading live metadata', query={'user_id': video_id}) - result_video_id = enterstudio.get('log_id', video_id) - - servers = self._call_api( - 'https://cloudac.mildom.com/nonolive/gappserv/live/liveserver', result_video_id, - note='Downloading live server list', query={ - 'user_id': video_id, - 'live_server_type': 'hls', - }) - - playback_token = self._call_api( - 'https://cloudac.mildom.com/nonolive/gappserv/live/token', result_video_id, - note='Obtaining live playback token', body={'host_id': video_id, 'type': 'hls'}) - playback_token = traverse_obj(playback_token, ('data', ..., 'token'), get_all=False) - if not playback_token: - raise ExtractorError('Failed to obtain live playback token') - - formats = self._extract_m3u8_formats( - f'{servers["stream_server"]}/{video_id}_master.m3u8?{playback_token}', - result_video_id, 'mp4', headers={ - 'Referer': 'https://www.mildom.com/', - 'Origin': 'https://www.mildom.com', - }) - - for fmt in formats: - fmt.setdefault('http_headers', {})['Referer'] = 'https://www.mildom.com/' - - return { - 'id': result_video_id, - 'title': self._html_search_meta('twitter:description', webpage, default=None) or traverse_obj(enterstudio, 'anchor_intro'), - 'description': traverse_obj(enterstudio, 'intro', 'live_intro', expected_type=str), - 'timestamp': float_or_none(enterstudio.get('live_start_ms'), scale=1000), - 'uploader': self._html_search_meta('twitter:title', webpage, default=None) or traverse_obj(enterstudio, 'loginname'), - 'uploader_id': video_id, - 'formats': formats, - 'is_live': True, - } - - -class MildomVodIE(MildomBaseIE): - IE_NAME = 'mildom:vod' - IE_DESC = 'VOD in Mildom' - _VALID_URL = r'https?://(?:(?:www|m)\.)mildom\.com/playback/(?P\d+)/(?P(?P=user_id)-[a-zA-Z0-9]+-?[0-9]*)' - _TESTS = [{ - 'url': 'https://www.mildom.com/playback/10882672/10882672-1597662269', - 'info_dict': { - 'id': '10882672-1597662269', - 'ext': 'mp4', - 'title': '始めてのミルダム配信じゃぃ!', - 'thumbnail': r're:^https?://.*\.(png|jpg)$', - 'upload_date': '20200817', - 'duration': 4138.37, - 'description': 'ゲームをしたくて!', - 'timestamp': 1597662269.0, - 'uploader_id': '10882672', - 'uploader': 'kson組長(けいそん)', - }, - }, { - 'url': 'https://www.mildom.com/playback/10882672/10882672-1597758589870-477', - 'info_dict': { - 'id': '10882672-1597758589870-477', - 'ext': 'mp4', - 'title': '【kson】感染メイズ!麻酔銃で無双する', - 'thumbnail': r're:^https?://.*\.(png|jpg)$', - 'timestamp': 1597759093.0, - 'uploader': 'kson組長(けいそん)', - 'duration': 4302.58, - 'uploader_id': '10882672', - 'description': 'このステージ絶対乗り越えたい', - 'upload_date': '20200818', - }, - }, { - 'url': 'https://www.mildom.com/playback/10882672/10882672-buha9td2lrn97fk2jme0', - 'info_dict': { - 'id': '10882672-buha9td2lrn97fk2jme0', - 'ext': 'mp4', - 'title': '【kson組長】CART RACER!!!', - 'thumbnail': r're:^https?://.*\.(png|jpg)$', - 'uploader_id': '10882672', - 'uploader': 'kson組長(けいそん)', - 'upload_date': '20201104', - 'timestamp': 1604494797.0, - 'duration': 4657.25, - 'description': 'WTF', - }, - }] - - def _real_extract(self, url): - user_id, video_id = self._match_valid_url(url).group('user_id', 'id') - webpage = self._download_webpage(f'https://www.mildom.com/playback/{user_id}/{video_id}', video_id) - - autoplay = self._call_api( - 'https://cloudac.mildom.com/nonolive/videocontent/playback/getPlaybackDetail', video_id, - note='Downloading playback metadata', query={ - 'v_id': video_id, - })['playback'] - - formats = [{ - 'url': autoplay['audio_url'], - 'format_id': 'audio', - 'protocol': 'm3u8_native', - 'vcodec': 'none', - 'acodec': 'aac', - 'ext': 'm4a', - }] - for fmt in autoplay['video_link']: - formats.append({ - 'format_id': 'video-{}'.format(fmt['name']), - 'url': fmt['url'], - 'protocol': 'm3u8_native', - 'width': fmt['level'] * autoplay['video_width'] // autoplay['video_height'], - 'height': fmt['level'], - 'vcodec': 'h264', - 'acodec': 'aac', - 'ext': 'mp4', - }) - - return { - 'id': video_id, - 'title': self._html_search_meta(('og:description', 'description'), webpage, default=None) or autoplay.get('title'), - 'description': traverse_obj(autoplay, 'video_intro'), - 'timestamp': float_or_none(autoplay.get('publish_time'), scale=1000), - 'duration': float_or_none(autoplay.get('video_length'), scale=1000), - 'thumbnail': dict_get(autoplay, ('upload_pic', 'video_pic')), - 'uploader': traverse_obj(autoplay, ('author_info', 'login_name')), - 'uploader_id': user_id, - 'formats': formats, - } - - -class MildomClipIE(MildomBaseIE): - IE_NAME = 'mildom:clip' - IE_DESC = 'Clip in Mildom' - _VALID_URL = r'https?://(?:(?:www|m)\.)mildom\.com/clip/(?P(?P\d+)-[a-zA-Z0-9]+)' - _TESTS = [{ - 'url': 'https://www.mildom.com/clip/10042245-63921673e7b147ebb0806d42b5ba5ce9', - 'info_dict': { - 'id': '10042245-63921673e7b147ebb0806d42b5ba5ce9', - 'title': '全然違ったよ', - 'timestamp': 1619181890, - 'duration': 59, - 'thumbnail': r're:https?://.+', - 'uploader': 'ざきんぽ', - 'uploader_id': '10042245', - }, - }, { - 'url': 'https://www.mildom.com/clip/10111524-ebf4036e5aa8411c99fb3a1ae0902864', - 'info_dict': { - 'id': '10111524-ebf4036e5aa8411c99fb3a1ae0902864', - 'title': 'かっこいい', - 'timestamp': 1621094003, - 'duration': 59, - 'thumbnail': r're:https?://.+', - 'uploader': '(ルーキー', - 'uploader_id': '10111524', - }, - }, { - 'url': 'https://www.mildom.com/clip/10660174-2c539e6e277c4aaeb4b1fbe8d22cb902', - 'info_dict': { - 'id': '10660174-2c539e6e277c4aaeb4b1fbe8d22cb902', - 'title': 'あ', - 'timestamp': 1614769431, - 'duration': 31, - 'thumbnail': r're:https?://.+', - 'uploader': 'ドルゴルスレンギーン=ダグワドルジ', - 'uploader_id': '10660174', - }, - }] - - def _real_extract(self, url): - user_id, video_id = self._match_valid_url(url).group('user_id', 'id') - webpage = self._download_webpage(f'https://www.mildom.com/clip/{video_id}', video_id) - - clip_detail = self._call_api( - 'https://cloudac-cf-jp.mildom.com/nonolive/videocontent/clip/detail', video_id, - note='Downloading playback metadata', query={ - 'clip_id': video_id, - }) - - return { - 'id': video_id, - 'title': self._html_search_meta( - ('og:description', 'description'), webpage, default=None) or clip_detail.get('title'), - 'timestamp': float_or_none(clip_detail.get('create_time')), - 'duration': float_or_none(clip_detail.get('length')), - 'thumbnail': clip_detail.get('cover'), - 'uploader': traverse_obj(clip_detail, ('user_info', 'loginname')), - 'uploader_id': user_id, - - 'url': clip_detail['url'], - 'ext': determine_ext(clip_detail.get('url'), 'mp4'), - } - - -class MildomUserVodIE(MildomBaseIE): - IE_NAME = 'mildom:user:vod' - IE_DESC = 'Download all VODs from specific user in Mildom' - _VALID_URL = r'https?://(?:(?:www|m)\.)mildom\.com/profile/(?P\d+)' - _TESTS = [{ - 'url': 'https://www.mildom.com/profile/10093333', - 'info_dict': { - 'id': '10093333', - 'title': 'Uploads from ねこばたけ', - }, - 'playlist_mincount': 732, - }, { - 'url': 'https://www.mildom.com/profile/10882672', - 'info_dict': { - 'id': '10882672', - 'title': 'Uploads from kson組長(けいそん)', - }, - 'playlist_mincount': 201, - }] - - def _fetch_page(self, user_id, page): - page += 1 - reply = self._call_api( - 'https://cloudac.mildom.com/nonolive/videocontent/profile/playbackList', - user_id, note=f'Downloading page {page}', query={ - 'user_id': user_id, - 'page': page, - 'limit': '30', - }) - if not reply: - return - for x in reply: - v_id = x.get('v_id') - if not v_id: - continue - yield self.url_result(f'https://www.mildom.com/playback/{user_id}/{v_id}') - - def _real_extract(self, url): - user_id = self._match_id(url) - self.to_screen(f'This will download all VODs belonging to user. To download ongoing live video, use "https://www.mildom.com/{user_id}" instead') - - profile = self._call_api( - 'https://cloudac.mildom.com/nonolive/gappserv/user/profileV2', user_id, - query={'user_id': user_id}, note='Downloading user profile')['user_info'] - - return self.playlist_result( - OnDemandPagedList(functools.partial(self._fetch_page, user_id), 30), - user_id, f'Uploads from {profile["loginname"]}') diff --git a/yt_dlp/extractor/pokemon.py b/yt_dlp/extractor/pokemon.py deleted file mode 100644 index 1769684f72..0000000000 --- a/yt_dlp/extractor/pokemon.py +++ /dev/null @@ -1,136 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - extract_attributes, - int_or_none, - js_to_json, - merge_dicts, -) - - -class PokemonIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?pokemon\.com/[a-z]{2}(?:.*?play=(?P[a-z0-9]{32})|/(?:[^/]+/)+(?P[^/?#&]+))' - _TESTS = [{ - 'url': 'https://www.pokemon.com/us/pokemon-episodes/20_30-the-ol-raise-and-switch/', - 'md5': '2fe8eaec69768b25ef898cda9c43062e', - 'info_dict': { - 'id': 'afe22e30f01c41f49d4f1d9eab5cd9a4', - 'ext': 'mp4', - 'title': 'The Ol’ Raise and Switch!', - 'description': 'md5:7db77f7107f98ba88401d3adc80ff7af', - }, - 'add_id': ['LimelightMedia'], - }, { - # no data-video-title - 'url': 'https://www.pokemon.com/fr/episodes-pokemon/films-pokemon/pokemon-lascension-de-darkrai-2008', - 'info_dict': { - 'id': 'dfbaf830d7e54e179837c50c0c6cc0e1', - 'ext': 'mp4', - 'title': "Pokémon : L'ascension de Darkrai", - 'description': 'md5:d1dbc9e206070c3e14a06ff557659fb5', - }, - 'add_id': ['LimelightMedia'], - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://www.pokemon.com/uk/pokemon-episodes/?play=2e8b5c761f1d4a9286165d7748c1ece2', - 'only_matching': True, - }, { - 'url': 'http://www.pokemon.com/fr/episodes-pokemon/18_09-un-hiver-inattendu/', - 'only_matching': True, - }, { - 'url': 'http://www.pokemon.com/de/pokemon-folgen/01_20-bye-bye-smettbo/', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id, display_id = self._match_valid_url(url).groups() - webpage = self._download_webpage(url, video_id or display_id) - video_data = extract_attributes(self._search_regex( - r'(<[^>]+data-video-id="{}"[^>]*>)'.format(video_id if video_id else '[a-z0-9]{32}'), - webpage, 'video data element')) - video_id = video_data['data-video-id'] - title = video_data.get('data-video-title') or self._html_search_meta( - 'pkm-title', webpage, ' title', default=None) or self._search_regex( - r']+\bclass=["\']us-title[^>]+>([^<]+)', webpage, 'title') - return { - '_type': 'url_transparent', - 'id': video_id, - 'url': f'limelight:media:{video_id}', - 'title': title, - 'description': video_data.get('data-video-summary'), - 'thumbnail': video_data.get('data-video-poster'), - 'series': 'Pokémon', - 'season_number': int_or_none(video_data.get('data-video-season')), - 'episode': title, - 'episode_number': int_or_none(video_data.get('data-video-episode')), - 'ie_key': 'LimelightMedia', - } - - -class PokemonWatchIE(InfoExtractor): - _VALID_URL = r'https?://watch\.pokemon\.com/[a-z]{2}-[a-z]{2}/(?:#/)?player(?:\.html)?\?id=(?P[a-z0-9]{32})' - _API_URL = 'https://www.pokemon.com/api/pokemontv/v2/channels/{0:}' - _TESTS = [{ - 'url': 'https://watch.pokemon.com/en-us/player.html?id=8309a40969894a8e8d5bc1311e9c5667', - 'md5': '62833938a31e61ab49ada92f524c42ff', - 'info_dict': { - 'id': '8309a40969894a8e8d5bc1311e9c5667', - 'ext': 'mp4', - 'title': 'Lillier and the Staff!', - 'description': 'md5:338841b8c21b283d24bdc9b568849f04', - }, - }, { - 'url': 'https://watch.pokemon.com/en-us/#/player?id=3fe7752ba09141f0b0f7756d1981c6b2', - 'only_matching': True, - }, { - 'url': 'https://watch.pokemon.com/de-de/player.html?id=b3c402e111a4459eb47e12160ab0ba07', - 'only_matching': True, - }] - - def _extract_media(self, channel_array, video_id): - for channel in channel_array: - for media in channel.get('media'): - if media.get('id') == video_id: - return media - return None - - def _real_extract(self, url): - video_id = self._match_id(url) - - info = { - '_type': 'url', - 'id': video_id, - 'url': f'limelight:media:{video_id}', - 'ie_key': 'LimelightMedia', - } - - # API call can be avoided entirely if we are listing formats - if self.get_param('listformats', False): - return info - - webpage = self._download_webpage(url, video_id) - build_vars = self._parse_json(self._search_regex( - r'(?s)buildVars\s*=\s*({.*?})', webpage, 'build vars'), - video_id, transform_source=js_to_json) - region = build_vars.get('region') - channel_array = self._download_json(self._API_URL.format(region), video_id) - video_data = self._extract_media(channel_array, video_id) - - if video_data is None: - raise ExtractorError( - f'Video {video_id} does not exist', expected=True) - - info['_type'] = 'url_transparent' - images = video_data.get('images') - - return merge_dicts(info, { - 'title': video_data.get('title'), - 'description': video_data.get('description'), - 'thumbnail': images.get('medium') or images.get('small'), - 'series': 'Pokémon', - 'season_number': int_or_none(video_data.get('season')), - 'episode': video_data.get('title'), - 'episode_number': int_or_none(video_data.get('episode')), - }) diff --git a/yt_dlp/extractor/veoh.py b/yt_dlp/extractor/veoh.py deleted file mode 100644 index aac768f3c6..0000000000 --- a/yt_dlp/extractor/veoh.py +++ /dev/null @@ -1,189 +0,0 @@ -import functools -import json - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - OnDemandPagedList, - int_or_none, - parse_duration, - qualities, - remove_start, - strip_or_none, -) - - -class VeohIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?veoh\.com/(?:watch|videos|embed|iphone/#_Watch)/(?P(?:v|e|yapi-)[\da-zA-Z]+)' - - _TESTS = [{ - 'url': 'http://www.veoh.com/watch/v56314296nk7Zdmz3', - 'md5': '620e68e6a3cff80086df3348426c9ca3', - 'info_dict': { - 'id': 'v56314296nk7Zdmz3', - 'ext': 'mp4', - 'title': 'Straight Backs Are Stronger', - 'description': 'md5:203f976279939a6dc664d4001e13f5f4', - 'thumbnail': 're:https://fcache\\.veoh\\.com/file/f/th56314296\\.jpg(\\?.*)?', - 'uploader': 'LUMOback', - 'duration': 46, - 'view_count': int, - 'average_rating': int, - 'comment_count': int, - 'age_limit': 0, - 'categories': ['technology_and_gaming'], - 'tags': ['posture', 'posture', 'sensor', 'back', 'pain', 'wearable', 'tech', 'lumo'], - }, - }, { - 'url': 'http://www.veoh.com/embed/v56314296nk7Zdmz3', - 'only_matching': True, - }, { - 'url': 'http://www.veoh.com/watch/v27701988pbTc4wzN?h1=Chile+workers+cover+up+to+avoid+skin+damage', - 'md5': '4a6ff84b87d536a6a71e6aa6c0ad07fa', - 'info_dict': { - 'id': '27701988', - 'ext': 'mp4', - 'title': 'Chile workers cover up to avoid skin damage', - 'description': 'md5:2bd151625a60a32822873efc246ba20d', - 'uploader': 'afp-news', - 'duration': 123, - }, - 'skip': 'This video has been deleted.', - }, { - 'url': 'http://www.veoh.com/watch/v69525809F6Nc4frX', - 'md5': '4fde7b9e33577bab2f2f8f260e30e979', - 'note': 'Embedded ooyala video', - 'info_dict': { - 'id': '69525809', - 'ext': 'mp4', - 'title': 'Doctors Alter Plan For Preteen\'s Weight Loss Surgery', - 'description': 'md5:f5a11c51f8fb51d2315bca0937526891', - 'uploader': 'newsy-videos', - }, - 'skip': 'This video has been deleted.', - }, { - 'url': 'http://www.veoh.com/watch/e152215AJxZktGS', - 'only_matching': True, - }, { - 'url': 'https://www.veoh.com/videos/v16374379WA437rMH', - 'md5': 'cceb73f3909063d64f4b93d4defca1b3', - 'info_dict': { - 'id': 'v16374379WA437rMH', - 'ext': 'mp4', - 'title': 'Phantasmagoria 2, pt. 1-3', - 'description': 'Phantasmagoria: a Puzzle of Flesh', - 'thumbnail': 're:https://fcache\\.veoh\\.com/file/f/th16374379\\.jpg(\\?.*)?', - 'uploader': 'davidspackage', - 'duration': 968, - 'view_count': int, - 'average_rating': int, - 'comment_count': int, - 'age_limit': 18, - 'categories': ['technology_and_gaming', 'gaming'], - 'tags': ['puzzle', 'of', 'flesh'], - }, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - metadata = self._download_json( - 'https://www.veoh.com/watch/getVideo/' + video_id, - video_id) - video = metadata['video'] - title = video['title'] - - thumbnail_url = None - q = qualities(['Regular', 'HQ']) - formats = [] - for f_id, f_url in video.get('src', {}).items(): - if not f_url: - continue - if f_id == 'poster': - thumbnail_url = f_url - else: - formats.append({ - 'format_id': f_id, - 'quality': q(f_id), - 'url': f_url, - }) - - categories = metadata.get('categoryPath') - if not categories: - category = remove_start(strip_or_none(video.get('category')), 'category_') - categories = [category] if category else None - tags = video.get('tags') - - return { - 'id': video_id, - 'title': title, - 'description': video.get('description'), - 'thumbnail': thumbnail_url, - 'uploader': video.get('author', {}).get('nickname'), - 'duration': int_or_none(video.get('lengthBySec')) or parse_duration(video.get('length')), - 'view_count': int_or_none(video.get('views')), - 'formats': formats, - 'average_rating': int_or_none(video.get('rating')), - 'comment_count': int_or_none(video.get('numOfComments')), - 'age_limit': 18 if video.get('contentRatingId') == 2 else 0, - 'categories': categories, - 'tags': tags.split(', ') if tags else None, - } - - -class VeohUserIE(VeohIE): # XXX: Do not subclass from concrete IE - _VALID_URL = r'https?://(?:www\.)?veoh\.com/users/(?P[\w-]+)' - IE_NAME = 'veoh:user' - - _TESTS = [ - { - 'url': 'https://www.veoh.com/users/valentinazoe', - 'info_dict': { - 'id': 'valentinazoe', - 'title': 'valentinazoe (Uploads)', - }, - 'playlist_mincount': 75, - }, - { - 'url': 'https://www.veoh.com/users/PiensaLibre', - 'info_dict': { - 'id': 'PiensaLibre', - 'title': 'PiensaLibre (Uploads)', - }, - 'playlist_mincount': 2, - }] - - _PAGE_SIZE = 16 - - def _fetch_page(self, uploader, page): - response = self._download_json( - 'https://www.veoh.com/users/published/videos', uploader, - note=f'Downloading videos page {page + 1}', - headers={ - 'x-csrf-token': self._TOKEN, - 'content-type': 'application/json;charset=UTF-8', - }, - data=json.dumps({ - 'username': uploader, - 'maxResults': self._PAGE_SIZE, - 'page': page + 1, - 'requestName': 'userPage', - }).encode()) - if not response.get('success'): - raise ExtractorError(response['message']) - - for video in response['videos']: - yield self.url_result(f'https://www.veoh.com/watch/{video["permalinkId"]}', VeohIE, - video['permalinkId'], video.get('title')) - - def _real_initialize(self): - webpage = self._download_webpage( - 'https://www.veoh.com', None, note='Downloading authorization token') - self._TOKEN = self._search_regex( - r'csrfToken:\s*(["\'])(?P[0-9a-zA-Z]{40})\1', webpage, - 'request token', group='token') - - def _real_extract(self, url): - uploader = self._match_id(url) - return self.playlist_result(OnDemandPagedList( - functools.partial(self._fetch_page, uploader), - self._PAGE_SIZE), uploader, f'{uploader} (Uploads)') From d867f99622ef7fba690b08da56c39d739b822bb7 Mon Sep 17 00:00:00 2001 From: ChocoLZS <61224208+ChocoLZS@users.noreply.github.com> Date: Mon, 18 Nov 2024 02:41:57 +0800 Subject: [PATCH 4/8] [ie/PiaLive] Add extractor (#10811) Authored by: ChocoLZS --- yt_dlp/extractor/_extractors.py | 6 +- yt_dlp/extractor/pialive.py | 122 +++++++++++++++++++++++++++++ yt_dlp/extractor/piaulizaportal.py | 70 ----------------- yt_dlp/extractor/uliza.py | 113 ++++++++++++++++++++++++++ 4 files changed, 240 insertions(+), 71 deletions(-) create mode 100644 yt_dlp/extractor/pialive.py delete mode 100644 yt_dlp/extractor/piaulizaportal.py create mode 100644 yt_dlp/extractor/uliza.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 0d849c1697..967010826e 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1520,8 +1520,8 @@ from .pgatour import PGATourIE from .philharmoniedeparis import PhilharmonieDeParisIE from .phoenix import PhoenixIE from .photobucket import PhotobucketIE +from .pialive import PiaLiveIE from .piapro import PiaproIE -from .piaulizaportal import PIAULIZAPortalIE from .picarto import ( PicartoIE, PicartoVodIE, @@ -2250,6 +2250,10 @@ from .ufctv import ( ) from .ukcolumn import UkColumnIE from .uktvplay import UKTVPlayIE +from .uliza import ( + UlizaPlayerIE, + UlizaPortalIE, +) from .umg import UMGDeIE from .unistra import UnistraIE from .unity import UnityIE diff --git a/yt_dlp/extractor/pialive.py b/yt_dlp/extractor/pialive.py new file mode 100644 index 0000000000..7469135c1b --- /dev/null +++ b/yt_dlp/extractor/pialive.py @@ -0,0 +1,122 @@ +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + clean_html, + extract_attributes, + get_element_by_class, + get_element_html_by_class, + multipart_encode, + str_or_none, + unified_timestamp, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class PiaLiveIE(InfoExtractor): + _VALID_URL = r'https?://player\.pia-live\.jp/stream/(?P[\w-]+)' + _PLAYER_ROOT_URL = 'https://player.pia-live.jp/' + _PIA_LIVE_API_URL = 'https://api.pia-live.jp' + _API_KEY = 'kfds)FKFps-dms9e' + _TESTS = [{ + 'url': 'https://player.pia-live.jp/stream/4JagFBEIM14s_hK9aXHKf3k3F3bY5eoHFQxu68TC6krUDqGOwN4d61dCWQYOd6CTxl4hjya9dsfEZGsM4uGOUdax60lEI4twsXGXf7crmz8Gk__GhupTrWxA7RFRVt76', + 'info_dict': { + 'id': '88f3109a-f503-4d0f-a9f7-9f39ac745d84', + 'display_id': '2431867_001', + 'title': 'こながめでたい日2024の視聴ページ | PIA LIVE STREAM(ぴあライブストリーム)', + 'live_status': 'was_live', + 'comment_count': int, + }, + 'params': { + 'getcomments': True, + 'skip_download': True, + 'ignore_no_formats_error': True, + }, + 'skip': 'The video is no longer available', + }, { + 'url': 'https://player.pia-live.jp/stream/4JagFBEIM14s_hK9aXHKf3k3F3bY5eoHFQxu68TC6krJdu0GVBVbVy01IwpJ6J3qBEm3d9TCTt1d0eWpsZGj7DrOjVOmS7GAWGwyscMgiThopJvzgWC4H5b-7XQjAfRZ', + 'info_dict': { + 'id': '9ce8b8ba-f6d1-4d1f-83a0-18c3148ded93', + 'display_id': '2431867_002', + 'title': 'こながめでたい日2024の視聴ページ | PIA LIVE STREAM(ぴあライブストリーム)', + 'live_status': 'was_live', + 'comment_count': int, + }, + 'params': { + 'getcomments': True, + 'skip_download': True, + 'ignore_no_formats_error': True, + }, + 'skip': 'The video is no longer available', + }] + + def _extract_var(self, variable, html): + return self._search_regex( + rf'(?:var|const|let)\s+{variable}\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', + html, f'variable {variable}', group='value') + + def _real_extract(self, url): + video_key = self._match_id(url) + webpage = self._download_webpage(url, video_key) + + program_code = self._extract_var('programCode', webpage) + article_code = self._extract_var('articleCode', webpage) + title = self._html_extract_title(webpage) + + if get_element_html_by_class('play-end', webpage): + raise ExtractorError('The video is no longer available', expected=True, video_id=program_code) + + if start_info := clean_html(get_element_by_class('play-waiting__date', webpage)): + date, time = self._search_regex( + r'(?P\d{4}/\d{1,2}/\d{1,2})\([月火水木金土日]\)(?P