[ie/kenh14] Simplify

This commit is contained in:
Mozi 2024-10-23 16:14:53 +00:00 committed by bashonly
parent 38ed7e7d0d
commit 09d0a29721
No known key found for this signature in database
GPG Key ID: 783F096F253D15B0
2 changed files with 114 additions and 281 deletions

View File

@ -942,6 +942,10 @@ from .kaltura import KalturaIE
from .kankanews import KankaNewsIE from .kankanews import KankaNewsIE
from .karaoketv import KaraoketvIE from .karaoketv import KaraoketvIE
from .kelbyone import KelbyOneIE from .kelbyone import KelbyOneIE
from .kenh14 import (
Kenh14PlaylistIE,
Kenh14VideoIE,
)
from .khanacademy import ( from .khanacademy import (
KhanAcademyIE, KhanAcademyIE,
KhanAcademyUnitIE, KhanAcademyUnitIE,

View File

@ -1,321 +1,150 @@
import urllib.parse
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
clean_html, clean_html,
extract_attributes, extract_attributes,
get_element_by_class, get_element_by_class,
get_element_html_by_attribute, get_element_html_by_attribute,
get_elements_by_attribute, get_elements_html_by_class,
get_elements_html_by_attribute,
int_or_none, int_or_none,
parse_duration, parse_duration,
parse_iso8601, parse_iso8601,
remove_end,
remove_start, remove_start,
str_or_none,
strip_or_none,
unescapeHTML,
url_or_none,
) )
from ..utils.traversal import traverse_obj
class Kenh14VideoIE(InfoExtractor):
    """Extractor for single video pages on ``video.kenh14.vn``.

    The media location is read from the ``data-vid`` attribute of the page's
    ``type="VideoStream"`` element; additional metadata (title, duration,
    uploader, upload time, view count) comes from the kinghub API and is
    treated as optional.
    """

    _VALID_URL = r'https?://video\.kenh14\.vn/(?:video/)?[\w-]+-(?P<id>[0-9]+)\.chn'
    _TESTS = [{
        'url': 'https://video.kenh14.vn/video/mo-hop-iphone-14-pro-max-nguon-unbox-therapy-316173.chn',
        'md5': '1ed67f9c3a1e74acf15db69590cf6210',
        'info_dict': {
            'id': '316173',
            'ext': 'mp4',
            'title': 'Video mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)',
            'description': 'Video mở hộp iPhone 14 Pro MaxVideo mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)',
            'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$',
            'tags': [],
            'uploader': 'Unbox Therapy',
            'upload_date': '20220517',
            'view_count': int,
            'duration': 722.86,
            'timestamp': 1652764468,
        },
    }, {
        'url': 'https://video.kenh14.vn/video-316174.chn',
        'md5': '2b41877d2afaf4a3f487ceda8e5c7cbd',
        'info_dict': {
            'id': '316174',
            'ext': 'mp4',
            'title': 'Khoảnh khắc VĐV nằm gục khóc sau chiến thắng: 7 năm trời Việt Nam mới có HCV kiếm chém nữ, chỉ có 8 tháng để khổ luyện trước khi lên sàn đấu',
            'description': 'md5:de86aa22e143e2b277bce8ec9c6f17dc',
            'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$',
            'tags': [],
            'upload_date': '20220517',
            'view_count': int,
            'duration': 70.04,
            'timestamp': 1652766021,
        },
    }, {
        'url': 'https://video.kenh14.vn/0-344740.chn',
        'md5': 'b843495d5e728142c8870c09b46df2a9',
        'info_dict': {
            'id': '344740',
            'ext': 'mov',
            'title': 'Kỳ Duyên đầy căng thẳng trong buổi ra quân đi Miss Universe, nghi thức tuyên thuệ lần đầu xuất hiện gây nhiều tranh cãi',
            'description': 'md5:2a2dbb4a7397169fb21ee68f09160497',
            'thumbnail': r're:^https?://kenh14cdn\.com/.*\.jpg$',
            'tags': ['kỳ duyên', 'Kỳ Duyên tuyên thuệ', 'miss universe'],
            'uploader': 'Quang Vũ',
            'upload_date': '20241024',
            'view_count': int,
            'duration': 198.88,
            'timestamp': 1729741590,
        },
        'expected_warnings': [
            'Failed to download m3u8 information: HTTP Error 404: NOT FOUND',
        ],
    }]

    def _real_extract(self, url):
        """Build the info dict for the video page at *url*.

        Raises ``KeyError`` when the player element carries no ``data-vid``
        attribute, since no format can be built without it.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # The player element may be absent; ``or ''`` keeps extract_attributes
        # from receiving None and yields an empty attribute dict instead.
        attrs = extract_attributes(get_element_html_by_attribute('type', 'VideoStream', webpage) or '')
        # Scheme-less media URL; mandatory, hence the hard subscript.
        direct_url = attrs['data-vid']

        # The API lookup is best-effort (fatal=False). Normalise a failed,
        # falsy result to an empty dict so the .get() calls below fall through
        # to the webpage-derived values instead of raising AttributeError.
        metadata = self._download_json(
            'https://api.kinghub.vn/video/api/v1/detailVideoByGet?FileName={}'.format(
                remove_start(direct_url, 'kenh14cdn.com/')), video_id, fatal=False) or {}

        return {
            'id': video_id,
            # Prefer the API title, then OpenGraph, then the on-page heading.
            'title': (
                strip_or_none(metadata.get('title'))
                or clean_html(self._og_search_title(webpage))
                or clean_html(get_element_by_class('vdbw-title', webpage))),
            'formats': [
                # Direct progressive file, always offered.
                {'url': f'https://{direct_url}', 'format_id': 'http'},
                # HLS variants are optional; a missing manifest only warns.
                *self._extract_m3u8_formats(f'https://{direct_url}/master.m3u8', video_id, fatal=False),
            ],
            'duration': parse_duration(metadata.get('duration')),
            'description': (
                clean_html(self._og_search_description(webpage))
                or clean_html(get_element_by_class('vdbw-sapo', webpage))),
            'thumbnail': (self._og_search_thumbnail(webpage) or attrs.get('data-thumb')),
            'uploader': strip_or_none(metadata.get('author')),
            # 'uploadtime' is parsed with a space between date and time.
            'timestamp': parse_iso8601(metadata.get('uploadtime'), delimiter=' '),
            'view_count': int_or_none(metadata.get('views')),
            # <meta name="keywords"> is split on ';' and empty items dropped.
            'tags': traverse_obj(self._html_search_meta('keywords', webpage), (
                {lambda x: x.split(';')}, lambda _, v: v, {str_or_none})),
        }
class Kenh14PlaylistIE(InfoExtractor):
    """Extractor for playlist pages under ``video.kenh14.vn/playlist/``.

    Every ``video-item`` element on the page becomes a URL entry that is
    delegated to ``Kenh14VideoIE``.
    """

    _VALID_URL = r'https?://video\.kenh14\.vn/playlist/[\w-]+-(?P<id>[0-9]+)\.chn'
    _TESTS = [{
        'url': 'https://video.kenh14.vn/playlist/tran-tinh-naked-love-mua-2-71.chn',
        'info_dict': {
            'id': '71',
            'title': 'Trần Tình (Naked love) mùa 2',
            'description': 'md5:e9522339304956dea931722dd72eddb2',
            'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$',
        },
        'playlist_count': 9,
    }, {
        'url': 'https://video.kenh14.vn/playlist/0-72.chn',
        'info_dict': {
            'id': '72',
            'title': 'Lau Lại Đầu Từ',
            'description': 'Cùng xem xưa và nay có gì khác biệt nhé!',
            'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$',
        },
        'playlist_count': 6,
    }]

    def _real_extract(self, url):
        """Return a playlist result for the playlist page at *url*.

        Title and description are taken from the ``category-detail`` element,
        falling back to the ``name`` / ``alternateName`` of a JSON-LD object.
        Raises ``KeyError`` if a ``video-item`` element has no ``data-id``.
        """
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)

        # Both lookups may find nothing; normalise to empty values so the
        # fallback chain below degrades to None instead of raising.
        category_detail = get_element_by_class('category-detail', webpage) or ''
        embed_info = traverse_obj(
            self._yield_json_ld(webpage, playlist_id),
            (lambda _, v: v['name'] and v['alternateName']), get_all=False) or {}

        return self.playlist_from_matches(
            get_elements_html_by_class('video-item', webpage), playlist_id,
            (clean_html(get_element_by_class('name', category_detail)) or unescapeHTML(embed_info.get('name'))),
            # The entry URL must satisfy Kenh14VideoIE._VALID_URL, which needs
            # a '<slug>-' prefix before the numeric id; 'video-<id>.chn' does.
            getter=lambda x: 'https://video.kenh14.vn/video-{}.chn'.format(extract_attributes(x)['data-id']),
            ie=Kenh14VideoIE.ie_key(), playlist_description=(
                clean_html(get_element_by_class('description', category_detail))
                or unescapeHTML(embed_info.get('alternateName'))),
            # Drop the query string from the OpenGraph thumbnail URL.
            thumbnail=traverse_obj(
                self._og_search_thumbnail(webpage),
                ({url_or_none}, {lambda x: urllib.parse.urlunparse(urllib.parse.urlparse(x)._replace(query=''))})))