import secrets

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    str_or_none,
    traverse_obj,
    url_or_none,
)


class EggsBaseIE(InfoExtractor):
    """Shared helpers for the eggs.mu extractors (app API access and parsing)."""

    # Headers sent with every app-front-api request; a per-request
    # ``deviceId`` is added by ``_download_eggs_json``.
    _API_HEADERS = {
        'Accept': '*/*',
        'apVersion': '8.2.00',
        'deviceName': 'Android',
    }

    # ``sourceType`` value for which extraction is delegated to the YouTube
    # extractor (when a ``youtubeUrl`` is also present).
    _SOURCE_TYPE_YOUTUBE = 2

    @staticmethod
    def _generate_random_device_id():
        """Return a random 16-hex-digit string used as the ``deviceId`` header."""
        return secrets.token_hex(8)

    def _download_eggs_json(self, url, music_id):
        """Download and parse JSON from *url* using the API headers.

        *music_id* is only used as the ``video_id`` for logging.
        """
        headers = {
            **self._API_HEADERS,
            'deviceId': self._generate_random_device_id(),
        }
        return self._download_json(url, video_id=music_id, headers=headers)

    @classmethod
    def _is_youtube_track(cls, music_info):
        """Return True if *music_info* should be delegated to YouTube.

        Uses ``.get`` because ``traverse_obj`` omits keys whose value
        resolved to ``None``, so neither key is guaranteed to exist.
        """
        return (
            music_info.get('source_type') == cls._SOURCE_TYPE_YOUTUBE
            and bool(music_info.get('youtube_url')))

    def _build_result(self, music_info):
        """Turn a parsed *music_info* dict into an extraction result.

        YouTube-backed tracks become a ``url_result`` for the YouTube
        extractor; everything else is returned unchanged.
        """
        if self._is_youtube_track(music_info):
            return self.url_result(
                music_info['youtube_url'], ie='Youtube',
                video_id=music_info.get('youtube_id'))
        return music_info

    def _extract_music_info(self, data, song_id, fatal=True):
        """Parse one music object from the API into an info dict.

        @param data     The music JSON object.
        @param song_id  Fallback used when the object has no ``musicId``.
        @param fatal    If True (default), raise when the track has neither
                        a direct audio URL nor a YouTube source; if False,
                        return None instead so callers can skip the track.
        @raises ExtractorError  Only when *fatal* is true and the track has
                                no playable source.
        """
        music_info = traverse_obj(data, {
            'id': ('musicId', {str_or_none}, {lambda x: x or song_id}),
            'title': ('musicTitle', {str}, {lambda x: x or 'Unknown Title'}),
            'url': ('musicDataPath', {url_or_none}),
            'uploader': ('artist', 'displayName', {str}, {lambda x: x or 'Unknown Artist'}),
            'thumbnail': ('imageDataPath', {url_or_none}),
            'youtube_url': ('youtubeUrl', {url_or_none}),
            'youtube_id': ('youtubeVideoId', {str_or_none}),
            'source_type': ('sourceType', {int}),
            'vcodec': (None, {lambda x: 'none'}),
        }, get_all=False)

        if not music_info.get('url') and not self._is_youtube_track(music_info):
            if fatal:
                raise ExtractorError(
                    'Audio URL not found (possibly an unsupported sourceType)', expected=True)
            return None

        return music_info


class EggsIE(EggsBaseIE):
    """Extractor for a single eggs.mu song page."""

    IE_NAME = 'eggs:single'
    _VALID_URL = r'https?://eggs\.mu/artist/[^/]+/song/(?P<song_id>[^/]+)'

    _TESTS = [{
        'url': 'https://eggs.mu/artist/32_sunny_girl/song/0e95fd1d-4d61-4d5b-8b18-6092c551da90',
        'info_dict': {
            # NOTE(review): this entry sits on a line between the two diff
            # hunks and was not visible; value assumed from the URL - confirm.
            'id': '0e95fd1d-4d61-4d5b-8b18-6092c551da90',
            'ext': 'm4a',
            'title': 'シネマと信号',
            'uploader': 'Sunny Girl',
            'source_type': 1,
            'thumbnail': r're:https?://.*\.jpg(?:\?.*)?$',
        },
    }, {
        'url': 'https://eggs.mu/artist/KAMO_3pband/song/1d4bc45f-1af6-47a9-8b30-a70cae350b4f',
        'info_dict': {
            'id': '80cLKA2wnoA',
            'ext': 'mp4',
            'title': 'KAMO「いい女だから」Audio',
            'uploader': 'KAMO',
            'live_status': 'not_live',
            'channel_id': 'UCsHLBw2__5Q9y55skXPotOg',
            'channel_follower_count': int,
            'description': 'md5:d260da711ecbec3e720293dc11401b87',
            'availability': 'public',
            'uploader_id': '@KAMO_band',
            'upload_date': '20240925',
            'thumbnail': 'https://i.ytimg.com/vi/80cLKA2wnoA/maxresdefault.jpg',
            'comment_count': int,
            'channel_url': 'https://www.youtube.com/channel/UCsHLBw2__5Q9y55skXPotOg',
            'view_count': int,
            'duration': 151,
            'like_count': int,
            'channel': 'KAMO',
            'playable_in_embed': True,
            'uploader_url': 'https://www.youtube.com/@KAMO_band',
            'tags': [],
            'timestamp': 1727271121,
            'age_limit': 0,
            'categories': ['People & Blogs'],
        },
        'add_ie': ['Youtube'],
        'params': {'skip_download': 'Youtube'},
    }]

    def _real_extract(self, url):
        song_id = self._match_valid_url(url).group('song_id')
        json_data = self._download_eggs_json(
            f'https://app-front-api.eggs.mu/v1/musics/{song_id}', music_id=song_id)
        return self._build_result(self._extract_music_info(json_data, song_id))


class EggsArtistIE(EggsBaseIE):
    """Extractor for an eggs.mu artist page, returned as a playlist of tracks."""

    IE_NAME = 'eggs:artist'
    _VALID_URL = r'https?://eggs\.mu/artist/(?P<artist_id>[^/]+)$'

    _TESTS = [
        {
            'url': 'https://eggs.mu/artist/32_sunny_girl',
            'info_dict': {
                'id': '32_sunny_girl',
                'title': 'Sunny Girl',
            },
            'playlist_mincount': 18,
        },
        {
            'url': 'https://eggs.mu/artist/KAMO_3pband',
            'info_dict': {
                'id': 'KAMO_3pband',
                'title': 'KAMO',
            },
            'playlist_mincount': 2,
        },
    ]

    def _real_extract(self, url):
        artist_id = self._match_valid_url(url).group('artist_id')
        json_data = self._download_eggs_json(
            f'https://app-front-api.eggs.mu/v1/artists/{artist_id}/musics', music_id=artist_id)

        entries = []
        display_name = None

        # Branch over the list and keep only dict items so an unexpected
        # payload shape yields an empty playlist instead of garbage entries.
        for item in traverse_obj(json_data, ('data', ..., {dict})):
            # Non-fatal: one unplayable track must not abort the playlist.
            music_info = self._extract_music_info(item, '', fatal=False)
            if not music_info:
                self.report_warning(
                    'Skipping a track without a playable source', video_id=artist_id)
                continue
            if not music_info.get('id'):
                continue

            # The playlist title is the first track's artist display name.
            display_name = display_name or music_info.get('uploader')
            entries.append(self._build_result(music_info))

        return self.playlist_result(
            entries, playlist_id=artist_id,
            playlist_title=display_name or artist_id)