diff --git a/yt_dlp/extractor/soundcloud.py b/yt_dlp/extractor/soundcloud.py index a0f302f44c..43bcadfb3a 100644 --- a/yt_dlp/extractor/soundcloud.py +++ b/yt_dlp/extractor/soundcloud.py @@ -378,56 +378,30 @@ class SoundcloudBaseIE(InfoExtractor): if info.get('policy') == 'BLOCK': self.raise_geo_restricted(metadata_available=True) - user = info.get('user') or {} - - thumbnails = [] - artwork_url = info.get('artwork_url') - thumbnail = artwork_url or user.get('avatar_url') - if url_or_none(thumbnail): - if mobj := re.search(self._IMAGE_REPL_RE, thumbnail): - for image_id, size in self._ARTWORK_MAP.items(): - # Soundcloud serves JPEG regardless of URL's ext *except* for "original" thumb - ext = mobj.group('ext') if image_id == 'original' else 'jpg' - i = { - 'id': image_id, - 'url': re.sub(self._IMAGE_REPL_RE, f'-{image_id}.{ext}', thumbnail), - } - if image_id == 'tiny' and not artwork_url: - size = 18 - elif image_id == 'original': - i['preference'] = 10 - if size: - i.update({ - 'width': size, - 'height': size, - }) - thumbnails.append(i) - else: - thumbnails = [{'url': thumbnail}] - - def extract_count(key): - return int_or_none(info.get(f'{key}_count')) - return { + **traverse_obj(info, { + 'uploader': ('user', 'username', {str}), + 'uploader_id': ('user', ('id', 'permalink'), {str_or_none}, any), + 'uploader_url': ('user', 'permalink_url', {url_or_none}), + 'timestamp': ('created_at', {unified_timestamp}), + 'title': ('title', {str}), + 'track': ('title', {str}), + 'description': ('description', {str}), + 'duration': ('duration', {float_or_none(scale=1000)}), + 'webpage_url': ('permalink_url', {url_or_none}), + 'license': ('license', {str}), + 'view_count': ('playback_count', {int_or_none}), + 'like_count': (('favoritings_count', 'likes_count'), {int_or_none}, any), + 'comment_count': ('comment_count', {int_or_none}), + 'repost_count': ('reposts_count', {int_or_none}), + 'release_timestamp': ('release_date', {unified_timestamp}), + 'modified_timestamp': ('last_modified', {unified_timestamp}), + 'genres': ('genre', {str}, filter, all, filter), + 'tags': ('tag_list', {self._TAGS_RE.findall}, ..., ..., filter), + 'artists': ('publisher_metadata', 'artist', {str}, filter, all, filter), + }), 'id': track_id, - 'uploader': user.get('username'), - 'uploader_id': str_or_none(user.get('id')) or user.get('permalink'), - 'uploader_url': user.get('permalink_url'), - 'timestamp': unified_timestamp(info.get('created_at')), - 'title': info.get('title'), - 'track': info.get('title'), - 'description': info.get('description'), - 'thumbnails': thumbnails, - 'duration': float_or_none(info.get('duration'), 1000), - 'webpage_url': info.get('permalink_url'), - 'license': info.get('license'), - 'view_count': extract_count('playback'), - 'like_count': extract_count('favoritings') or extract_count('likes'), - 'comment_count': extract_count('comment'), - 'repost_count': extract_count('reposts'), - 'genres': traverse_obj(info, ('genre', {str}, filter, all, filter)), - 'tags': traverse_obj(info, ('tag_list', {self._TAGS_RE.findall}, ..., ..., filter)), - 'artists': traverse_obj(info, ('publisher_metadata', 'artist', {str}, filter, all, filter)), + 'thumbnails': self._extract_thumbnails(info), 'formats': formats if not extract_flat else None, '__post_extractor': self.extract_comments(track_id), } @@ -478,6 +452,43 @@ class SoundcloudBaseIE(InfoExtractor): if not next_url: break + def _extract_thumbnails(self, info): + artwork_url = traverse_obj(info, ('artwork_url', {url_or_none})) + thumbnail_url = artwork_url or traverse_obj(info, ('user', 'avatar_url', {url_or_none})) + if not thumbnail_url: + return None + + thumbnails = [] + if mobj := re.search(self._IMAGE_REPL_RE, thumbnail_url): + for image_id, size in self._ARTWORK_MAP.items(): + # Soundcloud serves JPEG regardless of URL's ext *except* for "original" thumb + ext = mobj.group('ext') if image_id == 'original' else 'jpg' + thumbnail = { + 'id': image_id, + 'url': re.sub(self._IMAGE_REPL_RE, f'-{image_id}.{ext}', thumbnail_url), + } + if image_id == 'tiny' and not artwork_url: + size = 18 + elif image_id == 'original': + thumbnail['preference'] = 10 + # "original" thumb ext doesn't always match ext used for other thumbs, check with HEAD req + req = self._request_webpage( + HEADRequest(thumbnail['url']), str(info['id']), note='Checking thumbnail extension', + errnote=False, fatal=False, headers=self._HEADERS) + if not req: + # If "original" thumb doesn't exist, assume different ext + ext = 'jpg' if ext == 'png' else 'png' + thumbnail['url'] = re.sub(self._IMAGE_REPL_RE, f'-{image_id}.{ext}', thumbnail_url) + if size: + thumbnail.update({ + 'width': size, + 'height': size, + }) + thumbnails.append(thumbnail) + else: + thumbnails = [{'url': thumbnail_url}] + return thumbnails + class SoundcloudIE(SoundcloudBaseIE): """Information extractor for soundcloud.com @@ -522,6 +533,8 @@ class SoundcloudIE(SoundcloudBaseIE): 'thumbnail': r're:https?://[ai]1\.sndcdn\.com/.+\.(?:jpg|png)', 'uploader_url': 'https://soundcloud.com/ethmusic', 'tags': 'count:14', + 'modified_timestamp': 1350184468, + 'modified_date': '20121014', }, 'params': {'skip_download': 'm3u8'}, }, { @@ -547,7 +560,8 @@ class SoundcloudIE(SoundcloudBaseIE): 'uploader_url': 'https://soundcloud.com/jaimemf', 'thumbnail': r're:https?://[ai]1\.sndcdn\.com/.+\.(?:jpg|png)', 'genres': ['youtubedl'], - 'tags': [], + 'modified_timestamp': 1386604920, + 'modified_date': '20131209', }, }, { # private link (alt format) @@ -572,7 +586,8 @@ class SoundcloudIE(SoundcloudBaseIE): 'uploader_url': 'https://soundcloud.com/jaimemf', 'thumbnail': r're:https?://[ai]1\.sndcdn\.com/.+\.(?:jpg|png)', 'genres': ['youtubedl'], - 'tags': [], + 'modified_timestamp': 1386604920, + 'modified_date': '20131209', }, }, { # downloadable song @@ -598,6 +613,10 @@ class SoundcloudIE(SoundcloudBaseIE): 'genres': ['Dance & EDM'], 'artists': ['80M'], 'tags': 'count:4', + 'release_timestamp': 1506384000, + 'release_date': '20170926', + 'modified_timestamp': 1647390150, + 'modified_date': '20220316', }, 'params': {'skip_download': 'm3u8'}, 'expected_warnings': ['Original download format is only available for registered users'], @@ -627,6 +646,8 @@ class SoundcloudIE(SoundcloudBaseIE): 'genres': ['Trance'], 'artists': ['Ori Uplift'], 'tags': 'count:6', + 'modified_timestamp': 1504258507, + 'modified_date': '20170901', }, 'expected_warnings': ['Original download format is only available for registered users'], }, { @@ -652,7 +673,8 @@ class SoundcloudIE(SoundcloudBaseIE): 'repost_count': int, 'uploader_url': 'https://soundcloud.com/garyvee', 'artists': ['MadReal'], - 'tags': [], + 'modified_timestamp': 1488293034, + 'modified_date': '20170228', }, 'params': {'skip_download': 'm3u8'}, }, { @@ -678,6 +700,8 @@ class SoundcloudIE(SoundcloudBaseIE): 'genres': ['Piano'], 'uploader_url': 'https://soundcloud.com/giovannisarani', 'tags': 'count:10', + 'modified_timestamp': 1692623663, + 'modified_date': '20230821', }, 'params': {'skip_download': 'm3u8'}, }, { @@ -696,12 +720,15 @@ class SoundcloudIE(SoundcloudBaseIE): 'like_count': int, 'repost_count': int, 'duration': 213.469, - 'tags': [], 'artists': ['$KORXH'], 'track': 'audio dealer', 'timestamp': 1737143201, 'upload_date': '20250117', 'license': 'all-rights-reserved', + 'release_timestamp': 1736985600, + 'release_date': '20250116', + 'modified_timestamp': 1737143467, + 'modified_date': '20250117', 'thumbnail': r're:https?://[ai]1\.sndcdn\.com/.+\.(?:jpg|png)', 'thumbnails': [ {'id': 'mini', 'url': 'https://i1.sndcdn.com/artworks-a1wKGMYNreDLTMrT-fGjRiw-mini.jpg'}, @@ -733,12 +760,13 @@ class SoundcloudIE(SoundcloudBaseIE): 'repost_count': int, 'duration': 241.601, 'thumbnail': 'https://i1.sndcdn.com/artworks-000209893581-orfv6t-original.jpg', - 'tags': [], 'artists': ['BENDY AND THE INK MACHINE SONG (Build Our Machine) INSTRUMENTAL '], 'track': 'BENDY AND THE INK MACHINE SONG (Build Our Machine) INSTRUMENTAL by DAGAMES', 'timestamp': 1488232827, 'upload_date': '20170227', 'license': 'all-rights-reserved', + 'modified_timestamp': 1645028949, + 'modified_date': '20220216', }, 'params': {'get_comments': True, 'skip_download': 'm3u8'}, }, { @@ -827,7 +855,17 @@ class SoundcloudPlaylistBaseIE(SoundcloudBaseIE): 'uploader': ('user', 'username', {str}), 'uploader_id': ('user', 'id', {str_or_none}), 'uploader_url': ('user', 'permalink_url', {url_or_none}), + 'timestamp': ('created_at', {unified_timestamp}), + 'release_timestamp': (('release_date', 'published_at'), {unified_timestamp}, any), + 'modified_timestamp': ('last_modified', {unified_timestamp}), + 'duration': ('duration', {float_or_none(scale=1000)}), + 'license': ('license', {str}), + 'like_count': ('likes_count', {int_or_none}), + 'repost_count': ('reposts_count', {int_or_none}), + 'genres': ('genre', {str}, filter, all, filter), + 'tags': ('tag_list', {self._TAGS_RE.findall}, ..., ..., filter), }), + thumbnails=self._extract_thumbnails(playlist), ) @@ -835,6 +873,7 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE): _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P[\w\d-]+)/sets/(?P[:\w\d-]+)(?:/(?P[^?/]+))?' IE_NAME = 'soundcloud:set' _TESTS = [{ + # No release date, no tags 'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep', 'info_dict': { 'id': '2284613', @@ -846,8 +885,68 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE): 'album': 'The Royal Concept EP', 'album_artists': ['The Royal Concept'], 'album_type': 'ep', + 'timestamp': 1343497860, + 'upload_date': '20120728', + 'modified_timestamp': 1358471457, + 'modified_date': '20130118', + 'duration': 1398.595, + 'license': 'all-rights-reserved', + 'like_count': 482, + 'repost_count': 99, + 'genres': ['Indie/pop'], + 'thumbnails': [ + {'id': 'mini', 'url': 'https://i1.sndcdn.com/artworks-000030896212-o16m9v-mini.jpg'}, + {'id': 'tiny', 'url': 'https://i1.sndcdn.com/artworks-000030896212-o16m9v-tiny.jpg'}, + {'id': 'small', 'url': 'https://i1.sndcdn.com/artworks-000030896212-o16m9v-small.jpg'}, + {'id': 'badge', 'url': 'https://i1.sndcdn.com/artworks-000030896212-o16m9v-badge.jpg'}, + {'id': 't67x67', 'url': 'https://i1.sndcdn.com/artworks-000030896212-o16m9v-t67x67.jpg'}, + {'id': 'large', 'url': 'https://i1.sndcdn.com/artworks-000030896212-o16m9v-large.jpg'}, + {'id': 't300x300', 'url': 'https://i1.sndcdn.com/artworks-000030896212-o16m9v-t300x300.jpg'}, + {'id': 'crop', 'url': 'https://i1.sndcdn.com/artworks-000030896212-o16m9v-crop.jpg'}, + {'id': 't500x500', 'url': 'https://i1.sndcdn.com/artworks-000030896212-o16m9v-t500x500.jpg'}, + {'id': 'original', 'url': 'https://i1.sndcdn.com/artworks-000030896212-o16m9v-original.jpg'}, + ], }, 'playlist_mincount': 5, + }, { + # Release date, multiple tags, empty desc + 'url': 'https://soundcloud.com/leviryan/sets/out-of-spite', + 'info_dict': { + 'id': '1524158182', + 'title': 'out of spite', + 'description': '', + 'uploader': 'Levi Ryan', + 'uploader_id': '229146182', + 'uploader_url': 'https://soundcloud.com/leviryan', + 'album': 'out of spite', + 'album_artists': ['Levi Ryan'], + 'album_type': 'album', + 'timestamp': 1667935849, + 'upload_date': '20221108', + 'release_timestamp': 1667865600, + 'release_date': '20221108', + 'modified_timestamp': 1667935903, + 'modified_date': '20221108', + 'duration': 1531.376, + 'license': 'all-rights-reserved', + 'like_count': 185, + 'repost_count': 40, + 'genres': ['Hip-hop & Rap'], + 'tags': ['Drum & Bass', 'Alternative', 'Ambient'], + 'thumbnails': [ + {'id': 'mini', 'url': 'https://i1.sndcdn.com/artworks-2hmuDCrcvCzzCaXZ-1rztZA-mini.jpg'}, + {'id': 'tiny', 'url': 'https://i1.sndcdn.com/artworks-2hmuDCrcvCzzCaXZ-1rztZA-tiny.jpg'}, + {'id': 'small', 'url': 'https://i1.sndcdn.com/artworks-2hmuDCrcvCzzCaXZ-1rztZA-small.jpg'}, + {'id': 'badge', 'url': 'https://i1.sndcdn.com/artworks-2hmuDCrcvCzzCaXZ-1rztZA-badge.jpg'}, + {'id': 't67x67', 'url': 'https://i1.sndcdn.com/artworks-2hmuDCrcvCzzCaXZ-1rztZA-t67x67.jpg'}, + {'id': 'large', 'url': 'https://i1.sndcdn.com/artworks-2hmuDCrcvCzzCaXZ-1rztZA-large.jpg'}, + {'id': 't300x300', 'url': 'https://i1.sndcdn.com/artworks-2hmuDCrcvCzzCaXZ-1rztZA-t300x300.jpg'}, + {'id': 'crop', 'url': 'https://i1.sndcdn.com/artworks-2hmuDCrcvCzzCaXZ-1rztZA-crop.jpg'}, + {'id': 't500x500', 'url': 'https://i1.sndcdn.com/artworks-2hmuDCrcvCzzCaXZ-1rztZA-t500x500.jpg'}, + {'id': 'original', 'url': 'https://i1.sndcdn.com/artworks-2hmuDCrcvCzzCaXZ-1rztZA-original.jpg'}, + ], + }, + 'playlist_count': 8, }, { 'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep/token', 'only_matching': True, @@ -1163,6 +1262,31 @@ class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE): 'album_artists': ['Non-Site Records'], 'album_type': 'playlist', 'album': 'TILT Brass - Bowery Poetry Club, August \'03 [Non-Site SCR 02]', + 'timestamp': 1363395687, + 'upload_date': '20130316', + 'release_timestamp': 1363392000, + 'release_date': '20130316', + 'modified_timestamp': 1444746489, + 'modified_date': '20151013', + 'duration': 2152.685, + 'license': 'all-rights-reserved', + 'like_count': 2, + 'repost_count': 2, + 'genres': ['Downtown'], + 'tags': ['Non-Site Records', 'TILT Brass', 'TILT Creative Brass Band', 'Bowery Poetry Club', + 'Nick Didkovsky', 'Tom Waits', 'Dave Ballou', 'Elliott Sharp', 'AFKA Prince', 'NPG'], + 'thumbnails': [ + {'id': 'mini', 'url': 'https://i1.sndcdn.com/artworks-000043059944-9zwy8g-mini.jpg'}, + {'id': 'tiny', 'url': 'https://i1.sndcdn.com/artworks-000043059944-9zwy8g-tiny.jpg'}, + {'id': 'small', 'url': 'https://i1.sndcdn.com/artworks-000043059944-9zwy8g-small.jpg'}, + {'id': 'badge', 'url': 'https://i1.sndcdn.com/artworks-000043059944-9zwy8g-badge.jpg'}, + {'id': 't67x67', 'url': 'https://i1.sndcdn.com/artworks-000043059944-9zwy8g-t67x67.jpg'}, + {'id': 'large', 'url': 'https://i1.sndcdn.com/artworks-000043059944-9zwy8g-large.jpg'}, + {'id': 't300x300', 'url': 'https://i1.sndcdn.com/artworks-000043059944-9zwy8g-t300x300.jpg'}, + {'id': 'crop', 'url': 'https://i1.sndcdn.com/artworks-000043059944-9zwy8g-crop.jpg'}, + {'id': 't500x500', 'url': 'https://i1.sndcdn.com/artworks-000043059944-9zwy8g-t500x500.jpg'}, + {'id': 'original', 'url': 'https://i1.sndcdn.com/artworks-000043059944-9zwy8g-original.png'}, + ], }, 'playlist_count': 6, }, {