[ie/soundcloud] Improve metadata extraction (#17088)

Closes #7351
Authored by: noseb13eds
This commit is contained in:
noseb13eds 2026-07-01 00:46:54 -04:00 committed by GitHub
parent acc995cf91
commit 8bdfbfd446
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -378,56 +378,30 @@ class SoundcloudBaseIE(InfoExtractor):
if info.get('policy') == 'BLOCK':
self.raise_geo_restricted(metadata_available=True)
user = info.get('user') or {}
thumbnails = []
artwork_url = info.get('artwork_url')
thumbnail = artwork_url or user.get('avatar_url')
if url_or_none(thumbnail):
if mobj := re.search(self._IMAGE_REPL_RE, thumbnail):
for image_id, size in self._ARTWORK_MAP.items():
# Soundcloud serves JPEG regardless of URL's ext *except* for "original" thumb
ext = mobj.group('ext') if image_id == 'original' else 'jpg'
i = {
'id': image_id,
'url': re.sub(self._IMAGE_REPL_RE, f'-{image_id}.{ext}', thumbnail),
}
if image_id == 'tiny' and not artwork_url:
size = 18
elif image_id == 'original':
i['preference'] = 10
if size:
i.update({
'width': size,
'height': size,
})
thumbnails.append(i)
else:
thumbnails = [{'url': thumbnail}]
def extract_count(key):
return int_or_none(info.get(f'{key}_count'))
return {
**traverse_obj(info, {
'uploader': ('user', 'username', {str}),
'uploader_id': ('user', ('id', 'permalink'), {str_or_none}, any),
'uploader_url': ('user', 'permalink_url', {url_or_none}),
'timestamp': ('created_at', {unified_timestamp}),
'title': ('title', {str}),
'track': ('title', {str}),
'description': ('description', {str}),
'duration': ('duration', {float_or_none(scale=1000)}),
'webpage_url': ('permalink_url', {url_or_none}),
'license': ('license', {str}),
'view_count': ('playback_count', {int_or_none}),
'like_count': (('favoritings_count', 'likes_count'), {int_or_none}, any),
'comment_count': ('comment_count', {int_or_none}),
'repost_count': ('reposts_count', {int_or_none}),
'release_timestamp': ('release_date', {unified_timestamp}),
'modified_timestamp': ('last_modified', {unified_timestamp}),
'genres': ('genre', {str}, filter, all, filter),
'tags': ('tag_list', {self._TAGS_RE.findall}, ..., ..., filter),
'artists': ('publisher_metadata', 'artist', {str}, filter, all, filter),
}),
'id': track_id,
'uploader': user.get('username'),
'uploader_id': str_or_none(user.get('id')) or user.get('permalink'),
'uploader_url': user.get('permalink_url'),
'timestamp': unified_timestamp(info.get('created_at')),
'title': info.get('title'),
'track': info.get('title'),
'description': info.get('description'),
'thumbnails': thumbnails,
'duration': float_or_none(info.get('duration'), 1000),
'webpage_url': info.get('permalink_url'),
'license': info.get('license'),
'view_count': extract_count('playback'),
'like_count': extract_count('favoritings') or extract_count('likes'),
'comment_count': extract_count('comment'),
'repost_count': extract_count('reposts'),
'genres': traverse_obj(info, ('genre', {str}, filter, all, filter)),
'tags': traverse_obj(info, ('tag_list', {self._TAGS_RE.findall}, ..., ..., filter)),
'artists': traverse_obj(info, ('publisher_metadata', 'artist', {str}, filter, all, filter)),
'thumbnails': self._extract_thumbnails(info),
'formats': formats if not extract_flat else None,
'__post_extractor': self.extract_comments(track_id),
}
@ -478,6 +452,43 @@ class SoundcloudBaseIE(InfoExtractor):
if not next_url:
break
def _extract_thumbnails(self, info):
artwork_url = traverse_obj(info, ('artwork_url', {url_or_none}))
thumbnail_url = artwork_url or traverse_obj(info, ('user', 'avatar_url', {url_or_none}))
if not thumbnail_url:
return None
thumbnails = []
if mobj := re.search(self._IMAGE_REPL_RE, thumbnail_url):
for image_id, size in self._ARTWORK_MAP.items():
# Soundcloud serves JPEG regardless of URL's ext *except* for "original" thumb
ext = mobj.group('ext') if image_id == 'original' else 'jpg'
thumbnail = {
'id': image_id,
'url': re.sub(self._IMAGE_REPL_RE, f'-{image_id}.{ext}', thumbnail_url),
}
if image_id == 'tiny' and not artwork_url:
size = 18
elif image_id == 'original':
thumbnail['preference'] = 10
# "original" thumb ext doesn't always match ext used for other thumbs, check with HEAD req
req = self._request_webpage(
HEADRequest(thumbnail['url']), str(info['id']), note='Checking thumbnail extension',
errnote=False, fatal=False, headers=self._HEADERS)
if not req:
# If "original" thumb doesn't exist, assume different ext
ext = 'jpg' if ext == 'png' else 'png'
thumbnail['url'] = re.sub(self._IMAGE_REPL_RE, f'-{image_id}.{ext}', thumbnail_url)
if size:
thumbnail.update({
'width': size,
'height': size,
})
thumbnails.append(thumbnail)
else:
thumbnails = [{'url': thumbnail_url}]
return thumbnails
class SoundcloudIE(SoundcloudBaseIE):
"""Information extractor for soundcloud.com
@ -522,6 +533,8 @@ class SoundcloudIE(SoundcloudBaseIE):
'thumbnail': r're:https?://[ai]1\.sndcdn\.com/.+\.(?:jpg|png)',
'uploader_url': 'https://soundcloud.com/ethmusic',
'tags': 'count:14',
'modified_timestamp': 1350184468,
'modified_date': '20121014',
},
'params': {'skip_download': 'm3u8'},
}, {
@ -547,7 +560,8 @@ class SoundcloudIE(SoundcloudBaseIE):
'uploader_url': 'https://soundcloud.com/jaimemf',
'thumbnail': r're:https?://[ai]1\.sndcdn\.com/.+\.(?:jpg|png)',
'genres': ['youtubedl'],
'tags': [],
'modified_timestamp': 1386604920,
'modified_date': '20131209',
},
}, {
# private link (alt format)
@ -572,7 +586,8 @@ class SoundcloudIE(SoundcloudBaseIE):
'uploader_url': 'https://soundcloud.com/jaimemf',
'thumbnail': r're:https?://[ai]1\.sndcdn\.com/.+\.(?:jpg|png)',
'genres': ['youtubedl'],
'tags': [],
'modified_timestamp': 1386604920,
'modified_date': '20131209',
},
}, {
# downloadable song
@ -598,6 +613,10 @@ class SoundcloudIE(SoundcloudBaseIE):
'genres': ['Dance & EDM'],
'artists': ['80M'],
'tags': 'count:4',
'release_timestamp': 1506384000,
'release_date': '20170926',
'modified_timestamp': 1647390150,
'modified_date': '20220316',
},
'params': {'skip_download': 'm3u8'},
'expected_warnings': ['Original download format is only available for registered users'],
@ -627,6 +646,8 @@ class SoundcloudIE(SoundcloudBaseIE):
'genres': ['Trance'],
'artists': ['Ori Uplift'],
'tags': 'count:6',
'modified_timestamp': 1504258507,
'modified_date': '20170901',
},
'expected_warnings': ['Original download format is only available for registered users'],
}, {
@ -652,7 +673,8 @@ class SoundcloudIE(SoundcloudBaseIE):
'repost_count': int,
'uploader_url': 'https://soundcloud.com/garyvee',
'artists': ['MadReal'],
'tags': [],
'modified_timestamp': 1488293034,
'modified_date': '20170228',
},
'params': {'skip_download': 'm3u8'},
}, {
@ -678,6 +700,8 @@ class SoundcloudIE(SoundcloudBaseIE):
'genres': ['Piano'],
'uploader_url': 'https://soundcloud.com/giovannisarani',
'tags': 'count:10',
'modified_timestamp': 1692623663,
'modified_date': '20230821',
},
'params': {'skip_download': 'm3u8'},
}, {
@ -696,12 +720,15 @@ class SoundcloudIE(SoundcloudBaseIE):
'like_count': int,
'repost_count': int,
'duration': 213.469,
'tags': [],
'artists': ['$KORXH'],
'track': 'audio dealer',
'timestamp': 1737143201,
'upload_date': '20250117',
'license': 'all-rights-reserved',
'release_timestamp': 1736985600,
'release_date': '20250116',
'modified_timestamp': 1737143467,
'modified_date': '20250117',
'thumbnail': r're:https?://[ai]1\.sndcdn\.com/.+\.(?:jpg|png)',
'thumbnails': [
{'id': 'mini', 'url': 'https://i1.sndcdn.com/artworks-a1wKGMYNreDLTMrT-fGjRiw-mini.jpg'},
@ -733,12 +760,13 @@ class SoundcloudIE(SoundcloudBaseIE):
'repost_count': int,
'duration': 241.601,
'thumbnail': 'https://i1.sndcdn.com/artworks-000209893581-orfv6t-original.jpg',
'tags': [],
'artists': ['BENDY AND THE INK MACHINE SONG (Build Our Machine) INSTRUMENTAL '],
'track': 'BENDY AND THE INK MACHINE SONG (Build Our Machine) INSTRUMENTAL by DAGAMES',
'timestamp': 1488232827,
'upload_date': '20170227',
'license': 'all-rights-reserved',
'modified_timestamp': 1645028949,
'modified_date': '20220216',
},
'params': {'get_comments': True, 'skip_download': 'm3u8'},
}, {
@ -827,7 +855,17 @@ class SoundcloudPlaylistBaseIE(SoundcloudBaseIE):
'uploader': ('user', 'username', {str}),
'uploader_id': ('user', 'id', {str_or_none}),
'uploader_url': ('user', 'permalink_url', {url_or_none}),
'timestamp': ('created_at', {unified_timestamp}),
'release_timestamp': (('release_date', 'published_at'), {unified_timestamp}, any),
'modified_timestamp': ('last_modified', {unified_timestamp}),
'duration': ('duration', {float_or_none(scale=1000)}),
'license': ('license', {str}),
'like_count': ('likes_count', {int_or_none}),
'repost_count': ('reposts_count', {int_or_none}),
'genres': ('genre', {str}, filter, all, filter),
'tags': ('tag_list', {self._TAGS_RE.findall}, ..., ..., filter),
}),
thumbnails=self._extract_thumbnails(playlist),
)
@ -835,6 +873,7 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE):
_VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<uploader>[\w\d-]+)/sets/(?P<slug_title>[:\w\d-]+)(?:/(?P<token>[^?/]+))?'
IE_NAME = 'soundcloud:set'
_TESTS = [{
# No release date, no tags
'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep',
'info_dict': {
'id': '2284613',
@ -846,8 +885,68 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE):
'album': 'The Royal Concept EP',
'album_artists': ['The Royal Concept'],
'album_type': 'ep',
'timestamp': 1343497860,
'upload_date': '20120728',
'modified_timestamp': 1358471457,
'modified_date': '20130118',
'duration': 1398.595,
'license': 'all-rights-reserved',
'like_count': 482,
'repost_count': 99,
'genres': ['Indie/pop'],
'thumbnails': [
{'id': 'mini', 'url': 'https://i1.sndcdn.com/artworks-000030896212-o16m9v-mini.jpg'},
{'id': 'tiny', 'url': 'https://i1.sndcdn.com/artworks-000030896212-o16m9v-tiny.jpg'},
{'id': 'small', 'url': 'https://i1.sndcdn.com/artworks-000030896212-o16m9v-small.jpg'},
{'id': 'badge', 'url': 'https://i1.sndcdn.com/artworks-000030896212-o16m9v-badge.jpg'},
{'id': 't67x67', 'url': 'https://i1.sndcdn.com/artworks-000030896212-o16m9v-t67x67.jpg'},
{'id': 'large', 'url': 'https://i1.sndcdn.com/artworks-000030896212-o16m9v-large.jpg'},
{'id': 't300x300', 'url': 'https://i1.sndcdn.com/artworks-000030896212-o16m9v-t300x300.jpg'},
{'id': 'crop', 'url': 'https://i1.sndcdn.com/artworks-000030896212-o16m9v-crop.jpg'},
{'id': 't500x500', 'url': 'https://i1.sndcdn.com/artworks-000030896212-o16m9v-t500x500.jpg'},
{'id': 'original', 'url': 'https://i1.sndcdn.com/artworks-000030896212-o16m9v-original.jpg'},
],
},
'playlist_mincount': 5,
}, {
# Release date, multiple tags, empty desc
'url': 'https://soundcloud.com/leviryan/sets/out-of-spite',
'info_dict': {
'id': '1524158182',
'title': 'out of spite',
'description': '',
'uploader': 'Levi Ryan',
'uploader_id': '229146182',
'uploader_url': 'https://soundcloud.com/leviryan',
'album': 'out of spite',
'album_artists': ['Levi Ryan'],
'album_type': 'album',
'timestamp': 1667935849,
'upload_date': '20221108',
'release_timestamp': 1667865600,
'release_date': '20221108',
'modified_timestamp': 1667935903,
'modified_date': '20221108',
'duration': 1531.376,
'license': 'all-rights-reserved',
'like_count': 185,
'repost_count': 40,
'genres': ['Hip-hop & Rap'],
'tags': ['Drum & Bass', 'Alternative', 'Ambient'],
'thumbnails': [
{'id': 'mini', 'url': 'https://i1.sndcdn.com/artworks-2hmuDCrcvCzzCaXZ-1rztZA-mini.jpg'},
{'id': 'tiny', 'url': 'https://i1.sndcdn.com/artworks-2hmuDCrcvCzzCaXZ-1rztZA-tiny.jpg'},
{'id': 'small', 'url': 'https://i1.sndcdn.com/artworks-2hmuDCrcvCzzCaXZ-1rztZA-small.jpg'},
{'id': 'badge', 'url': 'https://i1.sndcdn.com/artworks-2hmuDCrcvCzzCaXZ-1rztZA-badge.jpg'},
{'id': 't67x67', 'url': 'https://i1.sndcdn.com/artworks-2hmuDCrcvCzzCaXZ-1rztZA-t67x67.jpg'},
{'id': 'large', 'url': 'https://i1.sndcdn.com/artworks-2hmuDCrcvCzzCaXZ-1rztZA-large.jpg'},
{'id': 't300x300', 'url': 'https://i1.sndcdn.com/artworks-2hmuDCrcvCzzCaXZ-1rztZA-t300x300.jpg'},
{'id': 'crop', 'url': 'https://i1.sndcdn.com/artworks-2hmuDCrcvCzzCaXZ-1rztZA-crop.jpg'},
{'id': 't500x500', 'url': 'https://i1.sndcdn.com/artworks-2hmuDCrcvCzzCaXZ-1rztZA-t500x500.jpg'},
{'id': 'original', 'url': 'https://i1.sndcdn.com/artworks-2hmuDCrcvCzzCaXZ-1rztZA-original.jpg'},
],
},
'playlist_count': 8,
}, {
'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep/token',
'only_matching': True,
@ -1163,6 +1262,31 @@ class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE):
'album_artists': ['Non-Site Records'],
'album_type': 'playlist',
'album': 'TILT Brass - Bowery Poetry Club, August \'03 [Non-Site SCR 02]',
'timestamp': 1363395687,
'upload_date': '20130316',
'release_timestamp': 1363392000,
'release_date': '20130316',
'modified_timestamp': 1444746489,
'modified_date': '20151013',
'duration': 2152.685,
'license': 'all-rights-reserved',
'like_count': 2,
'repost_count': 2,
'genres': ['Downtown'],
'tags': ['Non-Site Records', 'TILT Brass', 'TILT Creative Brass Band', 'Bowery Poetry Club',
'Nick Didkovsky', 'Tom Waits', 'Dave Ballou', 'Elliott Sharp', 'AFKA Prince', 'NPG'],
'thumbnails': [
{'id': 'mini', 'url': 'https://i1.sndcdn.com/artworks-000043059944-9zwy8g-mini.jpg'},
{'id': 'tiny', 'url': 'https://i1.sndcdn.com/artworks-000043059944-9zwy8g-tiny.jpg'},
{'id': 'small', 'url': 'https://i1.sndcdn.com/artworks-000043059944-9zwy8g-small.jpg'},
{'id': 'badge', 'url': 'https://i1.sndcdn.com/artworks-000043059944-9zwy8g-badge.jpg'},
{'id': 't67x67', 'url': 'https://i1.sndcdn.com/artworks-000043059944-9zwy8g-t67x67.jpg'},
{'id': 'large', 'url': 'https://i1.sndcdn.com/artworks-000043059944-9zwy8g-large.jpg'},
{'id': 't300x300', 'url': 'https://i1.sndcdn.com/artworks-000043059944-9zwy8g-t300x300.jpg'},
{'id': 'crop', 'url': 'https://i1.sndcdn.com/artworks-000043059944-9zwy8g-crop.jpg'},
{'id': 't500x500', 'url': 'https://i1.sndcdn.com/artworks-000043059944-9zwy8g-t500x500.jpg'},
{'id': 'original', 'url': 'https://i1.sndcdn.com/artworks-000043059944-9zwy8g-original.png'},
],
},
'playlist_count': 6,
}, {