From 2b27a203f7573cb491c8bef77cb4d944cee6f8cf Mon Sep 17 00:00:00 2001 From: doe1080 <98906116+doe1080@users.noreply.github.com> Date: Thu, 18 Jun 2026 13:58:54 +0900 Subject: [PATCH] [ie/periscope] Improve metadata extraction (#16084) Authored by: doe1080 --- yt_dlp/extractor/periscope.py | 56 +++++++++++++++++------------------ yt_dlp/extractor/twitter.py | 15 ++++++++-- 2 files changed, 39 insertions(+), 32 deletions(-) diff --git a/yt_dlp/extractor/periscope.py b/yt_dlp/extractor/periscope.py index e3b33c4d98..d0cd2e20ec 100644 --- a/yt_dlp/extractor/periscope.py +++ b/yt_dlp/extractor/periscope.py @@ -1,8 +1,11 @@ from .common import InfoExtractor from ..utils import ( + clean_html, int_or_none, parse_iso8601, + str_or_none, unescapeHTML, + url_or_none, ) from ..utils.traversal import traverse_obj @@ -18,29 +21,24 @@ class PeriscopeBaseIE(InfoExtractor): item_id, query=query) def _parse_broadcast_data(self, broadcast, video_id): - title = broadcast.get('status') or 'Periscope Broadcast' - uploader = broadcast.get('user_display_name') or broadcast.get('username') - title = f'{uploader} - {title}' if uploader else title - thumbnails = [{ - 'url': broadcast[image], - } for image in ('image_url', 'image_url_medium', 'image_url_small') if broadcast.get(image)] - return { - 'id': broadcast.get('id') or video_id, - 'title': title, - 'timestamp': parse_iso8601(broadcast.get('created_at')) or int_or_none( - broadcast.get('created_at_ms'), scale=1000), - 'release_timestamp': int_or_none(broadcast.get('scheduled_start_ms'), scale=1000), - 'uploader': uploader, - 'uploader_id': broadcast.get('user_id') or broadcast.get('username'), - 'thumbnails': thumbnails, - 'view_count': int_or_none(broadcast.get('total_watched')), - 'concurrent_view_count': int_or_none(broadcast.get('total_watching')), - 'tags': broadcast.get('tags'), + 'display_id': video_id, 'live_status': { 'running': 'is_live', 'not_started': 'is_upcoming', }.get(traverse_obj(broadcast, ('state', {str.lower}))) or 'was_live', + **traverse_obj(broadcast, { + 'id': ('id', {str_or_none}), + 'title': ('status', {clean_html}, filter), + 'concurrent_view_count': ('total_watching', {int_or_none}), + 'release_timestamp': (('scheduled_start_ms', 'start_ms'), {int_or_none(scale=1000)}, any), + 'tags': ('tags', ..., {clean_html}, filter, all, filter), + 'thumbnails': (('image_url_small', 'image_url_medium', 'image_url'), {'url': {url_or_none}}), + 'timestamp': ((('created_at', {parse_iso8601}), ('created_at_ms', {int_or_none(scale=1000)})), any), + 'uploader': ('user_display_name', {clean_html}, filter), + 'uploader_id': ('username', {clean_html}, filter), + 'view_count': ('total_watched', {int_or_none}), + }), } @staticmethod @@ -69,22 +67,22 @@ class PeriscopeBaseIE(InfoExtractor): class PeriscopeIE(PeriscopeBaseIE): IE_DESC = 'Periscope' IE_NAME = 'periscope' - _VALID_URL = r'https?://(?:www\.)?(?:periscope|pscp)\.tv/[^/]+/(?P[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?(?:periscope|pscp)\.tv/[^/?#]+/(?P[^/?#]+)' _EMBED_REGEX = [r']+src=([\'"])(?P(?:https?:)?//(?:www\.)?(?:periscope|pscp)\.tv/(?:(?!\1).)+)\1'] - # Alive example URLs can be found here https://www.periscope.tv/ _TESTS = [{ - 'url': 'https://www.periscope.tv/w/aJUQnjY3MjA3ODF8NTYxMDIyMDl2zCg2pECBgwTqRpQuQD352EMPTKQjT4uqlM3cgWFA-g==', - 'md5': '65b57957972e503fcbbaeed8f4fa04ca', + 'url': 'https://www.periscope.tv/LularoeHusbandMike/1mrGmgaXAVqxy', 'info_dict': { - 'id': '56102209', + 'id': '1mrGmgaXAVqxy', 'ext': 'mp4', - 'title': 'Bec Boop - πŸš βœˆοΈπŸ‡¬πŸ‡§ Fly above #London in Emirates Air Line cable car at night πŸ‡¬πŸ‡§βœˆοΈπŸš  #BoopScope πŸŽ€πŸ’—', - 'timestamp': 1438978559, - 'upload_date': '20150807', - 'uploader': 'Bec Boop', - 'uploader_id': '1465763', + 'title': 'πŸŽ‰πŸ‘πŸΌ BROWSE OUR ENTIRE 1,900 +PIECE INVENTORY! πŸ‘πŸΌπŸŽ‰ #lularoe', + 'live_status': 'was_live', + 'tags': 'count:1', + 'thumbnail': r're:https?://prod-fastly-us-east-1\.video\.pscp\.tv/.+', + 'timestamp': 1498621952, + 'upload_date': '20170628', + 'uploader': 'LuLaRoe Husband Mike', + 'uploader_id': 'LularoeHusbandMike', }, - 'skip': 'Expires in 24 hours', }, { 'url': 'https://www.periscope.tv/w/1ZkKzPbMVggJv', 'only_matching': True, diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py index 1b85f36faf..0002ef67a9 100644 --- a/yt_dlp/extractor/twitter.py +++ b/yt_dlp/extractor/twitter.py @@ -10,6 +10,7 @@ from ..jsinterp import js_number_to_string from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, + clean_html, dict_get, filter_dict, float_or_none, @@ -1451,16 +1452,18 @@ class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE): _VALID_URL = TwitterBaseIE._BASE_REGEX + r'i/(?Pbroadcasts|events)/(?P\w+)' _TESTS = [{ - # untitled Periscope video + # Untitled broadcast 'url': 'https://twitter.com/i/broadcasts/1yNGaQLWpejGj', 'info_dict': { 'id': '1yNGaQLWpejGj', 'ext': 'mp4', - 'title': 'Andrea May Sahouri - Periscope Broadcast', + 'title': 'Andrea May Sahouri - Twitter Broadcast', 'display_id': '1yNGaQLWpejGj', 'uploader': 'Andrea May Sahouri', 'uploader_id': 'andreamsahouri', 'uploader_url': 'https://twitter.com/andreamsahouri', + 'release_date': '20200601', + 'release_timestamp': 1590973647, 'timestamp': 1590973638, 'upload_date': '20200601', 'thumbnail': r're:^https?://[^?#]+\.jpg\?token=', @@ -1478,6 +1481,8 @@ class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE): 'uploader': 'SpaceX', 'uploader_id': 'SpaceX', 'uploader_url': 'https://twitter.com/SpaceX', + 'release_date': '20210303', + 'release_timestamp': 1614812964, 'timestamp': 1614812942, 'upload_date': '20210303', 'thumbnail': r're:^https?://[^?#]+\.jpg\?token=', @@ -1495,6 +1500,8 @@ class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE): 'uploader': 'SpaceX', 'uploader_id': 'SpaceX', 'uploader_url': 'https://twitter.com/SpaceX', + 'release_date': '20230420', + 'release_timestamp': 1681994486, 'timestamp': 1681993964, 'upload_date': '20230420', 'thumbnail': r're:^https?://[^?#]+\.jpg\?token=', @@ -1544,7 +1551,9 @@ class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE): info = self._parse_broadcast_data(broadcast, broadcast_id) info.update({ 'display_id': display_id, - 'title': broadcast.get('status') or info.get('title'), + 'title': traverse_obj(broadcast, ( + 'status', {clean_html}, filter, + )) or join_nonempty(info.get('uploader'), 'Twitter Broadcast', delim=' - '), 'uploader_id': broadcast.get('twitter_username') or info.get('uploader_id'), 'uploader_url': format_field( broadcast, 'twitter_username', 'https://twitter.com/%s', default=None),