From 0048ed894e5b25f34923be4d8a0db6bbb47b05b7 Mon Sep 17 00:00:00 2001 From: MrDemocracy Date: Thu, 24 Oct 2024 15:37:45 +0200 Subject: [PATCH] [nrk] Made suggested changes, some slight refactoring and updated subtitles test --- test/test_subtitles.py | 9 +- yt_dlp/extractor/_extractors.py | 1 - yt_dlp/extractor/nrk.py | 389 +++++++++++++++++--------------- 3 files changed, 217 insertions(+), 182 deletions(-) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index f3b0056179..f98c2b1e5b 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -11,7 +11,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from test.helper import FakeYDL, is_download_test, md5 from yt_dlp.extractor import ( NPOIE, - NRKTVIE, + NRKIE, PBSIE, CeskaTelevizeIE, ComedyCentralIE, @@ -299,15 +299,16 @@ class TestMTVSubtitles(BaseTestSubtitles): @is_download_test class TestNRKSubtitles(BaseTestSubtitles): - url = 'http://tv.nrk.no/serie/ikke-gjoer-dette-hjemme/DMPV73000411/sesong-2/episode-1' - IE = NRKTVIE + url = 'nrk:DMPV73000411' # http://tv.nrk.no/serie/ikke-gjoer-dette-hjemme/DMPV73000411/sesong-2/episode-1 + IE = NRKIE def test_allsubtitles(self): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() - self.assertEqual(set(subtitles.keys()), {'nb-ttv'}) + self.assertEqual(set(subtitles.keys()), {'nb-ttv', 'no'}) self.assertEqual(md5(subtitles['nb-ttv']), '67e06ff02d0deaf975e68f6cb8f6a149') + self.assertEqual(md5(subtitles['no']), 'fc01036074116d245ddc6ba6f679263b') @is_download_test diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 0f599c9db7..1040b8916c 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1386,7 +1386,6 @@ from .nrk import ( NRKSkoleIE, NRKTVDirekteIE, NRKTVEpisodeIE, - NRKTVEpisodesIE, NRKTVSeasonIE, NRKTVSeriesIE, ) diff --git a/yt_dlp/extractor/nrk.py b/yt_dlp/extractor/nrk.py index 
21311e86a9..9c9f8e5ab2 100644 --- a/yt_dlp/extractor/nrk.py +++ b/yt_dlp/extractor/nrk.py @@ -14,13 +14,12 @@ from ..utils import ( parse_iso8601, str_or_none, traverse_obj, - try_get, url_or_none, urljoin, ) -class NRKBaseIE(InfoExtractor): +class NRKIE(InfoExtractor): _GEO_COUNTRIES = ['NO'] _CDN_REPL_REGEX = r'''(?x):// (?: @@ -32,44 +31,6 @@ class NRKBaseIE(InfoExtractor): _LOGIN_URL = 'https://innlogging.nrk.no/logginn' _AUTH_TOKEN = '' _API_CALL_HEADERS = {'Accept': 'application/json;device=player-core'} - - def _extract_nrk_formats_and_subtitles(self, asset_url, video_id): - - if re.match(r'https?://[^/]+\.akamaihd\.net/i/', asset_url): - return self._extract_akamai_formats(asset_url, video_id) - asset_url = re.sub(r'(?:bw_(?:low|high)=\d+|no_audio_only|adap=.+?\b)&?', '', asset_url) - formats, subtitles = self._extract_m3u8_formats_and_subtitles( - asset_url, video_id, 'mp4', 'm3u8_native', fatal=False) - if not formats and re.search(self._CDN_REPL_REGEX, asset_url): - formats, subtitles = self._extract_m3u8_formats_and_subtitles( - re.sub(self._CDN_REPL_REGEX, '://nrk-od-%02d.akamaized.net/no/' % random.randint(0, 99), asset_url), - video_id, 'mp4', 'm3u8_native', fatal=False) - return formats, subtitles - - def _raise_error(self, data): - MESSAGES = { - 'ProgramRightsAreNotReady': 'Du kan dessverre ikke se eller høre programmet', - 'ProgramRightsHasExpired': 'Programmet har gått ut', - 'NoProgramRights': 'Ikke tilgjengelig', - 'ProgramIsGeoBlocked': 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge', - } - message_type = data.get('messageType', '') - # Can be ProgramIsGeoBlocked or ChannelIsGeoBlocked* - if 'IsGeoBlocked' in message_type or try_get(data, lambda x: x['usageRights']['isGeoBlocked']) is True: - self.raise_geo_restricted( - msg=MESSAGES.get('ProgramIsGeoBlocked'), - countries=self._GEO_COUNTRIES) - message = data.get('endUserMessage') or MESSAGES.get(message_type, message_type) - raise ExtractorError(f'{self.IE_NAME} said: 
{message}', expected=True) - - def _call_api(self, path, video_id, item=None, note=None, fatal=True, query=None): - return self._download_json( - urljoin('https://psapi.nrk.no/', path), - video_id, note or f'Downloading {item} JSON', - fatal=fatal, query=query, headers=self._API_CALL_HEADERS) - - -class NRKIE(NRKBaseIE): _VALID_URL = r'''(?x) (?: nrk:| @@ -81,7 +42,6 @@ class NRKIE(NRKBaseIE): ) (?P[^?\#&]+) ''' - _TESTS = [{ # video 'url': 'http://www.nrk.no/video/PS*150533', @@ -157,13 +117,78 @@ class NRKIE(NRKBaseIE): 'only_matching': True, }] + def _extract_nrk_formats_and_subtitles(self, asset_url, video_id): + + if re.match(r'https?://[^/]+\.akamaihd\.net/i/', asset_url): + return self._extract_akamai_formats(asset_url, video_id) + asset_url = re.sub(r'(?:bw_(?:low|high)=\d+|no_audio_only|adap=.+?\b)&?', '', asset_url) + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + asset_url, video_id, 'mp4', 'm3u8_native', fatal=False) + if not formats and re.search(self._CDN_REPL_REGEX, asset_url): + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + re.sub(self._CDN_REPL_REGEX, '://nrk-od-%02d.akamaized.net/no/' % random.randint(0, 99), asset_url), + video_id, 'mp4', 'm3u8_native', fatal=False) + return formats, subtitles + + def _raise_error(self, data): + MESSAGES = { + 'ProgramRightsAreNotReady': 'Du kan dessverre ikke se eller høre programmet', + 'ProgramRightsHasExpired': 'Programmet har gått ut', + 'NoProgramRights': 'Ikke tilgjengelig', + 'ProgramIsGeoBlocked': 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge', + } + message_type = data.get('messageType', '') + # Can be ProgramIsGeoBlocked or ChannelIsGeoBlocked* + if 'IsGeoBlocked' in message_type or traverse_obj(data, ('usageRights', 'isGeoBlocked')) is True: + self.raise_geo_restricted( + msg=MESSAGES.get('ProgramIsGeoBlocked'), + countries=self._GEO_COUNTRIES) + message = data.get('endUserMessage') or MESSAGES.get(message_type, message_type) + raise 
ExtractorError(f'{self.IE_NAME} said: {message}', expected=True) + + def _call_api(self, path, video_id, item=None, note=None, fatal=True, query=None): + return self._download_json( + urljoin('https://psapi.nrk.no/', path), + video_id, note or f'Downloading {item} JSON', + fatal=fatal, query=query, headers=self._API_CALL_HEADERS) + + def _perform_login(self, username, password): + try: + self._download_json( + self._LOGIN_URL, None, headers={'Content-Type': 'application/json; charset=UTF-8', 'accept': 'application/json; charset=utf-8'}, + data=json.dumps({ + 'clientId': '', + 'hashedPassword': {'current': { + 'hash': password, + 'recipe': { + 'algorithm': 'cleartext', + 'salt': '', + }, + }, + }, + 'password': password, + 'username': username, + }).encode()) + + self._download_webpage('https://tv.nrk.no/auth/web/login/opsession', None) + response = self._download_json('https://tv.nrk.no/auth/session/tokenforsub/_', None) + self._AUTH_TOKEN = traverse_obj(response, ('session', 'accessToken')) + self._API_CALL_HEADERS['authorization'] = f'Bearer {self._AUTH_TOKEN}' + except ExtractorError as e: + message = None + if isinstance(e.cause, HTTPError) and e.cause.status in (401, 400): + resp = self._parse_json( + e.cause.response.read().decode(), None, fatal=False) or {} + message = next((error['message'] for error in resp['errors'] if error['field'] == 'Password'), None) + self.report_warning(message or 'Unable to log in') + def _real_extract(self, url): video_id = self._match_id(url).split('/')[-1] # known values for preferredCdn: akamai, iponly, minicdn and telenor manifest = self._call_api(f'playback/manifest/{video_id}', video_id, 'manifest', query={'preferredCdn': 'akamai'}) - video_id = try_get(manifest, lambda x: x['id'], str) or video_id + video_id = manifest.get('id') or video_id if manifest.get('playability') == 'nonPlayable': self._raise_error(manifest['nonPlayable']) @@ -172,11 +197,13 @@ class NRKIE(NRKBaseIE): formats = [] subtitles = {} - for asset in 
playable['assets']: - if not isinstance(asset, dict): + has_drm = False + for asset in traverse_obj(playable, ('assets', ..., {dict})): + encryption_scheme = asset.get('encryptionScheme') + if encryption_scheme not in (None, 'none', 'statickey'): + self.report_warning(f'Skipping asset with unsupported encryption scheme "{encryption_scheme}"') + has_drm = True continue - if asset.get('encrypted'): - pass # Unencrypted stream no longer available format_url = url_or_none(asset.get('url')) if not format_url: continue @@ -192,19 +219,22 @@ class NRKIE(NRKBaseIE): 'vcodec': 'none', }) - data = self._call_api(try_get(manifest, lambda x: x['_links']['metadata']['href']), video_id, 'metadata') + if not formats and has_drm: + self.report_drm(video_id) - preplay = try_get(data, lambda x: x['preplay']) - titles = try_get(preplay, lambda x: x['titles']) - title = try_get(titles, lambda x: x['title']) - alt_title = try_get(titles, lambda x: x['subtitle']) + data = self._call_api(traverse_obj(manifest, ('_links', 'metadata', 'href', {str})), video_id, 'metadata') - description = try_get(preplay, lambda x: x['description'].replace('\r', '\n')) + preplay = data.get('preplay') + titles = preplay.get('titles') + title = titles.get('title') + alt_title = titles.get('subtitle') + + description = preplay.get('description') + # Use m3u8 vod duration for NRKSkoleIE because of incorrect duration in metadata duration = parse_duration(playable.get('duration')) or parse_duration(data.get('duration')) or self._extract_m3u8_vod_duration(formats[0]['url'], video_id) thumbnails = [] - for image in try_get( - preplay, lambda x: x['poster']['images'], list) or []: + for image in traverse_obj(preplay, ('poster', 'images', {list})) or []: if not isinstance(image, dict): continue image_url = url_or_none(image.get('url')) @@ -216,7 +246,7 @@ class NRKIE(NRKBaseIE): 'height': int_or_none(image.get('pixelHeight')), }) - for sub in try_get(playable, lambda x: x['subtitles'], list) or []: + for sub in 
traverse_obj(playable, ('subtitles', {list})) or []: if not isinstance(sub, dict): continue sub_url = url_or_none(sub.get('webVtt')) @@ -242,16 +272,15 @@ class NRKIE(NRKBaseIE): 'end_time': duration, 'title': 'Outro', }] if item['start_time'] != item['end_time']] - if try_get(data, lambda x: x['preplay']['indexPoints']): + if preplay.get('indexPoints'): seconds_or_none = lambda x: float_or_none(parse_duration(x)) - chapters += traverse_obj(data['preplay'], ('indexPoints', ..., { + chapters += traverse_obj(preplay, ('indexPoints', ..., { 'start_time': ('startPoint', {seconds_or_none}), 'end_time': ('endPoint', {seconds_or_none}), 'title': ('title', {lambda x: x}), })) chapters = sorted(chapters, key=lambda x: x['start_time']) if chapters else None - legal_age = try_get( - data, lambda x: x['legalAge']['body']['rating']['code'], str) + legal_age = traverse_obj(data, ('legalAge', 'body', 'rating', 'code')) # https://en.wikipedia.org/wiki/Norwegian_Media_Authority age_limit = None if legal_age: @@ -260,7 +289,7 @@ class NRKIE(NRKBaseIE): elif legal_age.isdigit(): age_limit = int_or_none(legal_age) - is_series = try_get(data, lambda x: x['_links']['series']['name']) == 'series' + is_series = traverse_obj(data, ('_links', 'series', 'name')) == 'series' info = { 'id': video_id, @@ -273,16 +302,22 @@ class NRKIE(NRKBaseIE): 'formats': formats, 'subtitles': subtitles, 'chapters': chapters, - 'timestamp': parse_iso8601(try_get(data, lambda x: x['availability']['onDemand']['from'], str)), + 'timestamp': parse_iso8601(traverse_obj(data, ('availability', 'onDemand', 'from'))), } if is_series: series = season_id = season_number = episode = episode_number = None programs = self._call_api( f'programs/{video_id}', video_id, 'programs', fatal=False) - match = re.search(r'\d+', try_get(programs, lambda x: x['firstTimeTransmitted']['publicationDate'] or x['usageRights']['availableFrom'], str) or try_get(programs, lambda x: x['usageRights']['availableFrom'], str)) - if match: - 
info.update({'timestamp': min(info['timestamp'], int(match.group()) // 1000)}) + matched_dates = [ + int(match.group()) // 1000 + for date in [ + traverse_obj(programs, ('firstTimeTransmitted', 'publicationDate')), + traverse_obj(programs, ('usageRights', 'availableFrom')), + ] if date for match in [re.search(r'\d+', date)] if match + ] + if matched_dates: + info.update({'timestamp': min(info['timestamp'], *matched_dates)}) if programs and isinstance(programs, dict): series = str_or_none(programs.get('seriesTitle')) season_id = str_or_none(programs.get('seasonId')) @@ -318,42 +353,11 @@ class NRKIE(NRKBaseIE): return info - def _perform_login(self, username, password): - try: - self._download_json( - self._LOGIN_URL, None, headers={'Content-Type': 'application/json; charset=UTF-8', 'accept': 'application/json; charset=utf-8'}, - data=json.dumps({ - 'clientId': '', - 'hashedPassword': {'current': { - 'hash': password, - 'recipe': { - 'algorithm': 'cleartext', - 'salt': '', - }, - }, - }, - 'password': password, - 'username': username, - }).encode()) - self._download_webpage('https://tv.nrk.no/auth/web/login/opsession', None) - response = self._download_json('https://tv.nrk.no/auth/session/tokenforsub/_', None) - self._AUTH_TOKEN = try_get(response, lambda x: x['session']['accessToken']) - self._API_CALL_HEADERS['authorization'] = f'Bearer {self._AUTH_TOKEN}' - except ExtractorError as e: - message = None - if isinstance(e.cause, HTTPError) and e.cause.status in (401, 400): - resp = self._parse_json( - e.cause.response.read().decode(), None, fatal=False) or {} - message = next((error['message'] for error in resp['errors'] if error['field'] == 'Password'), None) - self.report_warning(message or 'Unable to log in') - - -class NRKTVIE(InfoExtractor): +class NRKTVIE(NRKIE): IE_DESC = 'NRK TV and NRK Radio' _EPISODE_RE = r'(?P[a-zA-Z]{4}\d{8})' _VALID_URL = rf'https?://(?:tv|radio)\.nrk(?:super)?\.no/(?:[^/]+/)*{_EPISODE_RE}' - _NETRC_MACHINE = 'nrk' _TESTS = [{ 'url': 
'https://tv.nrk.no/program/MDDP12000117', 'md5': 'c4a5960f1b00b40d47db65c1064e0ab1', @@ -399,7 +403,18 @@ class NRKTVIE(InfoExtractor): 'season_id': '126936', 'season_number': 2014, 'season': 'Season 2014', - 'chapters': [{'start_time': 0.0, 'end_time': 39.0, 'title': 'Intro'}, {'start_time': 0.0, 'title': 'Velkommen', 'end_time': 152.32}, {'start_time': 152.32, 'title': 'Tannpirker', 'end_time': 304.76}, {'start_time': 304.76, 'title': 'Orgelbrus', 'end_time': 513.48}, {'start_time': 513.48, 'title': 'G-streng', 'end_time': 712.96}, {'start_time': 712.96, 'title': 'Medalje', 'end_time': 837.76}, {'start_time': 837.76, 'title': 'Globus', 'end_time': 1124.48}, {'start_time': 1124.48, 'title': 'Primstav', 'end_time': 1417.4}, {'start_time': 1417.4, 'title': 'Fyr', 'end_time': 1721.0}, {'start_time': 1721.0, 'end_time': 1741.0, 'title': 'Outro'}], + 'chapters': [ + {'start_time': 0.0, 'end_time': 39.0, 'title': 'Intro'}, + {'start_time': 0.0, 'title': 'Velkommen', 'end_time': 152.32}, + {'start_time': 152.32, 'title': 'Tannpirker', 'end_time': 304.76}, + {'start_time': 304.76, 'title': 'Orgelbrus', 'end_time': 513.48}, + {'start_time': 513.48, 'title': 'G-streng', 'end_time': 712.96}, + {'start_time': 712.96, 'title': 'Medalje', 'end_time': 837.76}, + {'start_time': 837.76, 'title': 'Globus', 'end_time': 1124.48}, + {'start_time': 1124.48, 'title': 'Primstav', 'end_time': 1417.4}, + {'start_time': 1417.4, 'title': 'Fyr', 'end_time': 1721.0}, + {'start_time': 1721.0, 'end_time': 1741.0, 'title': 'Outro'}, + ], 'episode_number': 3, 'timestamp': 1400871900, }, @@ -412,14 +427,17 @@ class NRKTVIE(InfoExtractor): 'id': 'MDFP15000514', 'ext': 'mp4', 'title': 'Kunnskapskanalen - Grunnlovsjubiléet - Stor ståhei for ingenting', - 'description': 'md5:09fd0f9cd47ba6b857836a385b88ed56', + 'description': 'md5:89290c5ccde1b3a24bb8050ab67fe1db', 'duration': 4605.08, 'series': 'Kunnskapskanalen', 'episode': 'Grunnlovsjubiléet - Stor ståhei for ingenting', 'age_limit': 0, 
'upload_date': '20140524', 'episode_number': 17, - 'chapters': [{'start_time': 0, 'end_time': 4595.0, 'title': ''}, {'start_time': 4595.0, 'end_time': 4605.08, 'title': 'Outro'}], + 'chapters': [ + {'start_time': 0, 'end_time': 4595.0, 'title': ''}, + {'start_time': 4595.0, 'end_time': 4605.08, 'title': 'Outro'}, + ], 'season': 'Season 2014', 'timestamp': 1400937600, 'thumbnail': 'https://gfx.nrk.no/D2u6-EyVUZpVCq0PdSNHRgdBZCV40ekpk6s9fZWiMtyg', @@ -436,23 +454,51 @@ class NRKTVIE(InfoExtractor): 'info_dict': { 'id': 'MSPO40010515', 'ext': 'mp4', - 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', - 'description': 'md5:c03aba1e917561eface5214020551b7a', + 'title': 'Tour de Ski - Sprint fri teknikk, kvinner og menn', + 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', 'age_limit': 0, + 'episode': 'Sprint fri teknikk, kvinner og menn', + 'series': 'Tour de Ski', + 'thumbnail': 'https://gfx.nrk.no/s9vNwGPGN-Un-UCvitD09we9HRLDxisnipA9K__d5c3Q', + 'season_id': '53512', + 'chapters': [ + {'start_time': 0, 'end_time': 6938.0, 'title': ''}, + {'start_time': 6938.0, 'end_time': 6947.52, 'title': 'Outro'}, + ], + 'season_number': 2015, + 'episode_number': 5, + 'upload_date': '20150106', + 'duration': 6947.52, + 'timestamp': 1420545563, + 'alt_title': 'Sprint fri teknikk, kvinner og menn', + 'season': 'Season 2015', }, 'params': { 'skip_download': True, }, - 'expected_warnings': ['Failed to download m3u8 information'], - 'skip': 'particular part is not supported currently', }, { 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', 'info_dict': { 'id': 'MSPO40010515', 'ext': 'mp4', - 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', - 'description': 'md5:c03aba1e917561eface5214020551b7a', + 'title': 'Tour de Ski - Sprint fri teknikk, kvinner og menn', + 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', 'age_limit': 0, + 'episode': 'Sprint fri teknikk, kvinner og menn', + 'series': 'Tour de Ski', + 'thumbnail': 
'https://gfx.nrk.no/s9vNwGPGN-Un-UCvitD09we9HRLDxisnipA9K__d5c3Q', + 'season_id': '53512', + 'chapters': [ + {'start_time': 0, 'end_time': 6938.0, 'title': ''}, + {'start_time': 6938.0, 'end_time': 6947.52, 'title': 'Outro'}, + ], + 'season_number': 2015, + 'episode_number': 5, + 'upload_date': '20150106', + 'duration': 6947.52, + 'timestamp': 1420545563, + 'alt_title': 'Sprint fri teknikk, kvinner og menn', + 'season': 'Season 2015', }, 'expected_warnings': ['Failed to download m3u8 information'], 'skip': 'Ikke tilgjengelig utenfor Norge', @@ -507,7 +553,7 @@ class NRKTVIE(InfoExtractor): f'nrk:{video_id}', ie=NRKIE.ie_key(), video_id=video_id) -class NRKTVEpisodeIE(InfoExtractor): +class NRKTVEpisodeIE(NRKIE): _VALID_URL = r'https?://tv\.nrk\.no/serie/(?P[^/]+/sesong/(?P\d+)/episode/(?P\d+))' _TESTS = [{ 'url': 'https://tv.nrk.no/serie/hellums-kro/sesong/1/episode/2', @@ -528,7 +574,11 @@ class NRKTVEpisodeIE(InfoExtractor): 'alt_title': '2. Kro, krig og kjærlighet', 'season': 'Season 1', 'season_id': '124163', - 'chapters': [{'start_time': 0, 'end_time': 29.0, 'title': ''}, {'start_time': 29.0, 'end_time': 50.0, 'title': 'Intro'}, {'start_time': 1530.0, 'end_time': 1563.92, 'title': 'Outro'}], + 'chapters': [ + {'start_time': 0, 'end_time': 29.0, 'title': ''}, + {'start_time': 29.0, 'end_time': 50.0, 'title': 'Intro'}, + {'start_time': 1530.0, 'end_time': 1563.92, 'title': 'Outro'}, + ], }, 'params': { 'skip_download': True, @@ -554,33 +604,24 @@ class NRKTVEpisodeIE(InfoExtractor): }] def _real_extract(self, url): - display_id, season_number, episode_number = self._match_valid_url(url).groups() + # HEADRequest(url) only works if a regular GET request was recently made by anyone for the specific URL being requested. + response = self._request_webpage(url, None, expected_status=True) - webpage = self._download_webpage(url, display_id) + nrk_id = self._match_id(url) - info = self._parse_json(self._search_regex(r'