Compare commits

..

5 Commits

Author SHA1 Message Date
doe1080
17bfaa53ed
[ie/onsen] Add extractor (#10971)
Closes #10902
Authored by: doe1080
2025-09-11 22:51:31 +00:00
doe1080
8cb037c0b0
[ie/smotrim] Rework extractors (#14200)
Closes #9372, Closes #11804, Closes #13900
Authored by: doe1080, swayll

Co-authored-by: Nikolay Fedorov <40500428+swayll@users.noreply.github.com>
2025-09-11 20:59:54 +00:00
doe1080
7d9e48b22a
[ie/tunein] Fix extractors (#13981)
Authored by: doe1080
2025-09-11 20:42:01 +00:00
sepro
f5cb721185
[ie/loco] Fix extractor (#14256)
Closes #14255
Authored by: seproDev
2025-09-11 21:32:35 +02:00
bashonly
83b8409366
[ci] Test with Python 3.14 (#13468)
Authored by: bashonly
2025-09-11 15:35:55 +00:00
12 changed files with 823 additions and 542 deletions

View File

@ -37,17 +37,21 @@ jobs:
matrix: matrix:
os: [ubuntu-latest] os: [ubuntu-latest]
# CPython 3.9 is in quick-test # CPython 3.9 is in quick-test
python-version: ['3.10', '3.11', '3.12', '3.13', pypy-3.11] python-version: ['3.10', '3.11', '3.12', '3.13', '3.14-dev', pypy-3.11]
include: include:
# at least one of each CPython/PyPy tests must be in windows # at least one of each CPython/PyPy tests must be in windows
- os: windows-latest - os: windows-latest
python-version: '3.9' python-version: '3.9'
- os: windows-latest - os: windows-latest
python-version: '3.10' python-version: '3.10'
- os: windows-latest
python-version: '3.11'
- os: windows-latest - os: windows-latest
python-version: '3.12' python-version: '3.12'
- os: windows-latest - os: windows-latest
python-version: '3.13' python-version: '3.13'
- os: windows-latest
python-version: '3.14-dev'
- os: windows-latest - os: windows-latest
python-version: pypy-3.11 python-version: pypy-3.11
steps: steps:

View File

@ -28,7 +28,7 @@ jobs:
fail-fast: true fail-fast: true
matrix: matrix:
os: [ubuntu-latest] os: [ubuntu-latest]
python-version: ['3.10', '3.11', '3.12', '3.13', pypy-3.11] python-version: ['3.10', '3.11', '3.12', '3.13', '3.14-dev', pypy-3.11]
include: include:
# at least one of each CPython/PyPy tests must be in windows # at least one of each CPython/PyPy tests must be in windows
- os: windows-latest - os: windows-latest

View File

@ -25,7 +25,7 @@ jobs:
fail-fast: false fail-fast: false
matrix: matrix:
os: [ubuntu-latest, windows-latest] os: [ubuntu-latest, windows-latest]
python-version: ['3.9', '3.10', '3.11', '3.12', '3.13', pypy-3.11] python-version: ['3.9', '3.10', '3.11', '3.12', '3.13', '3.14-dev', pypy-3.11]
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }} - name: Set up Python ${{ matrix.python-version }}

View File

@ -35,6 +35,7 @@ classifiers = [
"Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13", "Programming Language :: Python :: 3.13",
"Programming Language :: Python :: 3.14",
"Programming Language :: Python :: Implementation", "Programming Language :: Python :: Implementation",
"Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Implementation :: PyPy", "Programming Language :: Python :: Implementation :: PyPy",
@ -173,7 +174,8 @@ python = [
"3.11", "3.11",
"3.12", "3.12",
"3.13", "3.13",
"pypy3.10", "3.14",
"pypy3.11",
] ]
[tool.ruff] [tool.ruff]

View File

@ -16,7 +16,7 @@ remove-unused-variables = true
[tox:tox] [tox:tox]
skipsdist = true skipsdist = true
envlist = py{39,310,311,312,313},pypy311 envlist = py{39,310,311,312,313,314},pypy311
skip_missing_interpreters = true skip_missing_interpreters = true
[testenv] # tox [testenv] # tox

View File

@ -1433,6 +1433,7 @@ from .onet import (
OnetPlIE, OnetPlIE,
) )
from .onionstudios import OnionStudiosIE from .onionstudios import OnionStudiosIE
from .onsen import OnsenIE
from .opencast import ( from .opencast import (
OpencastIE, OpencastIE,
OpencastPlaylistIE, OpencastPlaylistIE,
@ -1779,7 +1780,6 @@ from .rutube import (
RutubePlaylistIE, RutubePlaylistIE,
RutubeTagsIE, RutubeTagsIE,
) )
from .rutv import RUTVIE
from .ruutu import RuutuIE from .ruutu import RuutuIE
from .ruv import ( from .ruv import (
RuvIE, RuvIE,
@ -1877,7 +1877,12 @@ from .skynewsau import SkyNewsAUIE
from .slideshare import SlideshareIE from .slideshare import SlideshareIE
from .slideslive import SlidesLiveIE from .slideslive import SlidesLiveIE
from .slutload import SlutloadIE from .slutload import SlutloadIE
from .smotrim import SmotrimIE from .smotrim import (
SmotrimAudioIE,
SmotrimIE,
SmotrimLiveIE,
SmotrimPlaylistIE,
)
from .snapchat import SnapchatSpotlightIE from .snapchat import SnapchatSpotlightIE
from .snotr import SnotrIE from .snotr import SnotrIE
from .softwhiteunderbelly import SoftWhiteUnderbellyIE from .softwhiteunderbelly import SoftWhiteUnderbellyIE
@ -2149,6 +2154,7 @@ from .tubitv import (
) )
from .tumblr import TumblrIE from .tumblr import TumblrIE
from .tunein import ( from .tunein import (
TuneInEmbedIE,
TuneInPodcastEpisodeIE, TuneInPodcastEpisodeIE,
TuneInPodcastIE, TuneInPodcastIE,
TuneInShortenerIE, TuneInShortenerIE,
@ -2283,7 +2289,6 @@ from .utreon import UtreonIE
from .varzesh3 import Varzesh3IE from .varzesh3 import Varzesh3IE
from .vbox7 import Vbox7IE from .vbox7 import Vbox7IE
from .veo import VeoIE from .veo import VeoIE
from .vesti import VestiIE
from .vevo import ( from .vevo import (
VevoIE, VevoIE,
VevoPlaylistIE, VevoPlaylistIE,

View File

@ -37,7 +37,7 @@ class LocoIE(InfoExtractor):
}, },
}, { }, {
'url': 'https://loco.com/stream/c64916eb-10fb-46a9-9a19-8c4b7ed064e7', 'url': 'https://loco.com/stream/c64916eb-10fb-46a9-9a19-8c4b7ed064e7',
'md5': '45ebc8a47ee1c2240178757caf8881b5', 'md5': '8b9bda03eba4d066928ae8d71f19befb',
'info_dict': { 'info_dict': {
'id': 'c64916eb-10fb-46a9-9a19-8c4b7ed064e7', 'id': 'c64916eb-10fb-46a9-9a19-8c4b7ed064e7',
'ext': 'mp4', 'ext': 'mp4',
@ -55,9 +55,9 @@ class LocoIE(InfoExtractor):
'tags': ['Gameplay'], 'tags': ['Gameplay'],
'series': 'GTA 5', 'series': 'GTA 5',
'timestamp': 1740612872, 'timestamp': 1740612872,
'modified_timestamp': 1740613037, 'modified_timestamp': 1750948439,
'upload_date': '20250226', 'upload_date': '20250226',
'modified_date': '20250226', 'modified_date': '20250626',
}, },
}, { }, {
# Requires video authorization # Requires video authorization
@ -123,8 +123,8 @@ class LocoIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
video_type, video_id = self._match_valid_url(url).group('type', 'id') video_type, video_id = self._match_valid_url(url).group('type', 'id')
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
stream = traverse_obj(self._search_nextjs_data(webpage, video_id), ( stream = traverse_obj(self._search_nextjs_v13_data(webpage, video_id), (
'props', 'pageProps', ('liveStreamData', 'stream', 'liveStream'), {dict}, any, {require('stream info')})) ..., (None, 'ssrData'), ('liveStreamData', 'stream', 'liveStream'), {dict}, any, {require('stream info')}))
if access_token := self._get_access_token(video_id): if access_token := self._get_access_token(video_id):
self._request_webpage( self._request_webpage(

151
yt_dlp/extractor/onsen.py Normal file
View File

@ -0,0 +1,151 @@
import base64
import json
from .common import InfoExtractor
from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
clean_html,
int_or_none,
parse_qs,
str_or_none,
strftime_or_none,
update_url,
update_url_query,
url_or_none,
)
from ..utils.traversal import traverse_obj
class OnsenIE(InfoExtractor):
    """Extractor for program pages on onsen.ag.

    A program URL without a ``c`` query parameter is returned as a playlist
    with one entry per item of the program's ``contents``. With ``c`` (the
    URL-safe base64 encoding of a content ID) a single item is extracted.
    """

    IE_NAME = 'onsen'
    IE_DESC = 'インターネットラジオステーション<音泉>'

    _BASE_URL = 'https://www.onsen.ag'
    # Sent both when fetching the HLS manifest and as the download headers
    _HEADERS = {'Referer': f'{_BASE_URL}/'}
    _NETRC_MACHINE = 'onsen'
    _VALID_URL = r'https?://(?:(?:share|www)\.)onsen\.ag/program/(?P<id>[^/?#]+)'
    _TESTS = [{
        'url': 'https://share.onsen.ag/program/onsenking?p=90&c=MTA0NjI',
        'info_dict': {
            'id': '10462',
            'ext': 'm4a',
            'title': '第SP回',
            'cast': 'count:3',
            'description': 'md5:de62c80a41c4c8d84da53a1ee681ad18',
            'display_id': 'MTA0NjI=',
            'media_type': 'sound',
            'section_start': 0,
            'series': '音泉キング「下野紘」のラジオ きみはもちろん、<音泉>ファミリーだよね?',
            'series_id': 'onsenking',
            'tags': 'count:2',
            'thumbnail': r're:https?://d3bzklg4lms4gh\.cloudfront\.net/program_info/image/default/production/.+',
            'upload_date': '20220627',
            'webpage_url': 'https://www.onsen.ag/program/onsenking?c=MTA0NjI=',
        },
    }, {
        'url': 'https://share.onsen.ag/program/girls-band-cry-radio?p=370&c=MTgwMDE',
        'info_dict': {
            'id': '18001',
            'ext': 'mp4',
            'title': '第4回',
            'cast': 'count:5',
            'description': 'md5:bbca8a389d99c90cbbce8f383c85fedd',
            'display_id': 'MTgwMDE=',
            'media_type': 'movie',
            'section_start': 0,
            'series': 'TVアニメ『ガールズバンドクライ』WEBラジオ「ガールズバンドクライラジオにも全部ぶち込め。',
            'series_id': 'girls-band-cry-radio',
            'tags': 'count:3',
            'thumbnail': r're:https?://d3bzklg4lms4gh\.cloudfront\.net/program_info/image/default/production/.+',
            'upload_date': '20240425',
            'webpage_url': 'https://www.onsen.ag/program/girls-band-cry-radio?c=MTgwMDE=',
        },
        'skip': 'Only available for premium supporters',
    }, {
        'url': 'https://www.onsen.ag/program/uma',
        'info_dict': {
            'id': 'uma',
            'title': 'UMA YELL RADIO',
        },
        'playlist_mincount': 35,
    }]

    @staticmethod
    def _get_encoded_id(program):
        """Return the URL-safe base64 encoding of ``program['id']`` as text."""
        return base64.urlsafe_b64encode(str(program['id']).encode()).decode()

    def _perform_login(self, username, password):
        """Sign in through the web API.

        Raises an expected ExtractorError when the response carries an
        ``error`` field.
        """
        # A 401 response is accepted so that its JSON body can be inspected
        # below instead of the request failing outright.
        sign_in = self._download_json(
            f'{self._BASE_URL}/web_api/signin', None, 'Logging in', headers={
                'Accept': 'application/json',
                'Content-Type': 'application/json',
            }, data=json.dumps({
                'session': {
                    'email': username,
                    'password': password,
                },
            }).encode(), expected_status=401)

        if sign_in.get('error'):
            raise ExtractorError('Invalid username or password', expected=True)

    def _real_extract(self, url):
        """Return a playlist of the program's contents, or a single item when ``c`` is given.

        Raises an expected ExtractorError for an unknown program (HTTP 404),
        for a ``c`` value that does not decode to an integer ID, and for a
        content ID that is not listed in the program.
        """
        program_id = self._match_id(url)
        try:
            programs = self._download_json(
                f'{self._BASE_URL}/web_api/programs/{program_id}', program_id)
        except ExtractorError as e:
            if isinstance(e.cause, HTTPError) and e.cause.status == 404:
                raise ExtractorError('Invalid URL', expected=True)
            raise

        # Keep only the last value of each query parameter
        query = {k: v[-1] for k, v in parse_qs(url).items() if v}
        if 'c' not in query:
            entries = [
                self.url_result(update_url_query(url, {'c': self._get_encoded_id(program)}), OnsenIE)
                for program in traverse_obj(programs, ('contents', lambda _, v: v['id']))
            ]

            return self.playlist_result(
                entries, program_id, traverse_obj(programs, ('program_info', 'title', {clean_html})))

        try:
            # Share URLs may carry the ID without base64 padding (see the
            # first test), so padding is appended before decoding.
            raw_id = base64.urlsafe_b64decode(f'{query["c"]}===').decode()
            content_id = int(raw_id)
        except ValueError:
            # binascii.Error (bad base64 length) and UnicodeDecodeError are
            # both ValueError subclasses; report them as a bad URL rather
            # than letting a raw traceback escape.
            raise ExtractorError('Invalid URL', expected=True)

        p_keys = ('contents', lambda _, v: v['id'] == content_id)

        program = traverse_obj(programs, (*p_keys, any))
        if not program:
            raise ExtractorError(
                'This program is no longer available', expected=True)
        m3u8_url = traverse_obj(program, ('streaming_url', {url_or_none}))
        if not m3u8_url:
            self.raise_login_required(
                'This program is only available for premium supporters')

        display_id = self._get_encoded_id(program)
        # Six digits following the program ID in the streaming URL;
        # presumably a YYMMDD date, since '20' is prefixed below - TODO confirm
        date_str = self._search_regex(
            rf'{program_id}0?(\d{{6}})', m3u8_url, 'date string', default=None)

        return {
            'display_id': display_id,
            'formats': self._extract_m3u8_formats(m3u8_url, raw_id, headers=self._HEADERS),
            'http_headers': self._HEADERS,
            'section_start': int_or_none(query.get('t', 0)),
            'upload_date': strftime_or_none(f'20{date_str}'),
            'webpage_url': f'{self._BASE_URL}/program/{program_id}?c={display_id}',
            **traverse_obj(program, {
                'id': ('id', {int}, {str_or_none}),
                'title': ('title', {clean_html}),
                'media_type': ('media_type', {str}),
                'thumbnail': ('poster_image_url', {url_or_none}, {update_url(query=None)}),
            }),
            **traverse_obj(programs, {
                # Program-level performers followed by this item's guests
                'cast': (('performers', (*p_keys, 'guests')), ..., 'name', {str}, filter),
                'series_id': ('directory_name', {str}),
            }),
            **traverse_obj(programs, ('program_info', {
                'description': ('description', {clean_html}, filter),
                'series': ('title', {clean_html}),
                'tags': ('hashtag_list', ..., {str}, filter),
            })),
        }

View File

@ -1,191 +0,0 @@
import re
from .common import InfoExtractor
from ..utils import ExtractorError, int_or_none, str_to_int
class RUTVIE(InfoExtractor):
    # Extractor for the player.rutv.ru / player.vgtrk.com embedded player
    IE_DESC = 'RUTV.RU'

    # Verbose-mode pattern: whitespace inside it is not significant
    _VALID_URL = r'''(?x)
        https?://
        (?:test)?player\.(?:rutv\.ru|vgtrk\.com)/
        (?P<path>
        flash\d+v/container\.swf\?id=|
        iframe/(?P<type>swf|video|live)/id/|
        index/iframe/cast_id/
        )
        (?P<id>\d+)
        '''
    _EMBED_REGEX = [
        r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:test)?player\.(?:rutv\.ru|vgtrk\.com)/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1',
        r'<meta[^>]+?property=(["\'])og:video\1[^>]+?content=(["\'])(?P<url>https?://(?:test)?player\.(?:rutv\.ru|vgtrk\.com)/flash\d+v/container\.swf\?id=.+?\2)',
    ]
    _TESTS = [{
        'url': 'http://player.rutv.ru/flash2v/container.swf?id=774471&sid=kultura&fbv=true&isPlay=true&ssl=false&i=560&acc_video_id=episode_id/972347/video_id/978186/brand_id/31724',
        'info_dict': {
            'id': '774471',
            'ext': 'mp4',
            'title': 'Монологи на все времена. Концерт',
            'description': 'md5:18d8b5e6a41fb1faa53819471852d5d5',
            'duration': 2906,
            'thumbnail': r're:https?://cdn-st2\.smotrim\.ru/.+\.jpg',
        },
        'params': {'skip_download': 'm3u8'},
    }, {
        'url': 'https://player.vgtrk.com/flash2v/container.swf?id=774016&sid=russiatv&fbv=true&isPlay=true&ssl=false&i=560&acc_video_id=episode_id/972098/video_id/977760/brand_id/57638',
        'info_dict': {
            'id': '774016',
            'ext': 'mp4',
            'title': 'Чужой в семье Сталина',
            'description': '',
            'duration': 2539,
        },
        'skip': 'Invalid URL',
    }, {
        'url': 'http://player.rutv.ru/iframe/swf/id/766888/sid/hitech/?acc_video_id=4000',
        'info_dict': {
            'id': '766888',
            'ext': 'mp4',
            'title': 'Вести.net: интернет-гиганты начали перетягивание программных "одеял"',
            'description': 'md5:65ddd47f9830c4f42ed6475f8730c995',
            'duration': 279,
            'thumbnail': r're:https?://cdn-st2\.smotrim\.ru/.+\.jpg',
        },
        'params': {'skip_download': 'm3u8'},
    }, {
        'url': 'http://player.rutv.ru/iframe/video/id/771852/start_zoom/true/showZoomBtn/false/sid/russiatv/?acc_video_id=episode_id/970443/video_id/975648/brand_id/5169',
        'info_dict': {
            'id': '771852',
            'ext': 'mp4',
            'title': 'Прямой эфир. Жертвы загадочной болезни: смерть от старости в 17 лет',
            'description': 'md5:b81c8c55247a4bd996b43ce17395b2d8',
            'duration': 3096,
            'thumbnail': r're:https?://cdn-st2\.smotrim\.ru/.+\.jpg',
        },
        'params': {'skip_download': 'm3u8'},
    }, {
        'url': 'http://player.rutv.ru/iframe/live/id/51499/showZoomBtn/false/isPlay/true/sid/sochi2014',
        'info_dict': {
            'id': '51499',
            'ext': 'flv',
            'title': 'Сочи-2014. Биатлон. Индивидуальная гонка. Мужчины ',
            'description': 'md5:9e0ed5c9d2fa1efbfdfed90c9a6d179c',
        },
        'skip': 'Invalid URL',
    }, {
        'url': 'http://player.rutv.ru/iframe/live/id/21/showZoomBtn/false/isPlay/true/',
        'info_dict': {
            'id': '21',
            'ext': 'mp4',
            'title': str,
            'is_live': True,
        },
        'skip': 'Invalid URL',
    }, {
        'url': 'https://testplayer.vgtrk.com/iframe/live/id/19201/showZoomBtn/false/isPlay/true/',
        'only_matching': True,
    }]
    _WEBPAGE_TESTS = [{
        'url': 'http://istoriya-teatra.ru/news/item/f00/s05/n0000545/index.shtml',
        'info_dict': {
            'id': '1952012',
            'ext': 'mp4',
            'title': 'Новости культуры. Эфир от 10.10.2019 (23:30). Театр Сатиры отмечает день рождения премьерой',
            'description': 'md5:fced27112ff01ff8fc4a452fc088bad6',
            'duration': 191,
            'thumbnail': r're:https?://cdn-st2\.smotrim\.ru/.+\.jpg',
        },
        'params': {'skip_download': 'm3u8'},
    }]

    def _real_extract(self, url):
        """Fetch the player JSON for the matched ID and build the info dict from its first media entry."""
        match = self._match_valid_url(url)
        video_id = match.group('id')
        video_path = match.group('path')

        # Map the URL flavour onto the two API kinds: 'video' or 'live'
        if re.match(r'flash\d+v', video_path):
            video_type = 'video'
        elif video_path.startswith('iframe'):
            video_type = match.group('type')
            if video_type == 'swf':
                video_type = 'video'
        elif video_path.startswith('index/iframe/cast_id'):
            video_type = 'live'

        is_live = video_type == 'live'

        json_data = self._download_json(
            'http://player.vgtrk.com/iframe/data{}/id/{}'.format('live' if is_live else 'video', video_id),
            video_id, 'Downloading JSON')

        if json_data['errors']:
            raise ExtractorError('{} said: {}'.format(self.IE_NAME, json_data['errors']), expected=True)

        playlist = json_data['data']['playlist']
        media = playlist['medialist'][0]

        if media['errors']:
            raise ExtractorError('{} said: {}'.format(self.IE_NAME, media['errors']), expected=True)

        view_count = int_or_none(playlist.get('count_views'))
        priority_transport = playlist['priority_transport']

        thumbnail = media['picture']
        width = int_or_none(media['width'])
        height = int_or_none(media['height'])
        description = media['anons']
        title = media['title']
        duration = int_or_none(media.get('duration'))

        formats, subtitles = [], {}
        for transport, links in media['sources'].items():
            # Formats delivered over the playlist's priority transport rank higher
            preference = -1 if priority_transport == transport else -2
            for quality, media_url in links.items():
                if transport == 'm3u8':
                    hls_formats, hls_subs = self._extract_m3u8_formats_and_subtitles(
                        media_url, video_id, 'mp4', quality=preference, m3u8_id='hls')
                    formats.extend(hls_formats)
                    self._merge_subtitles(hls_subs, target=subtitles)
                    continue

                if transport == 'rtmp':
                    rtmp_match = re.search(
                        r'^(?P<url>rtmp://[^/]+/(?P<app>.+))/(?P<playpath>.+)$', media_url)
                    if not rtmp_match:
                        continue
                    fmt = {
                        'url': rtmp_match.group('url'),
                        'play_path': rtmp_match.group('playpath'),
                        'app': rtmp_match.group('app'),
                        'page_url': 'http://player.rutv.ru',
                        'player_url': 'http://player.rutv.ru/flash3v/osmf.swf?i=22',
                        'rtmp_live': True,
                        'ext': 'flv',
                        'vbr': str_to_int(quality),
                    }
                else:
                    fmt = {
                        'url': media_url,
                    }

                formats.append({
                    **fmt,
                    'width': int_or_none(quality, default=height, invscale=width, scale=height),
                    'height': int_or_none(quality, default=height),
                    'format_id': f'{transport}-{quality}',
                    'source_preference': preference,
                })

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'view_count': view_count,
            'duration': duration,
            'formats': formats,
            'subtitles': subtitles,
            'is_live': is_live,
            '_format_sort_fields': ('source', ),
        }

View File

@ -1,65 +1,403 @@
import functools
import json
import re
import urllib.parse
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ExtractorError from ..utils import (
OnDemandPagedList,
clean_html,
determine_ext,
extract_attributes,
int_or_none,
parse_iso8601,
str_or_none,
unescapeHTML,
url_or_none,
urljoin,
)
from ..utils.traversal import (
find_element,
find_elements,
require,
traverse_obj,
)
class SmotrimIE(InfoExtractor): class SmotrimBaseIE(InfoExtractor):
_VALID_URL = r'https?://smotrim\.ru/(?P<type>brand|video|article|live)/(?P<id>[0-9]+)' _BASE_URL = 'https://smotrim.ru'
_TESTS = [{ # video _GEO_BYPASS = False
_GEO_COUNTRIES = ['RU']
def _extract_from_smotrim_api(self, typ, item_id):
    """Query the player API for ``item_id`` and build the info dict.

    ``typ`` selects the API path and the kind of result: 'audio' and
    'audio-live' yield a single direct URL, anything else yields HLS
    formats. Metadata is completed from the item's share page.
    """
    path = f'data{typ.replace("-", "")}/{"uid" if typ == "live" else "id"}'
    api_response = self._download_json(
        f'https://player.smotrim.ru/iframe/{path}/{item_id}/sid/smotrim', item_id)
    # Only the last entry of the media list is used
    entry = traverse_obj(api_response, ('data', 'playlist', 'medialist', -1, {dict}))

    if traverse_obj(entry, ('locked', {bool})):
        self.raise_login_required()

    error_msg = traverse_obj(entry, ('errors', {clean_html}))
    if error_msg:
        self.raise_geo_restricted(error_msg, countries=self._GEO_COUNTRIES)

    webpage_url = traverse_obj(api_response, ('data', 'template', 'share_url', {url_or_none}))
    webpage = self._download_webpage(webpage_url, item_id)

    # Fields shared by every result kind
    shared = {
        'thumbnail': self._html_search_meta(['og:image', 'twitter:image'], webpage, default=None),
        **traverse_obj(entry, {
            'id': ('id', {str_or_none}),
            'title': (('episodeTitle', 'title'), {clean_html}, filter, any),
            'channel_id': ('channelId', {str_or_none}),
            'description': ('anons', {clean_html}, filter),
            'season': ('season', {clean_html}, filter),
            'series': (('brand_title', 'brandTitle'), {clean_html}, filter, any),
            'series_id': ('brand_id', {str_or_none}),
        }),
    }

    if typ == 'audio':
        bookmark = self._search_json(
            r'class="bookmark"[^>]+value\s*=\s*"', webpage,
            'bookmark', item_id, default={}, transform_source=unescapeHTML)

        # Later mappings win: bookmark values override the API ones
        metadata = {
            'vcodec': 'none',
            **shared,
            **traverse_obj(entry, {
                'ext': ('audio_url', {determine_ext(default_ext='mp3')}),
                'duration': ('duration', {int_or_none}),
                'url': ('audio_url', {url_or_none}),
            }),
            **traverse_obj(bookmark, {
                'title': ('subtitle', {clean_html}),
                'timestamp': ('published', {parse_iso8601}),
            }),
        }
    elif typ == 'audio-live':
        metadata = {
            'ext': 'mp3',
            'url': traverse_obj(entry, ('source', 'auto', {url_or_none})),
            'vcodec': 'none',
            **shared,
        }
    else:
        formats, subtitles = [], {}
        hls_urls = traverse_obj(entry, (
            'sources', 'm3u8', {dict.values}, ..., {url_or_none},
        ))
        for m3u8_url in hls_urls:
            fmts, subs = self._extract_m3u8_formats_and_subtitles(
                m3u8_url, item_id, 'mp4', m3u8_id='hls', fatal=False)
            formats.extend(fmts)
            self._merge_subtitles(subs, target=subtitles)

        metadata = {
            'formats': formats,
            'subtitles': subtitles,
            **self._search_json_ld(webpage, item_id),
            **shared,
        }

    info = {
        'age_limit': traverse_obj(api_response, ('data', 'age_restrictions', {int_or_none})),
        'is_live': typ in ('audio-live', 'live'),
        'tags': traverse_obj(webpage, (
            {find_elements(cls='tags-list__link')}, ..., {clean_html}, filter, all, filter)),
        'webpage_url': webpage_url,
    }
    # Kind-specific metadata takes precedence over the generic fields above
    info.update(metadata)
    return info
class SmotrimIE(SmotrimBaseIE):
IE_NAME = 'smotrim'
_VALID_URL = r'(?:https?:)?//(?:(?:player|www)\.)?smotrim\.ru(?:/iframe)?/video(?:/id)?/(?P<id>\d+)'
_EMBED_REGEX = [fr'<iframe\b[^>]+\bsrc=["\'](?P<url>{_VALID_URL})']
_TESTS = [{
'url': 'https://smotrim.ru/video/1539617', 'url': 'https://smotrim.ru/video/1539617',
'md5': 'b1923a533c8cab09679789d720d0b1c5',
'info_dict': { 'info_dict': {
'id': '1539617', 'id': '1539617',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Полиглот. Китайский с нуля за 16 часов! Урок №16', 'title': 'Урок №16',
'description': '', 'duration': 2631,
'series': 'Полиглот. Китайский с нуля за 16 часов!',
'series_id': '60562',
'tags': 'mincount:6',
'thumbnail': r're:https?://cdn-st\d+\.smotrim\.ru/.+\.(?:jpg|png)',
'timestamp': 1466771100,
'upload_date': '20160624',
'view_count': int,
}, },
'add_ie': ['RUTV'], }, {
}, { # article (geo-restricted? plays fine from the US and JP) 'url': 'https://player.smotrim.ru/iframe/video/id/2988590',
'info_dict': {
'id': '2988590',
'ext': 'mp4',
'title': 'Трейлер',
'age_limit': 16,
'description': 'md5:6af7e68ecf4ed7b8ff6720d20c4da47b',
'duration': 30,
'series': 'Мы в разводе',
'series_id': '71624',
'tags': 'mincount:5',
'thumbnail': r're:https?://cdn-st\d+\.smotrim\.ru/.+\.(?:jpg|png)',
'timestamp': 1750670040,
'upload_date': '20250623',
'view_count': int,
'webpage_url': 'https://smotrim.ru/video/2988590',
},
}]
_WEBPAGE_TESTS = [{
'url': 'https://smotrim.ru/article/2813445', 'url': 'https://smotrim.ru/article/2813445',
'md5': 'e0ac453952afbc6a2742e850b4dc8e77',
'info_dict': { 'info_dict': {
'id': '2431846', 'id': '2431846',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Новости культуры. Съёмки первой программы "Большие и маленькие"', 'title': 'Съёмки первой программы "Большие и маленькие"',
'description': 'md5:94a4a22472da4252bf5587a4ee441b99', 'description': 'md5:446c9a5d334b995152a813946353f447',
'duration': 240,
'series': 'Новости культуры',
'series_id': '19725',
'tags': 'mincount:6',
'thumbnail': r're:https?://cdn-st\d+\.smotrim\.ru/.+\.(?:jpg|png)',
'timestamp': 1656054443,
'upload_date': '20220624',
'view_count': int,
'webpage_url': 'https://smotrim.ru/video/2431846',
}, },
'add_ie': ['RUTV'], }, {
}, { # brand, redirect 'url': 'https://www.vesti.ru/article/4642878',
'url': 'https://smotrim.ru/brand/64356',
'md5': '740472999ccff81d7f6df79cecd91c18',
'info_dict': { 'info_dict': {
'id': '2354523', 'id': '3007209',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Большие и маленькие. Лучшее. 4-й выпуск', 'title': 'Иностранные мессенджеры используют не только мошенники, но и вербовщики',
'description': 'md5:84089e834429008371ea41ea3507b989', 'description': 'md5:74ab625a0a89b87b2e0ed98d6391b182',
'duration': 265,
'series': 'Вести. Дежурная часть',
'series_id': '5204',
'tags': 'mincount:6',
'thumbnail': r're:https?://cdn-st\d+\.smotrim\.ru/.+\.(?:jpg|png)',
'timestamp': 1754756280,
'upload_date': '20250809',
'view_count': int,
'webpage_url': 'https://smotrim.ru/video/3007209',
}, },
'add_ie': ['RUTV'],
}, { # live
'url': 'https://smotrim.ru/live/19201',
'info_dict': {
'id': '19201',
'ext': 'mp4',
# this looks like a TV channel name
'title': 'Россия Культура. Прямой эфир',
'description': '',
},
'add_ie': ['RUTV'],
}] }]
def _real_extract(self, url): def _real_extract(self, url):
video_id, typ = self._match_valid_url(url).group('id', 'type') video_id = self._match_id(url)
rutv_type = 'video'
if typ not in ('video', 'live'):
webpage = self._download_webpage(url, video_id, f'Resolving {typ} link')
# there are two cases matching regex:
# 1. "embedUrl" in JSON LD (/brand/)
# 2. "src" attribute from iframe (/article/)
video_id = self._search_regex(
r'"https://player.smotrim.ru/iframe/video/id/(?P<video_id>\d+)/',
webpage, 'video_id', default=None)
if not video_id:
raise ExtractorError('There are no video in this page.', expected=True)
elif typ == 'live':
rutv_type = 'live'
return self.url_result(f'https://player.vgtrk.com/iframe/{rutv_type}/id/{video_id}') return self._extract_from_smotrim_api('video', video_id)
class SmotrimAudioIE(SmotrimBaseIE):
    # Single audio items, on the main site or in the player iframe
    IE_NAME = 'smotrim:audio'
    _VALID_URL = r'https?://(?:(?:player|www)\.)?smotrim\.ru(?:/iframe)?/audio(?:/id)?/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://smotrim.ru/audio/2573986',
        'md5': 'e28d94c20da524e242b2d00caef41a8e',
        'info_dict': {
            'id': '2573986',
            'ext': 'mp3',
            'title': 'Радиоспектакль',
            'description': 'md5:4bcaaf7d532bc78f76e478fad944e388',
            'duration': 3072,
            'series': 'Морис Леблан. Арсен Люпен, джентльмен-грабитель',
            'series_id': '66461',
            'tags': 'mincount:7',
            'thumbnail': r're:https?://cdn-st\d+\.smotrim\.ru/.+\.(?:jpg|png)',
            'timestamp': 1624884358,
            'upload_date': '20210628',
        },
    }, {
        'url': 'https://player.smotrim.ru/iframe/audio/id/2860468',
        'md5': '5a6bc1fa24c7142958be1ad9cfae58a8',
        'info_dict': {
            'id': '2860468',
            'ext': 'mp3',
            'title': 'Колобок и музыкальная игра "Терем-теремок"',
            'duration': 1501,
            'series': 'Веселый колобок',
            'series_id': '68880',
            'tags': 'mincount:4',
            'thumbnail': r're:https?://cdn-st\d+\.smotrim\.ru/.+\.(?:jpg|png)',
            'timestamp': 1755925800,
            'upload_date': '20250823',
            'webpage_url': 'https://smotrim.ru/audio/2860468',
        },
    }]

    def _real_extract(self, url):
        """Delegate to the shared API helper with the 'audio' type."""
        return self._extract_from_smotrim_api('audio', self._match_id(url))
class SmotrimLiveIE(SmotrimBaseIE):
    # Live video/audio streams and channel pages
    IE_NAME = 'smotrim:live'
    # Verbose-mode pattern: whitespace inside it is not significant
    _VALID_URL = r'''(?x:
        (?:https?:)?//
        (?:(?:(?:test)?player|www)\.)?
        (?:
        smotrim\.ru|
        vgtrk\.com
        )
        (?:/iframe)?/
        (?P<type>
        channel|
        (?:audio-)?live
        )
        (?:/u?id)?/(?P<id>[\da-f-]+)
        )'''
    _EMBED_REGEX = [fr'<iframe\b[^>]+\bsrc=["\'](?P<url>{_VALID_URL})']
    _TESTS = [{
        'url': 'https://smotrim.ru/channel/76',
        'info_dict': {
            'id': '1661',
            'ext': 'mp4',
            'title': str,
            'channel_id': '76',
            'description': 'Смотрим прямой эфир «Москва 24»',
            'display_id': '76',
            'live_status': 'is_live',
            'thumbnail': r're:https?://cdn-st\d+\.smotrim\.ru/.+\.(?:jpg|png)',
            'timestamp': int,
            'upload_date': str,
        },
        'params': {'skip_download': 'Livestream'},
    }, {
        # Radio
        'url': 'https://smotrim.ru/channel/81',
        'info_dict': {
            'id': '81',
            'ext': 'mp3',
            'title': str,
            'channel_id': '81',
            'live_status': 'is_live',
            'thumbnail': r're:https?://cdn-st\d+\.smotrim\.ru/.+\.(?:jpg|png)',
        },
        'params': {'skip_download': 'Livestream'},
    }, {
        # Sometimes geo-restricted to Russia
        'url': 'https://player.smotrim.ru/iframe/live/uid/381308c7-a066-4c4f-9656-83e2e792a7b4',
        'info_dict': {
            'id': '19201',
            'ext': 'mp4',
            'title': str,
            'channel_id': '4',
            'description': 'Смотрим прямой эфир «Россия К»',
            'display_id': '381308c7-a066-4c4f-9656-83e2e792a7b4',
            'live_status': 'is_live',
            'thumbnail': r're:https?://cdn-st\d+\.smotrim\.ru/.+\.(?:jpg|png)',
            'timestamp': int,
            'upload_date': str,
            'webpage_url': 'https://smotrim.ru/channel/4',
        },
        'params': {'skip_download': 'Livestream'},
    }, {
        'url': 'https://smotrim.ru/live/19201',
        'only_matching': True,
    }, {
        'url': 'https://player.smotrim.ru/iframe/audio-live/id/81',
        'only_matching': True,
    }, {
        'url': 'https://testplayer.vgtrk.com/iframe/live/id/19201',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Resolve the URL to a stream type and ID, then extract via the shared API helper."""
        typ, display_id = self._match_valid_url(url).group('type', 'id')

        # Purely numeric live IDs are requested first and the type is
        # re-read from the final response URL.
        if typ == 'live' and re.fullmatch(r'[0-9]+', display_id):
            url = self._request_webpage(url, display_id).url
            typ = self._match_valid_url(url).group('type')

        video_id = display_id
        if typ == 'channel':
            webpage = self._download_webpage(url, display_id)
            # Two places the player URL may come from: the video player
            # frame's src, or the JSON stored on the audio play button.
            frame_src_path = (
                {find_element(cls='main-player__frame', html=True)}, {extract_attributes}, 'src')
            audio_button_path = (
                {find_element(cls='audio-play-button', html=True)},
                {extract_attributes}, 'value', {urllib.parse.unquote}, {json.loads}, 'source')
            src_url = traverse_obj(webpage, ((
                frame_src_path,
                audio_button_path,
            ), any, {self._proto_relative_url}, {url_or_none}, {require('src URL')}))
            typ, video_id = self._match_valid_url(src_url).group('type', 'id')

        return {
            'display_id': display_id,
            **self._extract_from_smotrim_api(typ, video_id),
        }
class SmotrimPlaylistIE(SmotrimBaseIE):
    # Brand and podcast listings, optionally narrowed to one season
    IE_NAME = 'smotrim:playlist'
    _PAGE_SIZE = 15
    _VALID_URL = r'https?://smotrim\.ru/(?P<type>brand|podcast)/(?P<id>\d+)/?(?P<season>[\w-]+)?'
    _TESTS = [{
        # Video
        'url': 'https://smotrim.ru/brand/64356',
        'info_dict': {
            'id': '64356',
            'title': 'Большие и маленькие',
        },
        'playlist_mincount': 55,
    }, {
        # Video, season
        'url': 'https://smotrim.ru/brand/65293/3-sezon',
        'info_dict': {
            'id': '65293',
            'title': 'Спасская',
            'season': '3 сезон',
        },
        'playlist_count': 16,
    }, {
        # Audio
        'url': 'https://smotrim.ru/brand/68880',
        'info_dict': {
            'id': '68880',
            'title': 'Веселый колобок',
        },
        'playlist_mincount': 156,
    }, {
        # Podcast
        'url': 'https://smotrim.ru/podcast/8021',
        'info_dict': {
            'id': '8021',
            'title': 'Сила звука',
        },
        'playlist_mincount': 27,
    }]

    def _fetch_page(self, endpoint, key, playlist_id, page):
        """Yield url_result entries for one page of the listing API."""
        # The incoming page index is shifted by one for the API
        page_number = page + 1
        items = self._download_json(
            f'{self._BASE_URL}/api/{endpoint}', playlist_id,
            f'Downloading page {page_number}', query={
                key: playlist_id,
                'limit': self._PAGE_SIZE,
                'page': page_number,
            },
        )

        links = traverse_obj(items, ('contents', -1, 'list', ..., 'link', {str}))
        for link in links:
            yield self.url_result(urljoin(self._BASE_URL, link))

    def _real_extract(self, url):
        """Return a season playlist scraped from the page, or a paged playlist from the API."""
        playlist_type, playlist_id, season = self._match_valid_url(url).group('type', 'id', 'season')
        if playlist_type == 'podcast':
            key = 'rubricId'
        else:
            key = 'brandId'

        webpage = self._download_webpage(url, playlist_id)
        playlist_title = self._html_search_meta(['og:title', 'twitter:title'], webpage, default=None)

        if season:
            # Season pages: collect the /video/<id> links found on the page
            video_links = traverse_obj(webpage, (
                {find_elements(tag='a', attr='href', value=r'/video/\d+', html=True, regex=True)},
                ..., {extract_attributes}, 'href', {str},
            ))
            season_name = traverse_obj(webpage, (
                {find_element(cls='seasons__item seasons__item--selected')}, {clean_html},
            ))
            return self.playlist_from_matches(
                video_links, playlist_id, playlist_title, season=season_name,
                ie=SmotrimIE, getter=urljoin(self._BASE_URL))

        # Non-empty 'brand-main-item__videos' element selects the videos
        # endpoint; otherwise the audios endpoint is used
        has_videos = traverse_obj(webpage, (
            {find_element(cls='brand-main-item__videos')}, {clean_html}, filter,
        ))
        endpoint = 'videos' if has_videos else 'audios'

        return self.playlist_result(OnDemandPagedList(
            functools.partial(self._fetch_page, endpoint, key, playlist_id), self._PAGE_SIZE), playlist_id, playlist_title)

View File

@ -1,244 +1,335 @@
import functools
import urllib.parse import urllib.parse
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
OnDemandPagedList, OnDemandPagedList,
determine_ext, UnsupportedError,
clean_html,
int_or_none,
join_nonempty,
parse_iso8601, parse_iso8601,
traverse_obj, update_url_query,
url_or_none,
) )
from ..utils.traversal import traverse_obj
class TuneInBaseIE(InfoExtractor): class TuneInBaseIE(InfoExtractor):
_VALID_URL_BASE = r'https?://(?:www\.)?tunein\.com' def _call_api(self, item_id, endpoint=None, note='Downloading JSON metadata', fatal=False, query=None):
return self._download_json(
def _extract_metadata(self, webpage, content_id): join_nonempty('https://api.tunein.com/profiles', item_id, endpoint, delim='/'),
return self._search_json(r'window.INITIAL_STATE=', webpage, 'hydration', content_id, fatal=False) item_id, note=note, fatal=fatal, query=query) or {}
def _extract_formats_and_subtitles(self, content_id): def _extract_formats_and_subtitles(self, content_id):
streams = self._download_json( streams = self._download_json(
f'https://opml.radiotime.com/Tune.ashx?render=json&formats=mp3,aac,ogg,flash,hls&id={content_id}', 'https://opml.radiotime.com/Tune.ashx', content_id, query={
content_id)['body'] 'formats': 'mp3,aac,ogg,flash,hls',
'id': content_id,
'render': 'json',
})
formats, subtitles = [], {} formats, subtitles = [], {}
for stream in streams: for stream in traverse_obj(streams, ('body', lambda _, v: url_or_none(v['url']))):
if stream.get('media_type') == 'hls': if stream.get('media_type') == 'hls':
fmts, subs = self._extract_m3u8_formats_and_subtitles(stream['url'], content_id, fatal=False) fmts, subs = self._extract_m3u8_formats_and_subtitles(stream['url'], content_id, fatal=False)
formats.extend(fmts) formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles) self._merge_subtitles(subs, target=subtitles)
elif determine_ext(stream['url']) == 'pls':
playlist_content = self._download_webpage(stream['url'], content_id)
formats.append({
'url': self._search_regex(r'File1=(.*)', playlist_content, 'url', fatal=False),
'abr': stream.get('bitrate'),
'ext': stream.get('media_type'),
})
else: else:
formats.append({ formats.append(traverse_obj(stream, {
'url': stream['url'], 'abr': ('bitrate', {int_or_none}),
'abr': stream.get('bitrate'), 'ext': ('media_type', {str}),
'ext': stream.get('media_type'), 'url': ('url', {self._proto_relative_url}),
}) }))
return formats, subtitles return formats, subtitles
class TuneInStationIE(TuneInBaseIE): class TuneInStationIE(TuneInBaseIE):
_VALID_URL = TuneInBaseIE._VALID_URL_BASE + r'(?:/radio/[^?#]+-|/embed/player/)(?P<id>s\d+)' IE_NAME = 'tunein:station'
_EMBED_REGEX = [r'<iframe[^>]+src=["\'](?P<url>(?:https?://)?tunein\.com/embed/player/s\d+)'] _VALID_URL = r'https?://tunein\.com/radio/[^/?#]+(?P<id>s\d+)'
_TESTS = [{ _TESTS = [{
'url': 'https://tunein.com/radio/Jazz24-885-s34682/', 'url': 'https://tunein.com/radio/Jazz24-885-s34682/',
'info_dict': { 'info_dict': {
'id': 's34682', 'id': 's34682',
'title': str,
'description': 'md5:d6d0b89063fd68d529fa7058ee98619b',
'thumbnail': r're:https?://cdn-profiles\.tunein\.com/.+',
'location': 'Seattle-Tacoma, US',
'ext': 'mp3', 'ext': 'mp3',
'title': str,
'alt_title': 'World Class Jazz',
'channel_follower_count': int,
'description': 'md5:d6d0b89063fd68d529fa7058ee98619b',
'location': r're:Seattle-Tacoma, (?:US|WA)',
'live_status': 'is_live', 'live_status': 'is_live',
'thumbnail': r're:https?://.+',
}, },
'params': { 'params': {'skip_download': 'Livestream'},
'skip_download': True,
},
}, {
'url': 'https://tunein.com/embed/player/s6404/',
'only_matching': True,
}, { }, {
'url': 'https://tunein.com/radio/BBC-Radio-1-988-s24939/', 'url': 'https://tunein.com/radio/BBC-Radio-1-988-s24939/',
'info_dict': { 'info_dict': {
'id': 's24939', 'id': 's24939',
'title': str,
'description': 'md5:ee2c56794844610d045f8caf5ff34d0c',
'thumbnail': r're:https?://cdn-profiles\.tunein\.com/.+',
'location': 'London, UK',
'ext': 'm4a', 'ext': 'm4a',
'title': str,
'alt_title': 'The biggest new pop and all-day vibes',
'channel_follower_count': int,
'description': 'md5:ee2c56794844610d045f8caf5ff34d0c',
'location': 'London, UK',
'live_status': 'is_live', 'live_status': 'is_live',
'thumbnail': r're:https?://.+',
}, },
'params': { 'params': {'skip_download': 'Livestream'},
'skip_download': True, }]
def _real_extract(self, url):
    """Extract a station: formats from the stream lookup, metadata from the profile API."""
    station_id = self._match_id(url)
    formats, subtitles = self._extract_formats_and_subtitles(station_id)

    # Profile fields are read from the 'Item' object of the API response
    profile = self._call_api(station_id)
    metadata = traverse_obj(profile, ('Item', {
        'title': ('Title', {clean_html}),
        'alt_title': ('Subtitle', {clean_html}, filter),
        'channel_follower_count': ('Actions', 'Follow', 'FollowerCount', {int_or_none}),
        'description': ('Description', {clean_html}, filter),
        'is_live': ('Actions', 'Play', 'IsLive', {bool}),
        'location': ('Properties', 'Location', 'DisplayName', {str}),
        'thumbnail': ('Image', {url_or_none}),
    }))

    return {
        'id': station_id,
        'formats': formats,
        'subtitles': subtitles,
        **metadata,
    }
class TuneInPodcastIE(TuneInBaseIE):
IE_NAME = 'tunein:podcast:program'
_PAGE_SIZE = 20
_VALID_URL = r'https?://tunein\.com/podcasts(?:/[^/?#]+){1,2}(?P<id>p\d+)'
_TESTS = [{
'url': 'https://tunein.com/podcasts/Technology-Podcasts/Artificial-Intelligence-p1153019/',
'info_dict': {
'id': 'p1153019',
'title': 'Lex Fridman Podcast',
}, },
'playlist_mincount': 200,
}, {
'url': 'https://tunein.com/podcasts/World-News/BBC-News-p14/',
'info_dict': {
'id': 'p14',
'title': 'BBC News',
},
'playlist_mincount': 35,
}]
@classmethod
def suitable(cls, url):
    """Reject URLs accepted by TuneInPodcastEpisodeIE; otherwise use the inherited check."""
    if TuneInPodcastEpisodeIE.suitable(url):
        return False
    return super().suitable(url)
def _fetch_page(self, url, podcast_id, page=0):
    """Yield one ``url_result`` per item on a page of the program's contents.

    ``page`` is zero-based and is converted to an ``offset`` of
    ``page * self._PAGE_SIZE`` for the ``contents`` endpoint, which is queried
    with the ``t:free`` filter. Each entry URL is ``url`` with a ``topicId``
    query parameter set to the item's ``GuideId`` minus its first character.
    """
    response = self._call_api(
        podcast_id, 'contents', f'Downloading page {page + 1}', query={
            'filter': 't:free',
            'limit': self._PAGE_SIZE,
            'offset': page * self._PAGE_SIZE,
        })

    # _call_api defaults to fatal=False and falls back to {}, so 'Items' may be
    # missing; traverse into it instead of subscripting to avoid a KeyError.
    for guide_id in traverse_obj(response, ('Items', ..., 'GuideId', {str}, filter)):
        yield self.url_result(update_url_query(url, {'topicId': guide_id[1:]}))
def _real_extract(self, url):
    """Return the program as a playlist whose entries are fetched page by page."""
    podcast_id = self._match_id(url)
    entries = OnDemandPagedList(
        functools.partial(self._fetch_page, url, podcast_id), self._PAGE_SIZE)
    # The playlist title is the 'Title' of the program's profile item
    title = traverse_obj(self._call_api(podcast_id), ('Item', 'Title', {str}))
    return self.playlist_result(entries, podcast_id, title)
class TuneInPodcastEpisodeIE(TuneInBaseIE):
    """Extractor for a single podcast episode addressed by a ``topicId`` query parameter."""

    IE_NAME = 'tunein:podcast'
    _VALID_URL = r'https?://tunein\.com/podcasts(?:/[^/?#]+){1,2}(?P<series_id>p\d+)/?\?(?:[^#]+&)?(?i:topicid)=(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://tunein.com/podcasts/Technology-Podcasts/Artificial-Intelligence-p1153019/?topicId=236404354',
        'info_dict': {
            'id': 't236404354',
            'ext': 'mp3',
            'title': '#351 MrBeast: Future of YouTube, Twitter, TikTok, and Instagram',
            'alt_title': 'Technology Podcasts >',
            'cast': 'count:1',
            'description': 'md5:1029895354ef073ff00f20b82eb6eb71',
            'display_id': '236404354',
            'duration': 8330,
            'thumbnail': r're:https?://.+',
            'timestamp': 1673458571,
            'upload_date': '20230111',
            'series': 'Lex Fridman Podcast',
            'series_id': 'p1153019',
        },
    }, {
        'url': 'https://tunein.com/podcasts/The-BOB--TOM-Show-Free-Podcast-p20069/?topicId=174556405',
        'info_dict': {
            'id': 't174556405',
            'ext': 'mp3',
            'title': 'B&T Extra: Ohhh Yeah, It\'s Sexy Time',
            'alt_title': 'Westwood One >',
            'cast': 'count:2',
            'description': 'md5:6828234f410ab88c85655495c5fcfa88',
            'display_id': '174556405',
            'duration': 1203,
            'series': 'The BOB & TOM Show Free Podcast',
            'series_id': 'p20069',
            'thumbnail': r're:https?://.+',
            'timestamp': 1661799600,
            'upload_date': '20220829',
        },
    }]

    def _real_extract(self, url):
        """Extract one episode; the media id is the numeric topic id prefixed with ``t``."""
        mobj = self._match_valid_url(url)
        series_id = mobj.group('series_id')
        display_id = mobj.group('id')
        episode_id = f't{display_id}'

        formats, subtitles = self._extract_formats_and_subtitles(episode_id)
        # The series title comes from the program profile, the rest from the episode profile
        series_title = traverse_obj(
            self._call_api(series_id), ('Item', 'Title', {clean_html}))
        episode_meta = traverse_obj(self._call_api(episode_id), ('Item', {
            'title': ('Title', {clean_html}),
            'alt_title': ('Subtitle', {clean_html}, filter),
            # Hosts is a ';'-separated string; split it and drop empty names
            'cast': (
                'Properties', 'ParentProgram', 'Hosts', {clean_html},
                {lambda hosts: hosts.split(';')}, ..., {str.strip}, filter, all, filter),
            'description': ('Description', {clean_html}, filter),
            'duration': ('Actions', 'Play', 'Duration', {int_or_none}),
            'thumbnail': ('Image', {url_or_none}),
            'timestamp': ('Actions', 'Play', 'PublishTime', {parse_iso8601}),
        }))

        return {
            'id': episode_id,
            'display_id': display_id,
            'formats': formats,
            'series': series_title,
            'series_id': series_id,
            'subtitles': subtitles,
            **episode_meta,
        }
class TuneInEmbedIE(TuneInBaseIE):
IE_NAME = 'tunein:embed'
_VALID_URL = r'https?://tunein\.com/embed/player/(?P<id>[^/?#]+)'
_EMBED_REGEX = [r'<iframe\b[^>]+\bsrc=["\'](?P<url>(?:https?:)?//tunein\.com/embed/player/[^/?#"\']+)']
_TESTS = [{
'url': 'https://tunein.com/embed/player/s6404/',
'info_dict': {
'id': 's6404',
'ext': 'mp3',
'title': str,
'alt_title': 'South Africa\'s News and Information Leader',
'channel_follower_count': int,
'live_status': 'is_live',
'location': 'Johannesburg, South Africa',
'thumbnail': r're:https?://.+',
},
'params': {'skip_download': 'Livestream'},
}, {
'url': 'https://tunein.com/embed/player/t236404354/',
'info_dict': {
'id': 't236404354',
'ext': 'mp3',
'title': '#351 MrBeast: Future of YouTube, Twitter, TikTok, and Instagram',
'alt_title': 'Technology Podcasts >',
'cast': 'count:1',
'description': 'md5:1029895354ef073ff00f20b82eb6eb71',
'display_id': '236404354',
'duration': 8330,
'series': 'Lex Fridman Podcast',
'series_id': 'p1153019',
'thumbnail': r're:https?://.+',
'timestamp': 1673458571,
'upload_date': '20230111',
},
}, {
'url': 'https://tunein.com/embed/player/p191660/',
'info_dict': {
'id': 'p191660',
'title': 'SBS Tamil',
},
'playlist_mincount': 195,
}] }]
_WEBPAGE_TESTS = [{ _WEBPAGE_TESTS = [{
'url': 'https://www.martiniinthemorning.com/', 'url': 'https://www.martiniinthemorning.com/',
'info_dict': { 'info_dict': {
'id': 's55412', 'id': 's55412',
'ext': 'mp3', 'ext': 'mp3',
'title': 'TuneInStation video #s55412', 'title': str,
'alt_title': 'Now that\'s music!',
'channel_follower_count': int,
'description': 'md5:41588a3e2cf34b3eafc6c33522fa611a',
'live_status': 'is_live',
'location': 'US',
'thumbnail': r're:https?://.+',
}, },
'expected_warnings': ['unable to extract hydration', 'Extractor failed to obtain "title"'], 'params': {'skip_download': 'Livestream'},
}] }]
def _real_extract(self, url): def _real_extract(self, url):
station_id = self._match_id(url) embed_id = self._match_id(url)
kind = {
'p': 'program',
's': 'station',
't': 'topic',
}.get(embed_id[:1])
webpage = self._download_webpage(url, station_id) return self.url_result(
metadata = self._extract_metadata(webpage, station_id) f'https://tunein.com/{kind}/?{kind}id={embed_id[1:]}')
formats, subtitles = self._extract_formats_and_subtitles(station_id)
return {
'id': station_id,
'title': traverse_obj(metadata, ('profiles', station_id, 'title')),
'description': traverse_obj(metadata, ('profiles', station_id, 'description')),
'thumbnail': traverse_obj(metadata, ('profiles', station_id, 'image')),
'timestamp': parse_iso8601(
traverse_obj(metadata, ('profiles', station_id, 'actions', 'play', 'publishTime'))),
'location': traverse_obj(
metadata, ('profiles', station_id, 'metadata', 'properties', 'location', 'displayName'),
('profiles', station_id, 'properties', 'location', 'displayName')),
'formats': formats,
'subtitles': subtitles,
'is_live': traverse_obj(metadata, ('profiles', station_id, 'actions', 'play', 'isLive')),
}
class TuneInPodcastIE(TuneInBaseIE):
    """Playlist extractor for a podcast program page or its embed player URL."""

    _PAGE_SIZE = 30
    _VALID_URL = TuneInBaseIE._VALID_URL_BASE + r'/(?:podcasts/[^?#]+-|embed/player/)(?P<id>p\d+)/?(?:#|$)'
    _EMBED_REGEX = [r'<iframe[^>]+src=["\'](?P<url>(?:https?://)?tunein\.com/embed/player/p\d+)']
    _TESTS = [{
        'url': 'https://tunein.com/podcasts/Technology-Podcasts/Artificial-Intelligence-p1153019',
        'info_dict': {
            'id': 'p1153019',
            'title': 'Lex Fridman Podcast',
            'description': 'md5:bedc4e5f1c94f7dec6e4317b5654b00d',
        },
        'playlist_mincount': 200,
    }, {
        'url': 'https://tunein.com/embed/player/p191660/',
        'only_matching': True,
    }, {
        'url': 'https://tunein.com/podcasts/World-News/BBC-News-p14/',
        'info_dict': {
            'id': 'p14',
            'title': 'BBC News',
            'description': 'md5:30b9622bcc4bd101d4acd6f38f284aed',
        },
        'playlist_mincount': 36,
    }]

    def _real_extract(self, url):
        """Return the program as a playlist of episode URL results, fetched page by page."""
        podcast_id = self._match_id(url)
        webpage = self._download_webpage(url, podcast_id, fatal=False)
        metadata = self._extract_metadata(webpage, podcast_id)

        def fetch_episodes(page_index):
            # One API request per page; offset is derived from the zero-based page index
            contents = self._download_json(
                f'https://api.tunein.com/profiles/{podcast_id}/contents', podcast_id,
                note=f'Downloading page {page_index + 1}', query={
                    'filter': 't:free',
                    'offset': page_index * self._PAGE_SIZE,
                    'limit': self._PAGE_SIZE,
                })
            results = []
            for episode in contents['Items']:
                # GuideId minus its first character is used as the topicId
                topic_id = episode['GuideId'][1:]
                results.append(self.url_result(
                    f'https://tunein.com/podcasts/{podcast_id}?topicId={topic_id}',
                    TuneInPodcastEpisodeIE, title=episode.get('Title')))
            return results

        playlist_title = traverse_obj(metadata, ('profiles', podcast_id, 'title'))
        playlist_description = traverse_obj(metadata, ('profiles', podcast_id, 'description'))
        return self.playlist_result(
            OnDemandPagedList(fetch_episodes, self._PAGE_SIZE), playlist_id=podcast_id,
            title=playlist_title, description=playlist_description)
class TuneInPodcastEpisodeIE(TuneInBaseIE):
    """Extractor for a single podcast episode selected through the ``topicId`` query."""

    _VALID_URL = TuneInBaseIE._VALID_URL_BASE + r'/podcasts/(?:[^?&]+-)?(?P<podcast_id>p\d+)/?\?topicId=(?P<id>\w\d+)'
    _TESTS = [{
        'url': 'https://tunein.com/podcasts/Technology-Podcasts/Artificial-Intelligence-p1153019/?topicId=236404354',
        'info_dict': {
            'id': 't236404354',
            'title': '#351 MrBeast: Future of YouTube, Twitter, TikTok, and Instagram',
            'description': 'md5:2784533b98f8ac45c0820b1e4a8d8bb2',
            'thumbnail': r're:https?://cdn-profiles\.tunein\.com/.+',
            'timestamp': 1673458571,
            'upload_date': '20230111',
            'series_id': 'p1153019',
            'series': 'Lex Fridman Podcast',
            'ext': 'mp3',
        },
    }]

    def _real_extract(self, url):
        """Extract one episode using the page's hydration data and the stream lookup."""
        mobj = self._match_valid_url(url)
        podcast_id = mobj.group('podcast_id')
        topic_id = mobj.group('id')
        # The matched topic id is prefixed with 't' to form the episode id
        episode_id = f't{topic_id}'

        webpage = self._download_webpage(url, episode_id)
        metadata = self._extract_metadata(webpage, episode_id)
        formats, subtitles = self._extract_formats_and_subtitles(episode_id)

        def episode_field(*path):
            # Look up a value below this episode's entry in the 'profiles' mapping
            return traverse_obj(metadata, ('profiles', episode_id, *path))

        return {
            'id': episode_id,
            'title': episode_field('title'),
            'description': episode_field('description'),
            'thumbnail': episode_field('image'),
            'timestamp': parse_iso8601(episode_field('actions', 'play', 'publishTime')),
            'series_id': podcast_id,
            'series': traverse_obj(metadata, ('profiles', podcast_id, 'title')),
            'formats': formats,
            'subtitles': subtitles,
        }
class TuneInShortenerIE(InfoExtractor): class TuneInShortenerIE(InfoExtractor):
_WORKING = False
IE_NAME = 'tunein:shortener' IE_NAME = 'tunein:shortener'
IE_DESC = False # Do not list IE_DESC = False # Do not list
_VALID_URL = r'https?://tun\.in/(?P<id>[A-Za-z0-9]+)' _VALID_URL = r'https?://tun\.in/(?P<id>[^/?#]+)'
_TESTS = [{ _TESTS = [{
# test redirection
'url': 'http://tun.in/ser7s', 'url': 'http://tun.in/ser7s',
'info_dict': { 'info_dict': {
'id': 's34682', 'id': 's34682',
'title': str, 'title': str,
'description': 'md5:d6d0b89063fd68d529fa7058ee98619b',
'thumbnail': r're:https?://cdn-profiles\.tunein\.com/.+',
'location': 'Seattle-Tacoma, US',
'ext': 'mp3', 'ext': 'mp3',
'alt_title': 'World Class Jazz',
'channel_follower_count': int,
'description': 'md5:d6d0b89063fd68d529fa7058ee98619b',
'location': r're:Seattle-Tacoma, (?:US|WA)',
'live_status': 'is_live', 'live_status': 'is_live',
'thumbnail': r're:https?://.+',
}, },
'params': { 'params': {'skip_download': 'Livestream'},
'skip_download': True, # live stream }, {
'url': 'http://tun.in/tqeeFw',
'info_dict': {
'id': 't236404354',
'title': str,
'ext': 'mp3',
'alt_title': 'Technology Podcasts >',
'cast': 'count:1',
'description': 'md5:1029895354ef073ff00f20b82eb6eb71',
'display_id': '236404354',
'duration': 8330,
'series': 'Lex Fridman Podcast',
'series_id': 'p1153019',
'thumbnail': r're:https?://.+',
'timestamp': 1673458571,
'upload_date': '20230111',
}, },
'params': {'skip_download': 'Livestream'},
}, {
'url': 'http://tun.in/pei6i',
'info_dict': {
'id': 'p14',
'title': 'BBC News',
},
'playlist_mincount': 35,
}] }]
def _real_extract(self, url): def _real_extract(self, url):
redirect_id = self._match_id(url) redirect_id = self._match_id(url)
# The server doesn't support HEAD requests # The server doesn't support HEAD requests
urlh = self._request_webpage( urlh = self._request_webpage(url, redirect_id, 'Downloading redirect page')
url, redirect_id, note='Downloading redirect page') # Need to strip port from URL
parsed = urllib.parse.urlparse(urlh.url)
url = urlh.url new_url = parsed._replace(netloc=parsed.hostname).geturl()
url_parsed = urllib.parse.urlparse(url) # Prevent infinite loop in case redirect fails
if url_parsed.port == 443: if self.suitable(new_url):
url = url_parsed._replace(netloc=url_parsed.hostname).url raise UnsupportedError(new_url)
return self.url_result(new_url)
self.to_screen(f'Following redirect: {url}')
return self.url_result(url)

View File

@ -1,119 +0,0 @@
import re
from .common import InfoExtractor
from .rutv import RUTVIE
from ..utils import ExtractorError
class VestiIE(InfoExtractor):
    """Extractor for vesti.ru pages; hands the embedded player URL to the RUTV extractor."""

    _WORKING = False
    IE_DESC = 'Вести.Ru'
    _VALID_URL = r'https?://(?:.+?\.)?vesti\.ru/(?P<id>.+)'

    _TESTS = [{
        'url': 'http://www.vesti.ru/videos?vid=575582&cid=1',
        'info_dict': {
            'id': '765035',
            'ext': 'mp4',
            'title': 'Вести.net: биткоины в России не являются законными',
            'description': 'md5:d4bb3859dc1177b28a94c5014c35a36b',
            'duration': 302,
        },
        'params': {
            'skip_download': True,  # m3u8 download
        },
    }, {
        'url': 'http://www.vesti.ru/doc.html?id=1349233',
        'info_dict': {
            'id': '773865',
            'ext': 'mp4',
            'title': 'Участники митинга штурмуют Донецкую областную администрацию',
            'description': 'md5:1a160e98b3195379b4c849f2f4958009',
            'duration': 210,
        },
        'params': {
            'skip_download': True,  # m3u8 download
        },
    }, {
        'url': 'http://www.vesti.ru/only_video.html?vid=576180',
        'info_dict': {
            'id': '766048',
            'ext': 'mp4',
            'title': 'США заморозило, Британию затопило',
            'description': 'md5:f0ed0695ec05aed27c56a70a58dc4cc1',
            'duration': 87,
        },
        'params': {
            'skip_download': True,  # m3u8 download
        },
    }, {
        'url': 'http://hitech.vesti.ru/news/view/id/4000',
        'info_dict': {
            'id': '766888',
            'ext': 'mp4',
            'title': 'Вести.net: интернет-гиганты начали перетягивание программных "одеял"',
            'description': 'md5:65ddd47f9830c4f42ed6475f8730c995',
            'duration': 279,
        },
        'params': {
            'skip_download': True,  # m3u8 download
        },
    }, {
        'url': 'http://sochi2014.vesti.ru/video/index/video_id/766403',
        'info_dict': {
            'id': '766403',
            'ext': 'mp4',
            'title': 'XXII зимние Олимпийские игры. Российские хоккеисты стартовали на Олимпиаде с победы',
            'description': 'md5:55805dfd35763a890ff50fa9e35e31b3',
            'duration': 271,
        },
        'params': {
            'skip_download': True,  # m3u8 download
        },
        'skip': 'Blocked outside Russia',
    }, {
        'url': 'http://sochi2014.vesti.ru/live/play/live_id/301',
        'info_dict': {
            'id': '51499',
            'ext': 'flv',
            'title': 'Сочи-2014. Биатлон. Индивидуальная гонка. Мужчины ',
            'description': 'md5:9e0ed5c9d2fa1efbfdfed90c9a6d179c',
        },
        'params': {
            'skip_download': True,  # rtmp download
        },
        'skip': 'Translation has finished',
    }]

    def _real_extract(self, url):
        """Locate the RUTV player URL for the page and delegate extraction to it.

        Raises ExtractorError (expected) when no player URL is found.
        """
        video_id = self._match_valid_url(url).group('id')
        page = self._download_webpage(url, video_id, 'Downloading page')

        # If og:video points at the flash player, load the dedicated
        # only_video page for the video id it carries.
        og_video = re.search(
            r'<meta[^>]+?property="og:video"[^>]+?content="http://www\.vesti\.ru/i/flvplayer_videoHost\.swf\?vid=(?P<id>\d+)',
            page)
        if og_video is not None:
            video_id = og_video.group('id')
            page = self._download_webpage(
                f'http://www.vesti.ru/only_video.html?vid={video_id}', video_id,
                'Downloading video page')

        rutv_url = RUTVIE._extract_url(page)
        if not rutv_url:
            raise ExtractorError('No video found', expected=True)
        return self.url_result(rutv_url, 'RUTV')