Compare commits

..

No commits in common. "17bfaa53edf5c52fce73cf0cef4592f929c2462d" and "ba8044685537e8e14adc6826fb4d730856fd2e2b" have entirely different histories.

12 changed files with 535 additions and 816 deletions

View File

@ -37,21 +37,17 @@ jobs:
matrix: matrix:
os: [ubuntu-latest] os: [ubuntu-latest]
# CPython 3.9 is in quick-test # CPython 3.9 is in quick-test
python-version: ['3.10', '3.11', '3.12', '3.13', '3.14-dev', pypy-3.11] python-version: ['3.10', '3.11', '3.12', '3.13', pypy-3.11]
include: include:
# atleast one of each CPython/PyPy tests must be in windows # atleast one of each CPython/PyPy tests must be in windows
- os: windows-latest - os: windows-latest
python-version: '3.9' python-version: '3.9'
- os: windows-latest - os: windows-latest
python-version: '3.10' python-version: '3.10'
- os: windows-latest
python-version: '3.11'
- os: windows-latest - os: windows-latest
python-version: '3.12' python-version: '3.12'
- os: windows-latest - os: windows-latest
python-version: '3.13' python-version: '3.13'
- os: windows-latest
python-version: '3.14-dev'
- os: windows-latest - os: windows-latest
python-version: pypy-3.11 python-version: pypy-3.11
steps: steps:

View File

@ -28,7 +28,7 @@ jobs:
fail-fast: true fail-fast: true
matrix: matrix:
os: [ubuntu-latest] os: [ubuntu-latest]
python-version: ['3.10', '3.11', '3.12', '3.13', '3.14-dev', pypy-3.11] python-version: ['3.10', '3.11', '3.12', '3.13', pypy-3.11]
include: include:
# atleast one of each CPython/PyPy tests must be in windows # atleast one of each CPython/PyPy tests must be in windows
- os: windows-latest - os: windows-latest

View File

@ -25,7 +25,7 @@ jobs:
fail-fast: false fail-fast: false
matrix: matrix:
os: [ubuntu-latest, windows-latest] os: [ubuntu-latest, windows-latest]
python-version: ['3.9', '3.10', '3.11', '3.12', '3.13', '3.14-dev', pypy-3.11] python-version: ['3.9', '3.10', '3.11', '3.12', '3.13', pypy-3.11]
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }} - name: Set up Python ${{ matrix.python-version }}

View File

@ -35,7 +35,6 @@ classifiers = [
"Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13", "Programming Language :: Python :: 3.13",
"Programming Language :: Python :: 3.14",
"Programming Language :: Python :: Implementation", "Programming Language :: Python :: Implementation",
"Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Implementation :: PyPy", "Programming Language :: Python :: Implementation :: PyPy",
@ -174,8 +173,7 @@ python = [
"3.11", "3.11",
"3.12", "3.12",
"3.13", "3.13",
"3.14", "pypy3.10",
"pypy3.11",
] ]
[tool.ruff] [tool.ruff]

View File

@ -16,7 +16,7 @@ remove-unused-variables = true
[tox:tox] [tox:tox]
skipsdist = true skipsdist = true
envlist = py{39,310,311,312,313,314},pypy311 envlist = py{39,310,311,312,313},pypy311
skip_missing_interpreters = true skip_missing_interpreters = true
[testenv] # tox [testenv] # tox

View File

@ -1433,7 +1433,6 @@ from .onet import (
OnetPlIE, OnetPlIE,
) )
from .onionstudios import OnionStudiosIE from .onionstudios import OnionStudiosIE
from .onsen import OnsenIE
from .opencast import ( from .opencast import (
OpencastIE, OpencastIE,
OpencastPlaylistIE, OpencastPlaylistIE,
@ -1780,6 +1779,7 @@ from .rutube import (
RutubePlaylistIE, RutubePlaylistIE,
RutubeTagsIE, RutubeTagsIE,
) )
from .rutv import RUTVIE
from .ruutu import RuutuIE from .ruutu import RuutuIE
from .ruv import ( from .ruv import (
RuvIE, RuvIE,
@ -1877,12 +1877,7 @@ from .skynewsau import SkyNewsAUIE
from .slideshare import SlideshareIE from .slideshare import SlideshareIE
from .slideslive import SlidesLiveIE from .slideslive import SlidesLiveIE
from .slutload import SlutloadIE from .slutload import SlutloadIE
from .smotrim import ( from .smotrim import SmotrimIE
SmotrimAudioIE,
SmotrimIE,
SmotrimLiveIE,
SmotrimPlaylistIE,
)
from .snapchat import SnapchatSpotlightIE from .snapchat import SnapchatSpotlightIE
from .snotr import SnotrIE from .snotr import SnotrIE
from .softwhiteunderbelly import SoftWhiteUnderbellyIE from .softwhiteunderbelly import SoftWhiteUnderbellyIE
@ -2154,7 +2149,6 @@ from .tubitv import (
) )
from .tumblr import TumblrIE from .tumblr import TumblrIE
from .tunein import ( from .tunein import (
TuneInEmbedIE,
TuneInPodcastEpisodeIE, TuneInPodcastEpisodeIE,
TuneInPodcastIE, TuneInPodcastIE,
TuneInShortenerIE, TuneInShortenerIE,
@ -2289,6 +2283,7 @@ from .utreon import UtreonIE
from .varzesh3 import Varzesh3IE from .varzesh3 import Varzesh3IE
from .vbox7 import Vbox7IE from .vbox7 import Vbox7IE
from .veo import VeoIE from .veo import VeoIE
from .vesti import VestiIE
from .vevo import ( from .vevo import (
VevoIE, VevoIE,
VevoPlaylistIE, VevoPlaylistIE,

View File

@ -37,7 +37,7 @@ class LocoIE(InfoExtractor):
}, },
}, { }, {
'url': 'https://loco.com/stream/c64916eb-10fb-46a9-9a19-8c4b7ed064e7', 'url': 'https://loco.com/stream/c64916eb-10fb-46a9-9a19-8c4b7ed064e7',
'md5': '8b9bda03eba4d066928ae8d71f19befb', 'md5': '45ebc8a47ee1c2240178757caf8881b5',
'info_dict': { 'info_dict': {
'id': 'c64916eb-10fb-46a9-9a19-8c4b7ed064e7', 'id': 'c64916eb-10fb-46a9-9a19-8c4b7ed064e7',
'ext': 'mp4', 'ext': 'mp4',
@ -55,9 +55,9 @@ class LocoIE(InfoExtractor):
'tags': ['Gameplay'], 'tags': ['Gameplay'],
'series': 'GTA 5', 'series': 'GTA 5',
'timestamp': 1740612872, 'timestamp': 1740612872,
'modified_timestamp': 1750948439, 'modified_timestamp': 1740613037,
'upload_date': '20250226', 'upload_date': '20250226',
'modified_date': '20250626', 'modified_date': '20250226',
}, },
}, { }, {
# Requires video authorization # Requires video authorization
@ -123,8 +123,8 @@ class LocoIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
video_type, video_id = self._match_valid_url(url).group('type', 'id') video_type, video_id = self._match_valid_url(url).group('type', 'id')
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
stream = traverse_obj(self._search_nextjs_v13_data(webpage, video_id), ( stream = traverse_obj(self._search_nextjs_data(webpage, video_id), (
..., (None, 'ssrData'), ('liveStreamData', 'stream', 'liveStream'), {dict}, any, {require('stream info')})) 'props', 'pageProps', ('liveStreamData', 'stream', 'liveStream'), {dict}, any, {require('stream info')}))
if access_token := self._get_access_token(video_id): if access_token := self._get_access_token(video_id):
self._request_webpage( self._request_webpage(

View File

@ -1,151 +0,0 @@
import base64
import json
from .common import InfoExtractor
from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
clean_html,
int_or_none,
parse_qs,
str_or_none,
strftime_or_none,
update_url,
update_url_query,
url_or_none,
)
from ..utils.traversal import traverse_obj
class OnsenIE(InfoExtractor):
IE_NAME = 'onsen'
IE_DESC = 'インターネットラジオステーション<音泉>'
_BASE_URL = 'https://www.onsen.ag'
_HEADERS = {'Referer': f'{_BASE_URL}/'}
_NETRC_MACHINE = 'onsen'
_VALID_URL = r'https?://(?:(?:share|www)\.)onsen\.ag/program/(?P<id>[^/?#]+)'
_TESTS = [{
'url': 'https://share.onsen.ag/program/onsenking?p=90&c=MTA0NjI',
'info_dict': {
'id': '10462',
'ext': 'm4a',
'title': '第SP回',
'cast': 'count:3',
'description': 'md5:de62c80a41c4c8d84da53a1ee681ad18',
'display_id': 'MTA0NjI=',
'media_type': 'sound',
'section_start': 0,
'series': '音泉キング「下野紘」のラジオ きみはもちろん、<音泉>ファミリーだよね?',
'series_id': 'onsenking',
'tags': 'count:2',
'thumbnail': r're:https?://d3bzklg4lms4gh\.cloudfront\.net/program_info/image/default/production/.+',
'upload_date': '20220627',
'webpage_url': 'https://www.onsen.ag/program/onsenking?c=MTA0NjI=',
},
}, {
'url': 'https://share.onsen.ag/program/girls-band-cry-radio?p=370&c=MTgwMDE',
'info_dict': {
'id': '18001',
'ext': 'mp4',
'title': '第4回',
'cast': 'count:5',
'description': 'md5:bbca8a389d99c90cbbce8f383c85fedd',
'display_id': 'MTgwMDE=',
'media_type': 'movie',
'section_start': 0,
'series': 'TVアニメ『ガールズバンドクライ』WEBラジオ「ガールズバンドクライラジオにも全部ぶち込め。',
'series_id': 'girls-band-cry-radio',
'tags': 'count:3',
'thumbnail': r're:https?://d3bzklg4lms4gh\.cloudfront\.net/program_info/image/default/production/.+',
'upload_date': '20240425',
'webpage_url': 'https://www.onsen.ag/program/girls-band-cry-radio?c=MTgwMDE=',
},
'skip': 'Only available for premium supporters',
}, {
'url': 'https://www.onsen.ag/program/uma',
'info_dict': {
'id': 'uma',
'title': 'UMA YELL RADIO',
},
'playlist_mincount': 35,
}]
@staticmethod
def _get_encoded_id(program):
return base64.urlsafe_b64encode(str(program['id']).encode()).decode()
def _perform_login(self, username, password):
sign_in = self._download_json(
f'{self._BASE_URL}/web_api/signin', None, 'Logging in', headers={
'Accept': 'application/json',
'Content-Type': 'application/json',
}, data=json.dumps({
'session': {
'email': username,
'password': password,
},
}).encode(), expected_status=401)
if sign_in.get('error'):
raise ExtractorError('Invalid username or password', expected=True)
def _real_extract(self, url):
program_id = self._match_id(url)
try:
programs = self._download_json(
f'{self._BASE_URL}/web_api/programs/{program_id}', program_id)
except ExtractorError as e:
if isinstance(e.cause, HTTPError) and e.cause.status == 404:
raise ExtractorError('Invalid URL', expected=True)
raise
query = {k: v[-1] for k, v in parse_qs(url).items() if v}
if 'c' not in query:
entries = [
self.url_result(update_url_query(url, {'c': self._get_encoded_id(program)}), OnsenIE)
for program in traverse_obj(programs, ('contents', lambda _, v: v['id']))
]
return self.playlist_result(
entries, program_id, traverse_obj(programs, ('program_info', 'title', {clean_html})))
raw_id = base64.urlsafe_b64decode(f'{query["c"]}===').decode()
p_keys = ('contents', lambda _, v: v['id'] == int(raw_id))
program = traverse_obj(programs, (*p_keys, any))
if not program:
raise ExtractorError(
'This program is no longer available', expected=True)
m3u8_url = traverse_obj(program, ('streaming_url', {url_or_none}))
if not m3u8_url:
self.raise_login_required(
'This program is only available for premium supporters')
display_id = self._get_encoded_id(program)
date_str = self._search_regex(
rf'{program_id}0?(\d{{6}})', m3u8_url, 'date string', default=None)
return {
'display_id': display_id,
'formats': self._extract_m3u8_formats(m3u8_url, raw_id, headers=self._HEADERS),
'http_headers': self._HEADERS,
'section_start': int_or_none(query.get('t', 0)),
'upload_date': strftime_or_none(f'20{date_str}'),
'webpage_url': f'{self._BASE_URL}/program/{program_id}?c={display_id}',
**traverse_obj(program, {
'id': ('id', {int}, {str_or_none}),
'title': ('title', {clean_html}),
'media_type': ('media_type', {str}),
'thumbnail': ('poster_image_url', {url_or_none}, {update_url(query=None)}),
}),
**traverse_obj(programs, {
'cast': (('performers', (*p_keys, 'guests')), ..., 'name', {str}, filter),
'series_id': ('directory_name', {str}),
}),
**traverse_obj(programs, ('program_info', {
'description': ('description', {clean_html}, filter),
'series': ('title', {clean_html}),
'tags': ('hashtag_list', ..., {str}, filter),
})),
}

191
yt_dlp/extractor/rutv.py Normal file
View File

@ -0,0 +1,191 @@
import re
from .common import InfoExtractor
from ..utils import ExtractorError, int_or_none, str_to_int
class RUTVIE(InfoExtractor):
IE_DESC = 'RUTV.RU'
_VALID_URL = r'''(?x)
https?://
(?:test)?player\.(?:rutv\.ru|vgtrk\.com)/
(?P<path>
flash\d+v/container\.swf\?id=|
iframe/(?P<type>swf|video|live)/id/|
index/iframe/cast_id/
)
(?P<id>\d+)
'''
_EMBED_REGEX = [
r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:test)?player\.(?:rutv\.ru|vgtrk\.com)/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1',
r'<meta[^>]+?property=(["\'])og:video\1[^>]+?content=(["\'])(?P<url>https?://(?:test)?player\.(?:rutv\.ru|vgtrk\.com)/flash\d+v/container\.swf\?id=.+?\2)',
]
_TESTS = [{
'url': 'http://player.rutv.ru/flash2v/container.swf?id=774471&sid=kultura&fbv=true&isPlay=true&ssl=false&i=560&acc_video_id=episode_id/972347/video_id/978186/brand_id/31724',
'info_dict': {
'id': '774471',
'ext': 'mp4',
'title': 'Монологи на все времена. Концерт',
'description': 'md5:18d8b5e6a41fb1faa53819471852d5d5',
'duration': 2906,
'thumbnail': r're:https?://cdn-st2\.smotrim\.ru/.+\.jpg',
},
'params': {'skip_download': 'm3u8'},
}, {
'url': 'https://player.vgtrk.com/flash2v/container.swf?id=774016&sid=russiatv&fbv=true&isPlay=true&ssl=false&i=560&acc_video_id=episode_id/972098/video_id/977760/brand_id/57638',
'info_dict': {
'id': '774016',
'ext': 'mp4',
'title': 'Чужой в семье Сталина',
'description': '',
'duration': 2539,
},
'skip': 'Invalid URL',
}, {
'url': 'http://player.rutv.ru/iframe/swf/id/766888/sid/hitech/?acc_video_id=4000',
'info_dict': {
'id': '766888',
'ext': 'mp4',
'title': 'Вести.net: интернет-гиганты начали перетягивание программных "одеял"',
'description': 'md5:65ddd47f9830c4f42ed6475f8730c995',
'duration': 279,
'thumbnail': r're:https?://cdn-st2\.smotrim\.ru/.+\.jpg',
},
'params': {'skip_download': 'm3u8'},
}, {
'url': 'http://player.rutv.ru/iframe/video/id/771852/start_zoom/true/showZoomBtn/false/sid/russiatv/?acc_video_id=episode_id/970443/video_id/975648/brand_id/5169',
'info_dict': {
'id': '771852',
'ext': 'mp4',
'title': 'Прямой эфир. Жертвы загадочной болезни: смерть от старости в 17 лет',
'description': 'md5:b81c8c55247a4bd996b43ce17395b2d8',
'duration': 3096,
'thumbnail': r're:https?://cdn-st2\.smotrim\.ru/.+\.jpg',
},
'params': {'skip_download': 'm3u8'},
}, {
'url': 'http://player.rutv.ru/iframe/live/id/51499/showZoomBtn/false/isPlay/true/sid/sochi2014',
'info_dict': {
'id': '51499',
'ext': 'flv',
'title': 'Сочи-2014. Биатлон. Индивидуальная гонка. Мужчины ',
'description': 'md5:9e0ed5c9d2fa1efbfdfed90c9a6d179c',
},
'skip': 'Invalid URL',
}, {
'url': 'http://player.rutv.ru/iframe/live/id/21/showZoomBtn/false/isPlay/true/',
'info_dict': {
'id': '21',
'ext': 'mp4',
'title': str,
'is_live': True,
},
'skip': 'Invalid URL',
}, {
'url': 'https://testplayer.vgtrk.com/iframe/live/id/19201/showZoomBtn/false/isPlay/true/',
'only_matching': True,
}]
_WEBPAGE_TESTS = [{
'url': 'http://istoriya-teatra.ru/news/item/f00/s05/n0000545/index.shtml',
'info_dict': {
'id': '1952012',
'ext': 'mp4',
'title': 'Новости культуры. Эфир от 10.10.2019 (23:30). Театр Сатиры отмечает день рождения премьерой',
'description': 'md5:fced27112ff01ff8fc4a452fc088bad6',
'duration': 191,
'thumbnail': r're:https?://cdn-st2\.smotrim\.ru/.+\.jpg',
},
'params': {'skip_download': 'm3u8'},
}]
def _real_extract(self, url):
mobj = self._match_valid_url(url)
video_id = mobj.group('id')
video_path = mobj.group('path')
if re.match(r'flash\d+v', video_path):
video_type = 'video'
elif video_path.startswith('iframe'):
video_type = mobj.group('type')
if video_type == 'swf':
video_type = 'video'
elif video_path.startswith('index/iframe/cast_id'):
video_type = 'live'
is_live = video_type == 'live'
json_data = self._download_json(
'http://player.vgtrk.com/iframe/data{}/id/{}'.format('live' if is_live else 'video', video_id),
video_id, 'Downloading JSON')
if json_data['errors']:
raise ExtractorError('{} said: {}'.format(self.IE_NAME, json_data['errors']), expected=True)
playlist = json_data['data']['playlist']
medialist = playlist['medialist']
media = medialist[0]
if media['errors']:
raise ExtractorError('{} said: {}'.format(self.IE_NAME, media['errors']), expected=True)
view_count = int_or_none(playlist.get('count_views'))
priority_transport = playlist['priority_transport']
thumbnail = media['picture']
width = int_or_none(media['width'])
height = int_or_none(media['height'])
description = media['anons']
title = media['title']
duration = int_or_none(media.get('duration'))
formats = []
subtitles = {}
for transport, links in media['sources'].items():
for quality, url in links.items():
preference = -1 if priority_transport == transport else -2
if transport == 'rtmp':
mobj = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>.+))/(?P<playpath>.+)$', url)
if not mobj:
continue
fmt = {
'url': mobj.group('url'),
'play_path': mobj.group('playpath'),
'app': mobj.group('app'),
'page_url': 'http://player.rutv.ru',
'player_url': 'http://player.rutv.ru/flash3v/osmf.swf?i=22',
'rtmp_live': True,
'ext': 'flv',
'vbr': str_to_int(quality),
}
elif transport == 'm3u8':
fmt, subs = self._extract_m3u8_formats_and_subtitles(
url, video_id, 'mp4', quality=preference, m3u8_id='hls')
formats.extend(fmt)
self._merge_subtitles(subs, target=subtitles)
continue
else:
fmt = {
'url': url,
}
fmt.update({
'width': int_or_none(quality, default=height, invscale=width, scale=height),
'height': int_or_none(quality, default=height),
'format_id': f'{transport}-{quality}',
'source_preference': preference,
})
formats.append(fmt)
return {
'id': video_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
'view_count': view_count,
'duration': duration,
'formats': formats,
'subtitles': subtitles,
'is_live': is_live,
'_format_sort_fields': ('source', ),
}

View File

@ -1,403 +1,65 @@
import functools
import json
import re
import urllib.parse
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import ExtractorError
OnDemandPagedList,
clean_html,
determine_ext,
extract_attributes,
int_or_none,
parse_iso8601,
str_or_none,
unescapeHTML,
url_or_none,
urljoin,
)
from ..utils.traversal import (
find_element,
find_elements,
require,
traverse_obj,
)
class SmotrimBaseIE(InfoExtractor): class SmotrimIE(InfoExtractor):
_BASE_URL = 'https://smotrim.ru' _VALID_URL = r'https?://smotrim\.ru/(?P<type>brand|video|article|live)/(?P<id>[0-9]+)'
_GEO_BYPASS = False _TESTS = [{ # video
_GEO_COUNTRIES = ['RU']
def _extract_from_smotrim_api(self, typ, item_id):
path = f'data{typ.replace("-", "")}/{"uid" if typ == "live" else "id"}'
data = self._download_json(
f'https://player.smotrim.ru/iframe/{path}/{item_id}/sid/smotrim', item_id)
media = traverse_obj(data, ('data', 'playlist', 'medialist', -1, {dict}))
if traverse_obj(media, ('locked', {bool})):
self.raise_login_required()
if error_msg := traverse_obj(media, ('errors', {clean_html})):
self.raise_geo_restricted(error_msg, countries=self._GEO_COUNTRIES)
webpage_url = traverse_obj(data, ('data', 'template', 'share_url', {url_or_none}))
webpage = self._download_webpage(webpage_url, item_id)
common = {
'thumbnail': self._html_search_meta(['og:image', 'twitter:image'], webpage, default=None),
**traverse_obj(media, {
'id': ('id', {str_or_none}),
'title': (('episodeTitle', 'title'), {clean_html}, filter, any),
'channel_id': ('channelId', {str_or_none}),
'description': ('anons', {clean_html}, filter),
'season': ('season', {clean_html}, filter),
'series': (('brand_title', 'brandTitle'), {clean_html}, filter, any),
'series_id': ('brand_id', {str_or_none}),
}),
}
if typ == 'audio':
bookmark = self._search_json(
r'class="bookmark"[^>]+value\s*=\s*"', webpage,
'bookmark', item_id, default={}, transform_source=unescapeHTML)
metadata = {
'vcodec': 'none',
**common,
**traverse_obj(media, {
'ext': ('audio_url', {determine_ext(default_ext='mp3')}),
'duration': ('duration', {int_or_none}),
'url': ('audio_url', {url_or_none}),
}),
**traverse_obj(bookmark, {
'title': ('subtitle', {clean_html}),
'timestamp': ('published', {parse_iso8601}),
}),
}
elif typ == 'audio-live':
metadata = {
'ext': 'mp3',
'url': traverse_obj(media, ('source', 'auto', {url_or_none})),
'vcodec': 'none',
**common,
}
else:
formats, subtitles = [], {}
for m3u8_url in traverse_obj(media, (
'sources', 'm3u8', {dict.values}, ..., {url_or_none},
)):
fmts, subs = self._extract_m3u8_formats_and_subtitles(
m3u8_url, item_id, 'mp4', m3u8_id='hls', fatal=False)
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
metadata = {
'formats': formats,
'subtitles': subtitles,
**self._search_json_ld(webpage, item_id),
**common,
}
return {
'age_limit': traverse_obj(data, ('data', 'age_restrictions', {int_or_none})),
'is_live': typ in ('audio-live', 'live'),
'tags': traverse_obj(webpage, (
{find_elements(cls='tags-list__link')}, ..., {clean_html}, filter, all, filter)),
'webpage_url': webpage_url,
**metadata,
}
class SmotrimIE(SmotrimBaseIE):
IE_NAME = 'smotrim'
_VALID_URL = r'(?:https?:)?//(?:(?:player|www)\.)?smotrim\.ru(?:/iframe)?/video(?:/id)?/(?P<id>\d+)'
_EMBED_REGEX = [fr'<iframe\b[^>]+\bsrc=["\'](?P<url>{_VALID_URL})']
_TESTS = [{
'url': 'https://smotrim.ru/video/1539617', 'url': 'https://smotrim.ru/video/1539617',
'md5': 'b1923a533c8cab09679789d720d0b1c5',
'info_dict': { 'info_dict': {
'id': '1539617', 'id': '1539617',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Урок №16', 'title': 'Полиглот. Китайский с нуля за 16 часов! Урок №16',
'duration': 2631, 'description': '',
'series': 'Полиглот. Китайский с нуля за 16 часов!',
'series_id': '60562',
'tags': 'mincount:6',
'thumbnail': r're:https?://cdn-st\d+\.smotrim\.ru/.+\.(?:jpg|png)',
'timestamp': 1466771100,
'upload_date': '20160624',
'view_count': int,
}, },
}, { 'add_ie': ['RUTV'],
'url': 'https://player.smotrim.ru/iframe/video/id/2988590', }, { # article (geo-restricted? plays fine from the US and JP)
'info_dict': {
'id': '2988590',
'ext': 'mp4',
'title': 'Трейлер',
'age_limit': 16,
'description': 'md5:6af7e68ecf4ed7b8ff6720d20c4da47b',
'duration': 30,
'series': 'Мы в разводе',
'series_id': '71624',
'tags': 'mincount:5',
'thumbnail': r're:https?://cdn-st\d+\.smotrim\.ru/.+\.(?:jpg|png)',
'timestamp': 1750670040,
'upload_date': '20250623',
'view_count': int,
'webpage_url': 'https://smotrim.ru/video/2988590',
},
}]
_WEBPAGE_TESTS = [{
'url': 'https://smotrim.ru/article/2813445', 'url': 'https://smotrim.ru/article/2813445',
'md5': 'e0ac453952afbc6a2742e850b4dc8e77',
'info_dict': { 'info_dict': {
'id': '2431846', 'id': '2431846',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Съёмки первой программы "Большие и маленькие"', 'title': 'Новости культуры. Съёмки первой программы "Большие и маленькие"',
'description': 'md5:446c9a5d334b995152a813946353f447', 'description': 'md5:94a4a22472da4252bf5587a4ee441b99',
'duration': 240,
'series': 'Новости культуры',
'series_id': '19725',
'tags': 'mincount:6',
'thumbnail': r're:https?://cdn-st\d+\.smotrim\.ru/.+\.(?:jpg|png)',
'timestamp': 1656054443,
'upload_date': '20220624',
'view_count': int,
'webpage_url': 'https://smotrim.ru/video/2431846',
}, },
}, { 'add_ie': ['RUTV'],
'url': 'https://www.vesti.ru/article/4642878', }, { # brand, redirect
'url': 'https://smotrim.ru/brand/64356',
'md5': '740472999ccff81d7f6df79cecd91c18',
'info_dict': { 'info_dict': {
'id': '3007209', 'id': '2354523',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Иностранные мессенджеры используют не только мошенники, но и вербовщики', 'title': 'Большие и маленькие. Лучшее. 4-й выпуск',
'description': 'md5:74ab625a0a89b87b2e0ed98d6391b182', 'description': 'md5:84089e834429008371ea41ea3507b989',
'duration': 265,
'series': 'Вести. Дежурная часть',
'series_id': '5204',
'tags': 'mincount:6',
'thumbnail': r're:https?://cdn-st\d+\.smotrim\.ru/.+\.(?:jpg|png)',
'timestamp': 1754756280,
'upload_date': '20250809',
'view_count': int,
'webpage_url': 'https://smotrim.ru/video/3007209',
}, },
}] 'add_ie': ['RUTV'],
}, { # live
def _real_extract(self, url): 'url': 'https://smotrim.ru/live/19201',
video_id = self._match_id(url)
return self._extract_from_smotrim_api('video', video_id)
class SmotrimAudioIE(SmotrimBaseIE):
IE_NAME = 'smotrim:audio'
_VALID_URL = r'https?://(?:(?:player|www)\.)?smotrim\.ru(?:/iframe)?/audio(?:/id)?/(?P<id>\d+)'
_TESTS = [{
'url': 'https://smotrim.ru/audio/2573986',
'md5': 'e28d94c20da524e242b2d00caef41a8e',
'info_dict': {
'id': '2573986',
'ext': 'mp3',
'title': 'Радиоспектакль',
'description': 'md5:4bcaaf7d532bc78f76e478fad944e388',
'duration': 3072,
'series': 'Морис Леблан. Арсен Люпен, джентльмен-грабитель',
'series_id': '66461',
'tags': 'mincount:7',
'thumbnail': r're:https?://cdn-st\d+\.smotrim\.ru/.+\.(?:jpg|png)',
'timestamp': 1624884358,
'upload_date': '20210628',
},
}, {
'url': 'https://player.smotrim.ru/iframe/audio/id/2860468',
'md5': '5a6bc1fa24c7142958be1ad9cfae58a8',
'info_dict': {
'id': '2860468',
'ext': 'mp3',
'title': 'Колобок и музыкальная игра "Терем-теремок"',
'duration': 1501,
'series': 'Веселый колобок',
'series_id': '68880',
'tags': 'mincount:4',
'thumbnail': r're:https?://cdn-st\d+\.smotrim\.ru/.+\.(?:jpg|png)',
'timestamp': 1755925800,
'upload_date': '20250823',
'webpage_url': 'https://smotrim.ru/audio/2860468',
},
}]
def _real_extract(self, url):
audio_id = self._match_id(url)
return self._extract_from_smotrim_api('audio', audio_id)
class SmotrimLiveIE(SmotrimBaseIE):
IE_NAME = 'smotrim:live'
_VALID_URL = r'''(?x:
(?:https?:)?//
(?:(?:(?:test)?player|www)\.)?
(?:
smotrim\.ru|
vgtrk\.com
)
(?:/iframe)?/
(?P<type>
channel|
(?:audio-)?live
)
(?:/u?id)?/(?P<id>[\da-f-]+)
)'''
_EMBED_REGEX = [fr'<iframe\b[^>]+\bsrc=["\'](?P<url>{_VALID_URL})']
_TESTS = [{
'url': 'https://smotrim.ru/channel/76',
'info_dict': {
'id': '1661',
'ext': 'mp4',
'title': str,
'channel_id': '76',
'description': 'Смотрим прямой эфир «Москва 24»',
'display_id': '76',
'live_status': 'is_live',
'thumbnail': r're:https?://cdn-st\d+\.smotrim\.ru/.+\.(?:jpg|png)',
'timestamp': int,
'upload_date': str,
},
'params': {'skip_download': 'Livestream'},
}, {
# Radio
'url': 'https://smotrim.ru/channel/81',
'info_dict': {
'id': '81',
'ext': 'mp3',
'title': str,
'channel_id': '81',
'live_status': 'is_live',
'thumbnail': r're:https?://cdn-st\d+\.smotrim\.ru/.+\.(?:jpg|png)',
},
'params': {'skip_download': 'Livestream'},
}, {
# Sometimes geo-restricted to Russia
'url': 'https://player.smotrim.ru/iframe/live/uid/381308c7-a066-4c4f-9656-83e2e792a7b4',
'info_dict': { 'info_dict': {
'id': '19201', 'id': '19201',
'ext': 'mp4', 'ext': 'mp4',
'title': str, # this looks like a TV channel name
'channel_id': '4', 'title': 'Россия Культура. Прямой эфир',
'description': 'Смотрим прямой эфир «Россия К»', 'description': '',
'display_id': '381308c7-a066-4c4f-9656-83e2e792a7b4',
'live_status': 'is_live',
'thumbnail': r're:https?://cdn-st\d+\.smotrim\.ru/.+\.(?:jpg|png)',
'timestamp': int,
'upload_date': str,
'webpage_url': 'https://smotrim.ru/channel/4',
}, },
'params': {'skip_download': 'Livestream'}, 'add_ie': ['RUTV'],
}, {
'url': 'https://smotrim.ru/live/19201',
'only_matching': True,
}, {
'url': 'https://player.smotrim.ru/iframe/audio-live/id/81',
'only_matching': True,
}, {
'url': 'https://testplayer.vgtrk.com/iframe/live/id/19201',
'only_matching': True,
}] }]
def _real_extract(self, url): def _real_extract(self, url):
typ, display_id = self._match_valid_url(url).group('type', 'id') video_id, typ = self._match_valid_url(url).group('id', 'type')
rutv_type = 'video'
if typ not in ('video', 'live'):
webpage = self._download_webpage(url, video_id, f'Resolving {typ} link')
# there are two cases matching regex:
# 1. "embedUrl" in JSON LD (/brand/)
# 2. "src" attribute from iframe (/article/)
video_id = self._search_regex(
r'"https://player.smotrim.ru/iframe/video/id/(?P<video_id>\d+)/',
webpage, 'video_id', default=None)
if not video_id:
raise ExtractorError('There are no video in this page.', expected=True)
elif typ == 'live':
rutv_type = 'live'
if typ == 'live' and re.fullmatch(r'[0-9]+', display_id): return self.url_result(f'https://player.vgtrk.com/iframe/{rutv_type}/id/{video_id}')
url = self._request_webpage(url, display_id).url
typ = self._match_valid_url(url).group('type')
if typ == 'channel':
webpage = self._download_webpage(url, display_id)
src_url = traverse_obj(webpage, ((
({find_element(cls='main-player__frame', html=True)}, {extract_attributes}, 'src'),
({find_element(cls='audio-play-button', html=True)},
{extract_attributes}, 'value', {urllib.parse.unquote}, {json.loads}, 'source'),
), any, {self._proto_relative_url}, {url_or_none}, {require('src URL')}))
typ, video_id = self._match_valid_url(src_url).group('type', 'id')
else:
video_id = display_id
return {
'display_id': display_id,
**self._extract_from_smotrim_api(typ, video_id),
}
class SmotrimPlaylistIE(SmotrimBaseIE):
IE_NAME = 'smotrim:playlist'
_PAGE_SIZE = 15
_VALID_URL = r'https?://smotrim\.ru/(?P<type>brand|podcast)/(?P<id>\d+)/?(?P<season>[\w-]+)?'
_TESTS = [{
# Video
'url': 'https://smotrim.ru/brand/64356',
'info_dict': {
'id': '64356',
'title': 'Большие и маленькие',
},
'playlist_mincount': 55,
}, {
# Video, season
'url': 'https://smotrim.ru/brand/65293/3-sezon',
'info_dict': {
'id': '65293',
'title': 'Спасская',
'season': '3 сезон',
},
'playlist_count': 16,
}, {
# Audio
'url': 'https://smotrim.ru/brand/68880',
'info_dict': {
'id': '68880',
'title': 'Веселый колобок',
},
'playlist_mincount': 156,
}, {
# Podcast
'url': 'https://smotrim.ru/podcast/8021',
'info_dict': {
'id': '8021',
'title': 'Сила звука',
},
'playlist_mincount': 27,
}]
def _fetch_page(self, endpoint, key, playlist_id, page):
page += 1
items = self._download_json(
f'{self._BASE_URL}/api/{endpoint}', playlist_id,
f'Downloading page {page}', query={
key: playlist_id,
'limit': self._PAGE_SIZE,
'page': page,
},
)
for link in traverse_obj(items, ('contents', -1, 'list', ..., 'link', {str})):
yield self.url_result(urljoin(self._BASE_URL, link))
def _real_extract(self, url):
playlist_type, playlist_id, season = self._match_valid_url(url).group('type', 'id', 'season')
key = 'rubricId' if playlist_type == 'podcast' else 'brandId'
webpage = self._download_webpage(url, playlist_id)
playlist_title = self._html_search_meta(['og:title', 'twitter:title'], webpage, default=None)
if season:
return self.playlist_from_matches(traverse_obj(webpage, (
{find_elements(tag='a', attr='href', value=r'/video/\d+', html=True, regex=True)},
..., {extract_attributes}, 'href', {str},
)), playlist_id, playlist_title, season=traverse_obj(webpage, (
{find_element(cls='seasons__item seasons__item--selected')}, {clean_html},
)), ie=SmotrimIE, getter=urljoin(self._BASE_URL))
if traverse_obj(webpage, (
{find_element(cls='brand-main-item__videos')}, {clean_html}, filter,
)):
endpoint = 'videos'
else:
endpoint = 'audios'
return self.playlist_result(OnDemandPagedList(
functools.partial(self._fetch_page, endpoint, key, playlist_id), self._PAGE_SIZE), playlist_id, playlist_title)

View File

@ -1,335 +1,244 @@
import functools
import urllib.parse import urllib.parse
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
OnDemandPagedList, OnDemandPagedList,
UnsupportedError, determine_ext,
clean_html,
int_or_none,
join_nonempty,
parse_iso8601, parse_iso8601,
update_url_query, traverse_obj,
url_or_none,
) )
from ..utils.traversal import traverse_obj
class TuneInBaseIE(InfoExtractor): class TuneInBaseIE(InfoExtractor):
def _call_api(self, item_id, endpoint=None, note='Downloading JSON metadata', fatal=False, query=None): _VALID_URL_BASE = r'https?://(?:www\.)?tunein\.com'
return self._download_json(
join_nonempty('https://api.tunein.com/profiles', item_id, endpoint, delim='/'), def _extract_metadata(self, webpage, content_id):
item_id, note=note, fatal=fatal, query=query) or {} return self._search_json(r'window.INITIAL_STATE=', webpage, 'hydration', content_id, fatal=False)
def _extract_formats_and_subtitles(self, content_id): def _extract_formats_and_subtitles(self, content_id):
streams = self._download_json( streams = self._download_json(
'https://opml.radiotime.com/Tune.ashx', content_id, query={ f'https://opml.radiotime.com/Tune.ashx?render=json&formats=mp3,aac,ogg,flash,hls&id={content_id}',
'formats': 'mp3,aac,ogg,flash,hls', content_id)['body']
'id': content_id,
'render': 'json',
})
formats, subtitles = [], {} formats, subtitles = [], {}
for stream in traverse_obj(streams, ('body', lambda _, v: url_or_none(v['url']))): for stream in streams:
if stream.get('media_type') == 'hls': if stream.get('media_type') == 'hls':
fmts, subs = self._extract_m3u8_formats_and_subtitles(stream['url'], content_id, fatal=False) fmts, subs = self._extract_m3u8_formats_and_subtitles(stream['url'], content_id, fatal=False)
formats.extend(fmts) formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles) self._merge_subtitles(subs, target=subtitles)
elif determine_ext(stream['url']) == 'pls':
playlist_content = self._download_webpage(stream['url'], content_id)
formats.append({
'url': self._search_regex(r'File1=(.*)', playlist_content, 'url', fatal=False),
'abr': stream.get('bitrate'),
'ext': stream.get('media_type'),
})
else: else:
formats.append(traverse_obj(stream, { formats.append({
'abr': ('bitrate', {int_or_none}), 'url': stream['url'],
'ext': ('media_type', {str}), 'abr': stream.get('bitrate'),
'url': ('url', {self._proto_relative_url}), 'ext': stream.get('media_type'),
})) })
return formats, subtitles return formats, subtitles
class TuneInStationIE(TuneInBaseIE): class TuneInStationIE(TuneInBaseIE):
IE_NAME = 'tunein:station' _VALID_URL = TuneInBaseIE._VALID_URL_BASE + r'(?:/radio/[^?#]+-|/embed/player/)(?P<id>s\d+)'
_VALID_URL = r'https?://tunein\.com/radio/[^/?#]+(?P<id>s\d+)' _EMBED_REGEX = [r'<iframe[^>]+src=["\'](?P<url>(?:https?://)?tunein\.com/embed/player/s\d+)']
_TESTS = [{ _TESTS = [{
'url': 'https://tunein.com/radio/Jazz24-885-s34682/', 'url': 'https://tunein.com/radio/Jazz24-885-s34682/',
'info_dict': { 'info_dict': {
'id': 's34682', 'id': 's34682',
'ext': 'mp3',
'title': str, 'title': str,
'alt_title': 'World Class Jazz',
'channel_follower_count': int,
'description': 'md5:d6d0b89063fd68d529fa7058ee98619b', 'description': 'md5:d6d0b89063fd68d529fa7058ee98619b',
'location': r're:Seattle-Tacoma, (?:US|WA)', 'thumbnail': r're:https?://cdn-profiles\.tunein\.com/.+',
'location': 'Seattle-Tacoma, US',
'ext': 'mp3',
'live_status': 'is_live', 'live_status': 'is_live',
'thumbnail': r're:https?://.+',
}, },
'params': {'skip_download': 'Livestream'}, 'params': {
'skip_download': True,
},
}, {
'url': 'https://tunein.com/embed/player/s6404/',
'only_matching': True,
}, { }, {
'url': 'https://tunein.com/radio/BBC-Radio-1-988-s24939/', 'url': 'https://tunein.com/radio/BBC-Radio-1-988-s24939/',
'info_dict': { 'info_dict': {
'id': 's24939', 'id': 's24939',
'ext': 'm4a',
'title': str, 'title': str,
'alt_title': 'The biggest new pop and all-day vibes',
'channel_follower_count': int,
'description': 'md5:ee2c56794844610d045f8caf5ff34d0c', 'description': 'md5:ee2c56794844610d045f8caf5ff34d0c',
'thumbnail': r're:https?://cdn-profiles\.tunein\.com/.+',
'location': 'London, UK', 'location': 'London, UK',
'ext': 'm4a',
'live_status': 'is_live', 'live_status': 'is_live',
'thumbnail': r're:https?://.+',
}, },
'params': {'skip_download': 'Livestream'}, 'params': {
}] 'skip_download': True,
def _real_extract(self, url):
station_id = self._match_id(url)
formats, subtitles = self._extract_formats_and_subtitles(station_id)
return {
'id': station_id,
'formats': formats,
'subtitles': subtitles,
**traverse_obj(self._call_api(station_id), ('Item', {
'title': ('Title', {clean_html}),
'alt_title': ('Subtitle', {clean_html}, filter),
'channel_follower_count': ('Actions', 'Follow', 'FollowerCount', {int_or_none}),
'description': ('Description', {clean_html}, filter),
'is_live': ('Actions', 'Play', 'IsLive', {bool}),
'location': ('Properties', 'Location', 'DisplayName', {str}),
'thumbnail': ('Image', {url_or_none}),
})),
}
class TuneInPodcastIE(TuneInBaseIE):
IE_NAME = 'tunein:podcast:program'
_PAGE_SIZE = 20
_VALID_URL = r'https?://tunein\.com/podcasts(?:/[^/?#]+){1,2}(?P<id>p\d+)'
_TESTS = [{
'url': 'https://tunein.com/podcasts/Technology-Podcasts/Artificial-Intelligence-p1153019/',
'info_dict': {
'id': 'p1153019',
'title': 'Lex Fridman Podcast',
}, },
'playlist_mincount': 200,
}, {
'url': 'https://tunein.com/podcasts/World-News/BBC-News-p14/',
'info_dict': {
'id': 'p14',
'title': 'BBC News',
},
'playlist_mincount': 35,
}]
@classmethod
def suitable(cls, url):
return False if TuneInPodcastEpisodeIE.suitable(url) else super().suitable(url)
def _fetch_page(self, url, podcast_id, page=0):
items = self._call_api(
podcast_id, 'contents', f'Downloading page {page + 1}', query={
'filter': 't:free',
'limit': self._PAGE_SIZE,
'offset': page * self._PAGE_SIZE,
},
)['Items']
for item in traverse_obj(items, (..., 'GuideId', {str}, filter)):
yield self.url_result(update_url_query(url, {'topicId': item[1:]}))
def _real_extract(self, url):
podcast_id = self._match_id(url)
return self.playlist_result(OnDemandPagedList(
functools.partial(self._fetch_page, url, podcast_id), self._PAGE_SIZE),
podcast_id, traverse_obj(self._call_api(podcast_id), ('Item', 'Title', {str})))
class TuneInPodcastEpisodeIE(TuneInBaseIE):
IE_NAME = 'tunein:podcast'
_VALID_URL = r'https?://tunein\.com/podcasts(?:/[^/?#]+){1,2}(?P<series_id>p\d+)/?\?(?:[^#]+&)?(?i:topicid)=(?P<id>\d+)'
_TESTS = [{
'url': 'https://tunein.com/podcasts/Technology-Podcasts/Artificial-Intelligence-p1153019/?topicId=236404354',
'info_dict': {
'id': 't236404354',
'ext': 'mp3',
'title': '#351 MrBeast: Future of YouTube, Twitter, TikTok, and Instagram',
'alt_title': 'Technology Podcasts >',
'cast': 'count:1',
'description': 'md5:1029895354ef073ff00f20b82eb6eb71',
'display_id': '236404354',
'duration': 8330,
'thumbnail': r're:https?://.+',
'timestamp': 1673458571,
'upload_date': '20230111',
'series': 'Lex Fridman Podcast',
'series_id': 'p1153019',
},
}, {
'url': 'https://tunein.com/podcasts/The-BOB--TOM-Show-Free-Podcast-p20069/?topicId=174556405',
'info_dict': {
'id': 't174556405',
'ext': 'mp3',
'title': 'B&T Extra: Ohhh Yeah, It\'s Sexy Time',
'alt_title': 'Westwood One >',
'cast': 'count:2',
'description': 'md5:6828234f410ab88c85655495c5fcfa88',
'display_id': '174556405',
'duration': 1203,
'series': 'The BOB & TOM Show Free Podcast',
'series_id': 'p20069',
'thumbnail': r're:https?://.+',
'timestamp': 1661799600,
'upload_date': '20220829',
},
}]
def _real_extract(self, url):
series_id, display_id = self._match_valid_url(url).group('series_id', 'id')
episode_id = f't{display_id}'
formats, subtitles = self._extract_formats_and_subtitles(episode_id)
return {
'id': episode_id,
'display_id': display_id,
'formats': formats,
'series': traverse_obj(self._call_api(series_id), ('Item', 'Title', {clean_html})),
'series_id': series_id,
'subtitles': subtitles,
**traverse_obj(self._call_api(episode_id), ('Item', {
'title': ('Title', {clean_html}),
'alt_title': ('Subtitle', {clean_html}, filter),
'cast': (
'Properties', 'ParentProgram', 'Hosts', {clean_html},
{lambda x: x.split(';')}, ..., {str.strip}, filter, all, filter),
'description': ('Description', {clean_html}, filter),
'duration': ('Actions', 'Play', 'Duration', {int_or_none}),
'thumbnail': ('Image', {url_or_none}),
'timestamp': ('Actions', 'Play', 'PublishTime', {parse_iso8601}),
})),
}
class TuneInEmbedIE(TuneInBaseIE):
IE_NAME = 'tunein:embed'
_VALID_URL = r'https?://tunein\.com/embed/player/(?P<id>[^/?#]+)'
_EMBED_REGEX = [r'<iframe\b[^>]+\bsrc=["\'](?P<url>(?:https?:)?//tunein\.com/embed/player/[^/?#"\']+)']
_TESTS = [{
'url': 'https://tunein.com/embed/player/s6404/',
'info_dict': {
'id': 's6404',
'ext': 'mp3',
'title': str,
'alt_title': 'South Africa\'s News and Information Leader',
'channel_follower_count': int,
'live_status': 'is_live',
'location': 'Johannesburg, South Africa',
'thumbnail': r're:https?://.+',
},
'params': {'skip_download': 'Livestream'},
}, {
'url': 'https://tunein.com/embed/player/t236404354/',
'info_dict': {
'id': 't236404354',
'ext': 'mp3',
'title': '#351 MrBeast: Future of YouTube, Twitter, TikTok, and Instagram',
'alt_title': 'Technology Podcasts >',
'cast': 'count:1',
'description': 'md5:1029895354ef073ff00f20b82eb6eb71',
'display_id': '236404354',
'duration': 8330,
'series': 'Lex Fridman Podcast',
'series_id': 'p1153019',
'thumbnail': r're:https?://.+',
'timestamp': 1673458571,
'upload_date': '20230111',
},
}, {
'url': 'https://tunein.com/embed/player/p191660/',
'info_dict': {
'id': 'p191660',
'title': 'SBS Tamil',
},
'playlist_mincount': 195,
}] }]
_WEBPAGE_TESTS = [{ _WEBPAGE_TESTS = [{
'url': 'https://www.martiniinthemorning.com/', 'url': 'https://www.martiniinthemorning.com/',
'info_dict': { 'info_dict': {
'id': 's55412', 'id': 's55412',
'ext': 'mp3', 'ext': 'mp3',
'title': str, 'title': 'TuneInStation video #s55412',
'alt_title': 'Now that\'s music!',
'channel_follower_count': int,
'description': 'md5:41588a3e2cf34b3eafc6c33522fa611a',
'live_status': 'is_live',
'location': 'US',
'thumbnail': r're:https?://.+',
}, },
'params': {'skip_download': 'Livestream'}, 'expected_warnings': ['unable to extract hydration', 'Extractor failed to obtain "title"'],
}] }]
def _real_extract(self, url): def _real_extract(self, url):
embed_id = self._match_id(url) station_id = self._match_id(url)
kind = {
'p': 'program',
's': 'station',
't': 'topic',
}.get(embed_id[:1])
return self.url_result( webpage = self._download_webpage(url, station_id)
f'https://tunein.com/{kind}/?{kind}id={embed_id[1:]}') metadata = self._extract_metadata(webpage, station_id)
formats, subtitles = self._extract_formats_and_subtitles(station_id)
return {
'id': station_id,
'title': traverse_obj(metadata, ('profiles', station_id, 'title')),
'description': traverse_obj(metadata, ('profiles', station_id, 'description')),
'thumbnail': traverse_obj(metadata, ('profiles', station_id, 'image')),
'timestamp': parse_iso8601(
traverse_obj(metadata, ('profiles', station_id, 'actions', 'play', 'publishTime'))),
'location': traverse_obj(
metadata, ('profiles', station_id, 'metadata', 'properties', 'location', 'displayName'),
('profiles', station_id, 'properties', 'location', 'displayName')),
'formats': formats,
'subtitles': subtitles,
'is_live': traverse_obj(metadata, ('profiles', station_id, 'actions', 'play', 'isLive')),
}
class TuneInPodcastIE(TuneInBaseIE):
_VALID_URL = TuneInBaseIE._VALID_URL_BASE + r'/(?:podcasts/[^?#]+-|embed/player/)(?P<id>p\d+)/?(?:#|$)'
_EMBED_REGEX = [r'<iframe[^>]+src=["\'](?P<url>(?:https?://)?tunein\.com/embed/player/p\d+)']
_TESTS = [{
'url': 'https://tunein.com/podcasts/Technology-Podcasts/Artificial-Intelligence-p1153019',
'info_dict': {
'id': 'p1153019',
'title': 'Lex Fridman Podcast',
'description': 'md5:bedc4e5f1c94f7dec6e4317b5654b00d',
},
'playlist_mincount': 200,
}, {
'url': 'https://tunein.com/embed/player/p191660/',
'only_matching': True,
}, {
'url': 'https://tunein.com/podcasts/World-News/BBC-News-p14/',
'info_dict': {
'id': 'p14',
'title': 'BBC News',
'description': 'md5:30b9622bcc4bd101d4acd6f38f284aed',
},
'playlist_mincount': 36,
}]
_PAGE_SIZE = 30
def _real_extract(self, url):
podcast_id = self._match_id(url)
webpage = self._download_webpage(url, podcast_id, fatal=False)
metadata = self._extract_metadata(webpage, podcast_id)
def page_func(page_num):
api_response = self._download_json(
f'https://api.tunein.com/profiles/{podcast_id}/contents', podcast_id,
note=f'Downloading page {page_num + 1}', query={
'filter': 't:free',
'offset': page_num * self._PAGE_SIZE,
'limit': self._PAGE_SIZE,
})
return [
self.url_result(
f'https://tunein.com/podcasts/{podcast_id}?topicId={episode["GuideId"][1:]}',
TuneInPodcastEpisodeIE, title=episode.get('Title'))
for episode in api_response['Items']]
entries = OnDemandPagedList(page_func, self._PAGE_SIZE)
return self.playlist_result(
entries, playlist_id=podcast_id, title=traverse_obj(metadata, ('profiles', podcast_id, 'title')),
description=traverse_obj(metadata, ('profiles', podcast_id, 'description')))
class TuneInPodcastEpisodeIE(TuneInBaseIE):
_VALID_URL = TuneInBaseIE._VALID_URL_BASE + r'/podcasts/(?:[^?&]+-)?(?P<podcast_id>p\d+)/?\?topicId=(?P<id>\w\d+)'
_TESTS = [{
'url': 'https://tunein.com/podcasts/Technology-Podcasts/Artificial-Intelligence-p1153019/?topicId=236404354',
'info_dict': {
'id': 't236404354',
'title': '#351 MrBeast: Future of YouTube, Twitter, TikTok, and Instagram',
'description': 'md5:2784533b98f8ac45c0820b1e4a8d8bb2',
'thumbnail': r're:https?://cdn-profiles\.tunein\.com/.+',
'timestamp': 1673458571,
'upload_date': '20230111',
'series_id': 'p1153019',
'series': 'Lex Fridman Podcast',
'ext': 'mp3',
},
}]
def _real_extract(self, url):
podcast_id, episode_id = self._match_valid_url(url).group('podcast_id', 'id')
episode_id = f't{episode_id}'
webpage = self._download_webpage(url, episode_id)
metadata = self._extract_metadata(webpage, episode_id)
formats, subtitles = self._extract_formats_and_subtitles(episode_id)
return {
'id': episode_id,
'title': traverse_obj(metadata, ('profiles', episode_id, 'title')),
'description': traverse_obj(metadata, ('profiles', episode_id, 'description')),
'thumbnail': traverse_obj(metadata, ('profiles', episode_id, 'image')),
'timestamp': parse_iso8601(
traverse_obj(metadata, ('profiles', episode_id, 'actions', 'play', 'publishTime'))),
'series_id': podcast_id,
'series': traverse_obj(metadata, ('profiles', podcast_id, 'title')),
'formats': formats,
'subtitles': subtitles,
}
class TuneInShortenerIE(InfoExtractor): class TuneInShortenerIE(InfoExtractor):
_WORKING = False
IE_NAME = 'tunein:shortener' IE_NAME = 'tunein:shortener'
IE_DESC = False # Do not list IE_DESC = False # Do not list
_VALID_URL = r'https?://tun\.in/(?P<id>[^/?#]+)' _VALID_URL = r'https?://tun\.in/(?P<id>[A-Za-z0-9]+)'
_TESTS = [{ _TESTS = [{
# test redirection
'url': 'http://tun.in/ser7s', 'url': 'http://tun.in/ser7s',
'info_dict': { 'info_dict': {
'id': 's34682', 'id': 's34682',
'title': str, 'title': str,
'ext': 'mp3',
'alt_title': 'World Class Jazz',
'channel_follower_count': int,
'description': 'md5:d6d0b89063fd68d529fa7058ee98619b', 'description': 'md5:d6d0b89063fd68d529fa7058ee98619b',
'location': r're:Seattle-Tacoma, (?:US|WA)', 'thumbnail': r're:https?://cdn-profiles\.tunein\.com/.+',
'live_status': 'is_live', 'location': 'Seattle-Tacoma, US',
'thumbnail': r're:https?://.+',
},
'params': {'skip_download': 'Livestream'},
}, {
'url': 'http://tun.in/tqeeFw',
'info_dict': {
'id': 't236404354',
'title': str,
'ext': 'mp3', 'ext': 'mp3',
'alt_title': 'Technology Podcasts >', 'live_status': 'is_live',
'cast': 'count:1',
'description': 'md5:1029895354ef073ff00f20b82eb6eb71',
'display_id': '236404354',
'duration': 8330,
'series': 'Lex Fridman Podcast',
'series_id': 'p1153019',
'thumbnail': r're:https?://.+',
'timestamp': 1673458571,
'upload_date': '20230111',
}, },
'params': {'skip_download': 'Livestream'}, 'params': {
}, { 'skip_download': True, # live stream
'url': 'http://tun.in/pei6i',
'info_dict': {
'id': 'p14',
'title': 'BBC News',
}, },
'playlist_mincount': 35,
}] }]
def _real_extract(self, url): def _real_extract(self, url):
redirect_id = self._match_id(url) redirect_id = self._match_id(url)
# The server doesn't support HEAD requests # The server doesn't support HEAD requests
urlh = self._request_webpage(url, redirect_id, 'Downloading redirect page') urlh = self._request_webpage(
# Need to strip port from URL url, redirect_id, note='Downloading redirect page')
parsed = urllib.parse.urlparse(urlh.url)
new_url = parsed._replace(netloc=parsed.hostname).geturl() url = urlh.url
# Prevent infinite loop in case redirect fails url_parsed = urllib.parse.urlparse(url)
if self.suitable(new_url): if url_parsed.port == 443:
raise UnsupportedError(new_url) url = url_parsed._replace(netloc=url_parsed.hostname).url
return self.url_result(new_url)
self.to_screen(f'Following redirect: {url}')
return self.url_result(url)

119
yt_dlp/extractor/vesti.py Normal file
View File

@ -0,0 +1,119 @@
import re
from .common import InfoExtractor
from .rutv import RUTVIE
from ..utils import ExtractorError
class VestiIE(InfoExtractor):
_WORKING = False
IE_DESC = 'Вести.Ru'
_VALID_URL = r'https?://(?:.+?\.)?vesti\.ru/(?P<id>.+)'
_TESTS = [
{
'url': 'http://www.vesti.ru/videos?vid=575582&cid=1',
'info_dict': {
'id': '765035',
'ext': 'mp4',
'title': 'Вести.net: биткоины в России не являются законными',
'description': 'md5:d4bb3859dc1177b28a94c5014c35a36b',
'duration': 302,
},
'params': {
# m3u8 download
'skip_download': True,
},
},
{
'url': 'http://www.vesti.ru/doc.html?id=1349233',
'info_dict': {
'id': '773865',
'ext': 'mp4',
'title': 'Участники митинга штурмуют Донецкую областную администрацию',
'description': 'md5:1a160e98b3195379b4c849f2f4958009',
'duration': 210,
},
'params': {
# m3u8 download
'skip_download': True,
},
},
{
'url': 'http://www.vesti.ru/only_video.html?vid=576180',
'info_dict': {
'id': '766048',
'ext': 'mp4',
'title': 'США заморозило, Британию затопило',
'description': 'md5:f0ed0695ec05aed27c56a70a58dc4cc1',
'duration': 87,
},
'params': {
# m3u8 download
'skip_download': True,
},
},
{
'url': 'http://hitech.vesti.ru/news/view/id/4000',
'info_dict': {
'id': '766888',
'ext': 'mp4',
'title': 'Вести.net: интернет-гиганты начали перетягивание программных "одеял"',
'description': 'md5:65ddd47f9830c4f42ed6475f8730c995',
'duration': 279,
},
'params': {
# m3u8 download
'skip_download': True,
},
},
{
'url': 'http://sochi2014.vesti.ru/video/index/video_id/766403',
'info_dict': {
'id': '766403',
'ext': 'mp4',
'title': 'XXII зимние Олимпийские игры. Российские хоккеисты стартовали на Олимпиаде с победы',
'description': 'md5:55805dfd35763a890ff50fa9e35e31b3',
'duration': 271,
},
'params': {
# m3u8 download
'skip_download': True,
},
'skip': 'Blocked outside Russia',
},
{
'url': 'http://sochi2014.vesti.ru/live/play/live_id/301',
'info_dict': {
'id': '51499',
'ext': 'flv',
'title': 'Сочи-2014. Биатлон. Индивидуальная гонка. Мужчины ',
'description': 'md5:9e0ed5c9d2fa1efbfdfed90c9a6d179c',
},
'params': {
# rtmp download
'skip_download': True,
},
'skip': 'Translation has finished',
},
]
def _real_extract(self, url):
mobj = self._match_valid_url(url)
video_id = mobj.group('id')
page = self._download_webpage(url, video_id, 'Downloading page')
mobj = re.search(
r'<meta[^>]+?property="og:video"[^>]+?content="http://www\.vesti\.ru/i/flvplayer_videoHost\.swf\?vid=(?P<id>\d+)',
page)
if mobj:
video_id = mobj.group('id')
page = self._download_webpage(f'http://www.vesti.ru/only_video.html?vid={video_id}', video_id,
'Downloading video page')
rutv_url = RUTVIE._extract_url(page)
if rutv_url:
return self.url_result(rutv_url, 'RUTV')
raise ExtractorError('No video found', expected=True)