[ie/TheHighWire] Add extractor (#13505)

Closes #13364 Authored by: swayll
[ie/archive.org] Fix extractor (#13706)
2026-06-15 15:25:07 +00:00 · 2025-07-14 19:01:53 +00:00 · 2025-07-14 18:55:52 +00:00
3 changed files with 53 additions and 9 deletions
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -2092,6 +2092,7 @@ from .theguardian import (
    TheGuardianPodcastIE,
    TheGuardianPodcastPlaylistIE,
 )
+from .thehighwire import TheHighWireIE
 from .theholetv import TheHoleTvIE
 from .theintercept import TheInterceptIE
 from .theplatform import (
--- a/yt_dlp/extractor/archiveorg.py
+++ b/yt_dlp/extractor/archiveorg.py
@@ -16,6 +16,7 @@ from ..utils import (
    dict_get,
    extract_attributes,
    get_element_by_id,
+    get_element_text_and_html_by_tag,
    int_or_none,
    join_nonempty,
    js_to_json,
@@ -72,6 +73,7 @@ class ArchiveOrgIE(InfoExtractor):
            'display_id': 'Cops-v2.mp4',
            'thumbnail': r're:https://archive\.org/download/.*\.jpg',
            'duration': 1091.96,
+            'track': 'Cops-v2',
        },
    }, {
        'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
@@ -86,6 +88,7 @@ class ArchiveOrgIE(InfoExtractor):
            'thumbnail': r're:https://archive\.org/download/.*\.jpg',
            'duration': 59.77,
            'display_id': 'Commercial-JFK1960ElectionAdCampaignJingle.mpg',
+            'track': 'Commercial-JFK1960ElectionAdCampaignJingle',
        },
    }, {
        'url': 'https://archive.org/details/Election_Ads/Commercial-Nixon1960ElectionAdToughonDefense.mpg',
@@ -102,6 +105,7 @@ class ArchiveOrgIE(InfoExtractor):
            'duration': 59.51,
            'license': 'http://creativecommons.org/licenses/publicdomain/',
            'thumbnail': r're:https://archive\.org/download/.*\.jpg',
+            'track': 'Commercial-Nixon1960ElectionAdToughonDefense',
        },
    }, {
        'url': 'https://archive.org/details/gd1977-05-08.shure57.stevenson.29303.flac16',
@@ -182,6 +186,7 @@ class ArchiveOrgIE(InfoExtractor):
                    'duration': 130.46,
                    'thumbnail': 'https://archive.org/download/irelandthemakingofarepublic/irelandthemakingofarepublic.thumbs/irelandthemakingofarepublicreel1_01_000117.jpg',
                    'display_id': 'irelandthemakingofarepublicreel1_01.mov',
+                    'track': 'irelandthemakingofarepublicreel1 01',
                },
            }, {
                'md5': '67335ee3b23a0da930841981c1e79b02',
@@ -192,6 +197,7 @@ class ArchiveOrgIE(InfoExtractor):
                    'title': 'irelandthemakingofarepublicreel1_02.mov',
                    'display_id': 'irelandthemakingofarepublicreel1_02.mov',
                    'thumbnail': 'https://archive.org/download/irelandthemakingofarepublic/irelandthemakingofarepublic.thumbs/irelandthemakingofarepublicreel1_02_001374.jpg',
+                    'track': 'irelandthemakingofarepublicreel1 02',
                },
            }, {
                'md5': 'e470e86787893603f4a341a16c281eb5',
@@ -202,6 +208,7 @@ class ArchiveOrgIE(InfoExtractor):
                    'title': 'irelandthemakingofarepublicreel2.mov',
                    'thumbnail': 'https://archive.org/download/irelandthemakingofarepublic/irelandthemakingofarepublic.thumbs/irelandthemakingofarepublicreel2_001554.jpg',
                    'display_id': 'irelandthemakingofarepublicreel2.mov',
+                    'track': 'irelandthemakingofarepublicreel2',
                },
            },
        ],
@@ -229,15 +236,8 @@ class ArchiveOrgIE(InfoExtractor):

    @staticmethod
    def _playlist_data(webpage):
-        element = re.findall(r'''(?xs)
-            <input
-            (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
-            \s+class=['"]?js-play8-playlist['"]?
-            (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
-            \s*/>
-        ''', webpage)[0]
-
-        return json.loads(extract_attributes(element)['value'])
+        element = get_element_text_and_html_by_tag('play-av', webpage)[1]
+        return json.loads(extract_attributes(element)['playlist'])

    def _real_extract(self, url):
        video_id = urllib.parse.unquote_plus(self._match_id(url))
--- a/yt_dlp/extractor/thehighwire.py
+++ b/yt_dlp/extractor/thehighwire.py
@@ -0,0 +1,43 @@
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    extract_attributes,
+    url_or_none,
+)
+from ..utils.traversal import (
+    find_element,
+    require,
+    traverse_obj,
+)
+
+
+class TheHighWireIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?thehighwire\.com/ark-videos/(?P<id>[^/?#]+)'
+    _TESTS = [{
+        'url': 'https://thehighwire.com/ark-videos/the-deposition-of-stanley-plotkin/',
+        'info_dict': {
+            'id': 'the-deposition-of-stanley-plotkin',
+            'ext': 'mp4',
+            'title': 'THE DEPOSITION OF STANLEY PLOTKIN',
+            'description': 'md5:6d0be4f1181daaa10430fd8b945a5e54',
+            'thumbnail': r're:https?://static\.arkengine\.com/video/.+\.jpg',
+        },
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+
+        embed_url = traverse_obj(webpage, (
+            {find_element(cls='ark-video-embed', html=True)},
+            {extract_attributes}, 'src', {url_or_none}, {require('embed URL')}))
+        embed_page = self._download_webpage(embed_url, display_id)
+
+        return {
+            'id': display_id,
+            **traverse_obj(webpage, {
+                'title': ({find_element(cls='section-header')}, {clean_html}),
+                'description': ({find_element(cls='episode-description__copy')}, {clean_html}),
+            }),
+            **self._parse_html5_media_entries(embed_url, embed_page, display_id, m3u8_id='hls')[0],
+        }
Author	SHA1	Message	Date
Nikolay Fedorov	3a84be9d16	[ie/TheHighWire] Add extractor (#13505) Closes #13364 Authored by: swayll	2025-07-14 19:01:53 +00:00
rdamas	d42a6ff0c4	[ie/archive.org] Fix extractor (#13706) Closes #13704 Authored by: rdamas	2025-07-14 18:55:52 +00:00