Compare commits

...

6 Commits

Author SHA1 Message Date
dirkf 4416f82c80 [Vbox7IE] Sanitise ld+json containing unexpected characters 4 months ago
dirkf bdda6b81df [Vbox7IE] Improve extraction 4 months ago
dirkf 1fd8f802b8 [InfoExtractor] Correctly resolve BaseURL in DASH manifest 4 months ago
dirkf 4eaeb9b2c6 [InfoExtractor] Support byte range for DASH 4 months ago
dirkf bec9180e89 [downloader/dash] Support `range` in fragment (format f'{start}-{end}') 4 months ago
dirkf c58b655a9e [InfoExtractor] Support DASH subtitle extraction (yt-dlp back-port) 4 months ago
  1. 179
      test/test_InfoExtractor.py
  2. 35
      test/testdata/mpd/range_only.mpd
  3. 351
      test/testdata/mpd/subtitles.mpd
  4. 32
      test/testdata/mpd/url_and_range.mpd
  5. 9
      youtube_dl/downloader/dash.py
  6. 457
      youtube_dl/extractor/common.py
  7. 151
      youtube_dl/extractor/vbox7.py

@ -993,7 +993,8 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
'tbr': 5997.485, 'tbr': 5997.485,
'width': 1920, 'width': 1920,
'height': 1080, 'height': 1080,
}] }],
{},
), ( ), (
# https://github.com/ytdl-org/youtube-dl/pull/14844 # https://github.com/ytdl-org/youtube-dl/pull/14844
'urls_only', 'urls_only',
@ -1076,7 +1077,8 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
'tbr': 4400, 'tbr': 4400,
'width': 1920, 'width': 1920,
'height': 1080, 'height': 1080,
}] }],
{},
), ( ), (
# https://github.com/ytdl-org/youtube-dl/issues/20346 # https://github.com/ytdl-org/youtube-dl/issues/20346
# Media considered unfragmented even though it contains # Media considered unfragmented even though it contains
@ -1122,18 +1124,185 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
'width': 360, 'width': 360,
'height': 360, 'height': 360,
'fps': 30, 'fps': 30,
}] }],
{},
), (
# https://github.com/ytdl-org/youtube-dl/issues/30235
# Bento4 generated test mpd
# mp4dash --mpd-name=manifest.mpd --no-split --use-segment-list mediafiles
'url_and_range',
'http://unknown/manifest.mpd', # mpd_url
'http://unknown/', # mpd_base_url
[{
'manifest_url': 'http://unknown/manifest.mpd',
'fragment_base_url': 'http://unknown/',
'ext': 'm4a',
'format_id': 'audio-und-mp4a.40.2',
'format_note': 'DASH audio',
'container': 'm4a_dash',
'protocol': 'http_dash_segments',
'acodec': 'mp4a.40.2',
'vcodec': 'none',
'tbr': 98.808,
}, {
'manifest_url': 'http://unknown/manifest.mpd',
'fragment_base_url': 'http://unknown/',
'ext': 'mp4',
'format_id': 'video-avc1',
'format_note': 'DASH video',
'container': 'mp4_dash',
'protocol': 'http_dash_segments',
'acodec': 'none',
'vcodec': 'avc1.4D401E',
'tbr': 699.597,
'width': 768,
'height': 432
}],
{},
), (
# https://github.com/ytdl-org/youtube-dl/issues/27575
# GPAC generated test mpd
# MP4Box -dash 10000 -single-file -out manifest.mpd mediafiles
'range_only',
'http://unknown/manifest.mpd', # mpd_url
'http://unknown/', # mpd_base_url
[{
'manifest_url': 'http://unknown/manifest.mpd',
'fragment_base_url': 'http://unknown/audio_dashinit.mp4',
'ext': 'm4a',
'format_id': '2',
'format_note': 'DASH audio',
'container': 'm4a_dash',
'protocol': 'http_dash_segments',
'acodec': 'mp4a.40.2',
'vcodec': 'none',
'tbr': 98.096,
}, {
'manifest_url': 'http://unknown/manifest.mpd',
'fragment_base_url': 'http://unknown/video_dashinit.mp4',
'ext': 'mp4',
'format_id': '1',
'format_note': 'DASH video',
'container': 'mp4_dash',
'protocol': 'http_dash_segments',
'acodec': 'none',
'vcodec': 'avc1.4D401E',
'tbr': 526.987,
'width': 768,
'height': 432
}],
{},
), (
'subtitles',
'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/',
[{
'format_id': 'audio=128001',
'manifest_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'ext': 'm4a',
'tbr': 128.001,
'asr': 48000,
'format_note': 'DASH audio',
'container': 'm4a_dash',
'vcodec': 'none',
'acodec': 'mp4a.40.2',
'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'fragment_base_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/',
'protocol': 'http_dash_segments',
}, {
'format_id': 'video=100000',
'manifest_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'ext': 'mp4',
'width': 336,
'height': 144,
'tbr': 100,
'format_note': 'DASH video',
'container': 'mp4_dash',
'vcodec': 'avc1.4D401F',
'acodec': 'none',
'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'fragment_base_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/',
'protocol': 'http_dash_segments',
}, {
'format_id': 'video=326000',
'manifest_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'ext': 'mp4',
'width': 562,
'height': 240,
'tbr': 326,
'format_note': 'DASH video',
'container': 'mp4_dash',
'vcodec': 'avc1.4D401F',
'acodec': 'none',
'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'fragment_base_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/',
'protocol': 'http_dash_segments',
}, {
'format_id': 'video=698000',
'manifest_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'ext': 'mp4',
'width': 844,
'height': 360,
'tbr': 698,
'format_note': 'DASH video',
'container': 'mp4_dash',
'vcodec': 'avc1.4D401F',
'acodec': 'none',
'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'fragment_base_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/',
'protocol': 'http_dash_segments',
}, {
'format_id': 'video=1493000',
'manifest_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'ext': 'mp4',
'width': 1126,
'height': 480,
'tbr': 1493,
'format_note': 'DASH video',
'container': 'mp4_dash',
'vcodec': 'avc1.4D401F',
'acodec': 'none',
'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'fragment_base_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/',
'protocol': 'http_dash_segments',
}, {
'format_id': 'video=4482000',
'manifest_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'ext': 'mp4',
'width': 1688,
'height': 720,
'tbr': 4482,
'format_note': 'DASH video',
'container': 'mp4_dash',
'vcodec': 'avc1.4D401F',
'acodec': 'none',
'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'fragment_base_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/',
'protocol': 'http_dash_segments',
}],
{
'en': [
{
'ext': 'mp4',
'manifest_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'fragment_base_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/',
'protocol': 'http_dash_segments',
}
]
},
) )
] ]
for mpd_file, mpd_url, mpd_base_url, expected_formats in _TEST_CASES: for mpd_file, mpd_url, mpd_base_url, expected_formats, expected_subtitles in _TEST_CASES:
with open('./test/testdata/mpd/%s.mpd' % mpd_file, with open('./test/testdata/mpd/%s.mpd' % mpd_file,
mode='r', encoding='utf-8') as f: mode='r', encoding='utf-8') as f:
formats = self.ie._parse_mpd_formats( formats, subtitles = self.ie._parse_mpd_formats_and_subtitles(
compat_etree_fromstring(f.read().encode('utf-8')), compat_etree_fromstring(f.read().encode('utf-8')),
mpd_base_url=mpd_base_url, mpd_url=mpd_url) mpd_base_url=mpd_base_url, mpd_url=mpd_url)
self.ie._sort_formats(formats) self.ie._sort_formats(formats)
expect_value(self, formats, expected_formats, None) expect_value(self, formats, expected_formats, None)
expect_value(self, subtitles, expected_subtitles, None)
def test_parse_f4m_formats(self): def test_parse_f4m_formats(self):
_TEST_CASES = [ _TEST_CASES = [

@ -0,0 +1,35 @@
<?xml version="1.0"?>
<!-- MPD file Generated with GPAC version 1.0.1-revrelease at 2021-11-27T20:53:11.690Z -->
<MPD xmlns="urn:mpeg:dash:schema:mpd:2011" minBufferTime="PT1.500S" type="static" mediaPresentationDuration="PT0H0M30.196S" maxSegmentDuration="PT0H0M10.027S" profiles="urn:mpeg:dash:profile:full:2011">
<ProgramInformation moreInformationURL="http://gpac.io">
<Title>manifest.mpd generated by GPAC</Title>
</ProgramInformation>
<Period duration="PT0H0M30.196S">
<AdaptationSet segmentAlignment="true" maxWidth="768" maxHeight="432" maxFrameRate="30000/1001" par="16:9" lang="und" startWithSAP="1">
<Representation id="1" mimeType="video/mp4" codecs="avc1.4D401E" width="768" height="432" frameRate="30000/1001" sar="1:1" bandwidth="526987">
<BaseURL>video_dashinit.mp4</BaseURL>
<SegmentList timescale="90000" duration="900000">
<Initialization range="0-881"/>
<SegmentURL mediaRange="882-876094" indexRange="882-925"/>
<SegmentURL mediaRange="876095-1466732" indexRange="876095-876138"/>
<SegmentURL mediaRange="1466733-1953615" indexRange="1466733-1466776"/>
<SegmentURL mediaRange="1953616-1994211" indexRange="1953616-1953659"/>
</SegmentList>
</Representation>
</AdaptationSet>
<AdaptationSet segmentAlignment="true" lang="und" startWithSAP="1">
<Representation id="2" mimeType="audio/mp4" codecs="mp4a.40.2" audioSamplingRate="48000" bandwidth="98096">
<AudioChannelConfiguration schemeIdUri="urn:mpeg:dash:23003:3:audio_channel_configuration:2011" value="2"/>
<BaseURL>audio_dashinit.mp4</BaseURL>
<SegmentList timescale="48000" duration="480000">
<Initialization range="0-752"/>
<SegmentURL mediaRange="753-124129" indexRange="753-796"/>
<SegmentURL mediaRange="124130-250544" indexRange="124130-124173"/>
<SegmentURL mediaRange="250545-374929" indexRange="250545-250588"/>
</SegmentList>
</Representation>
</AdaptationSet>
</Period>
</MPD>

@ -0,0 +1,351 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- Created with Unified Streaming Platform (version=1.10.18-20255) -->
<MPD
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns="urn:mpeg:dash:schema:mpd:2011"
xsi:schemaLocation="urn:mpeg:dash:schema:mpd:2011 http://standards.iso.org/ittf/PubliclyAvailableStandards/MPEG-DASH_schema_files/DASH-MPD.xsd"
type="static"
mediaPresentationDuration="PT14M48S"
maxSegmentDuration="PT1M"
minBufferTime="PT10S"
profiles="urn:mpeg:dash:profile:isoff-live:2011">
<Period
id="1"
duration="PT14M48S">
<BaseURL>dash/</BaseURL>
<AdaptationSet
id="1"
group="1"
contentType="audio"
segmentAlignment="true"
audioSamplingRate="48000"
mimeType="audio/mp4"
codecs="mp4a.40.2"
startWithSAP="1">
<AudioChannelConfiguration
schemeIdUri="urn:mpeg:dash:23003:3:audio_channel_configuration:2011"
value="2" />
<Role schemeIdUri="urn:mpeg:dash:role:2011" value="main" />
<SegmentTemplate
timescale="48000"
initialization="3144-kZT4LWMQw6Rh7Kpd-$RepresentationID$.dash"
media="3144-kZT4LWMQw6Rh7Kpd-$RepresentationID$-$Time$.dash">
<SegmentTimeline>
<S t="0" d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="3584" />
</SegmentTimeline>
</SegmentTemplate>
<Representation
id="audio=128001"
bandwidth="128001">
</Representation>
</AdaptationSet>
<AdaptationSet
id="2"
group="3"
contentType="text"
lang="en"
mimeType="application/mp4"
codecs="stpp"
startWithSAP="1">
<Role schemeIdUri="urn:mpeg:dash:role:2011" value="subtitle" />
<SegmentTemplate
timescale="1000"
initialization="3144-kZT4LWMQw6Rh7Kpd-$RepresentationID$.dash"
media="3144-kZT4LWMQw6Rh7Kpd-$RepresentationID$-$Time$.dash">
<SegmentTimeline>
<S t="0" d="60000" r="9" />
<S d="24000" />
</SegmentTimeline>
</SegmentTemplate>
<Representation
id="textstream_eng=1000"
bandwidth="1000">
</Representation>
</AdaptationSet>
<AdaptationSet
id="3"
group="2"
contentType="video"
par="960:409"
minBandwidth="100000"
maxBandwidth="4482000"
maxWidth="1689"
maxHeight="720"
segmentAlignment="true"
mimeType="video/mp4"
codecs="avc1.4D401F"
startWithSAP="1">
<Role schemeIdUri="urn:mpeg:dash:role:2011" value="main" />
<SegmentTemplate
timescale="12288"
initialization="3144-kZT4LWMQw6Rh7Kpd-$RepresentationID$.dash"
media="3144-kZT4LWMQw6Rh7Kpd-$RepresentationID$-$Time$.dash">
<SegmentTimeline>
<S t="0" d="24576" r="443" />
</SegmentTimeline>
</SegmentTemplate>
<Representation
id="video=100000"
bandwidth="100000"
width="336"
height="144"
sar="2880:2863"
scanType="progressive">
</Representation>
<Representation
id="video=326000"
bandwidth="326000"
width="562"
height="240"
sar="115200:114929"
scanType="progressive">
</Representation>
<Representation
id="video=698000"
bandwidth="698000"
width="844"
height="360"
sar="86400:86299"
scanType="progressive">
</Representation>
<Representation
id="video=1493000"
bandwidth="1493000"
width="1126"
height="480"
sar="230400:230267"
scanType="progressive">
</Representation>
<Representation
id="video=4482000"
bandwidth="4482000"
width="1688"
height="720"
sar="86400:86299"
scanType="progressive">
</Representation>
</AdaptationSet>
</Period>
</MPD>

@ -0,0 +1,32 @@
<?xml version="1.0" ?>
<MPD xmlns="urn:mpeg:dash:schema:mpd:2011" profiles="urn:mpeg:dash:profile:isoff-live:2011" minBufferTime="PT10.01S" mediaPresentationDuration="PT30.097S" type="static">
<!-- Created with Bento4 mp4-dash.py, VERSION=2.0.0-639 -->
<Period>
<!-- Video -->
<AdaptationSet mimeType="video/mp4" segmentAlignment="true" startWithSAP="1" maxWidth="768" maxHeight="432">
<Representation id="video-avc1" codecs="avc1.4D401E" width="768" height="432" scanType="progressive" frameRate="30000/1001" bandwidth="699597">
<SegmentList timescale="1000" duration="10010">
<Initialization sourceURL="video-frag.mp4" range="36-746"/>
<SegmentURL media="video-frag.mp4" mediaRange="747-876117"/>
<SegmentURL media="video-frag.mp4" mediaRange="876118-1466913"/>
<SegmentURL media="video-frag.mp4" mediaRange="1466914-1953954"/>
<SegmentURL media="video-frag.mp4" mediaRange="1953955-1994652"/>
</SegmentList>
</Representation>
</AdaptationSet>
<!-- Audio -->
<AdaptationSet mimeType="audio/mp4" startWithSAP="1" segmentAlignment="true">
<Representation id="audio-und-mp4a.40.2" codecs="mp4a.40.2" bandwidth="98808" audioSamplingRate="48000">
<AudioChannelConfiguration schemeIdUri="urn:mpeg:mpegB:cicp:ChannelConfiguration" value="2"/>
<SegmentList timescale="1000" duration="10010">
<Initialization sourceURL="audio-frag.mp4" range="32-623"/>
<SegmentURL media="audio-frag.mp4" mediaRange="624-124199"/>
<SegmentURL media="audio-frag.mp4" mediaRange="124200-250303"/>
<SegmentURL media="audio-frag.mp4" mediaRange="250304-374365"/>
<SegmentURL media="audio-frag.mp4" mediaRange="374366-374836"/>
</SegmentList>
</Representation>
</AdaptationSet>
</Period>
</MPD>

@ -35,6 +35,7 @@ class DashSegmentsFD(FragmentFD):
for frag_index, fragment in enumerate(fragments, 1): for frag_index, fragment in enumerate(fragments, 1):
if frag_index <= ctx['fragment_index']: if frag_index <= ctx['fragment_index']:
continue continue
success = False
# In DASH, the first segment contains necessary headers to # In DASH, the first segment contains necessary headers to
# generate a valid MP4 file, so always abort for the first segment # generate a valid MP4 file, so always abort for the first segment
fatal = frag_index == 1 or not skip_unavailable_fragments fatal = frag_index == 1 or not skip_unavailable_fragments
@ -42,10 +43,14 @@ class DashSegmentsFD(FragmentFD):
if not fragment_url: if not fragment_url:
assert fragment_base_url assert fragment_base_url
fragment_url = urljoin(fragment_base_url, fragment['path']) fragment_url = urljoin(fragment_base_url, fragment['path'])
success = False headers = info_dict.get('http_headers')
fragment_range = fragment.get('range')
if fragment_range:
headers = headers.copy() if headers else {}
headers['Range'] = 'bytes=%s' % (fragment_range,)
for count in itertools.count(): for count in itertools.count():
try: try:
success, frag_content = self._download_fragment(ctx, fragment_url, info_dict) success, frag_content = self._download_fragment(ctx, fragment_url, info_dict, headers)
if not success: if not success:
return False return False
self._append_fragment(ctx, frag_content) self._append_fragment(ctx, frag_content)

@ -2,6 +2,7 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import base64 import base64
import collections
import datetime import datetime
import functools import functools
import hashlib import hashlib
@ -58,6 +59,7 @@ from ..utils import (
GeoRestrictedError, GeoRestrictedError,
GeoUtils, GeoUtils,
int_or_none, int_or_none,
join_nonempty,
js_to_json, js_to_json,
JSON_LD_RE, JSON_LD_RE,
mimetype2ext, mimetype2ext,
@ -74,6 +76,7 @@ from ..utils import (
str_or_none, str_or_none,
str_to_int, str_to_int,
strip_or_none, strip_or_none,
T,
traverse_obj, traverse_obj,
try_get, try_get,
unescapeHTML, unescapeHTML,
@ -180,6 +183,8 @@ class InfoExtractor(object):
fragment_base_url fragment_base_url
* "duration" (optional, int or float) * "duration" (optional, int or float)
* "filesize" (optional, int) * "filesize" (optional, int)
* "range" (optional, str of the form "start-end"
to use in HTTP Range header)
* preference Order number of this format. If this field is * preference Order number of this format. If this field is
present and not None, the formats get sorted present and not None, the formats get sorted
by this field, regardless of all other values. by this field, regardless of all other values.
@ -1751,6 +1756,12 @@ class InfoExtractor(object):
'format_note': 'Quality selection URL', 'format_note': 'Quality selection URL',
} }
def _report_ignoring_subs(self, name):
    """Warn that subtitle tracks found in a manifest are being dropped.

    `name` is the manifest kind inserted into the message (callers in
    this file pass 'DASH').  The text is routed through
    bug_reports_message() before being handed to report_warning(),
    which is called with only_once=True.
    """
    # NOTE(review): bug_reports_message() presumably completes the
    # sentence with the usual "please report" hint - confirm in utils.
    preamble = (
        'Ignoring subtitle tracks found in the {0} manifest; '
        'if any subtitle tracks are missing,'
    ).format(name)
    self.report_warning(bug_reports_message(preamble), only_once=True)
def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
entry_protocol='m3u8', preference=None, entry_protocol='m3u8', preference=None,
m3u8_id=None, note=None, errnote=None, m3u8_id=None, note=None, errnote=None,
@ -2191,23 +2202,46 @@ class InfoExtractor(object):
}) })
return entries return entries
def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}): def _extract_mpd_formats(self, *args, **kwargs):
fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
if subs:
self._report_ignoring_subs('DASH')
return fmts
def _extract_mpd_formats_and_subtitles(
self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
fatal=True, data=None, headers=None, query=None):
# TODO: or not? param not yet implemented
if self.get_param('ignore_no_formats_error'):
fatal = False
res = self._download_xml_handle( res = self._download_xml_handle(
mpd_url, video_id, mpd_url, video_id,
note=note or 'Downloading MPD manifest', note='Downloading MPD manifest' if note is None else note,
errnote=errnote or 'Failed to download MPD manifest', errnote='Failed to download MPD manifest' if errnote is None else errnote,
fatal=fatal, data=data, headers=headers, query=query) fatal=fatal, data=data, headers=headers or {}, query=query or {})
if res is False: if res is False:
return [] return [], {}
mpd_doc, urlh = res mpd_doc, urlh = res
if mpd_doc is None: if mpd_doc is None:
return [] return [], {}
mpd_base_url = base_url(urlh.geturl())
# We could have been redirected to a new url when we retrieved our mpd file.
mpd_url = urlh.geturl()
mpd_base_url = base_url(mpd_url)
return self._parse_mpd_formats( return self._parse_mpd_formats_and_subtitles(
mpd_doc, mpd_id, mpd_base_url, mpd_url) mpd_doc, mpd_id, mpd_base_url, mpd_url)
def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None): def _parse_mpd_formats(self, *args, **kwargs):
fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
if subs:
self._report_ignoring_subs('DASH')
return fmts
def _parse_mpd_formats_and_subtitles(
self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
""" """
Parse formats from MPD manifest. Parse formats from MPD manifest.
References: References:
@ -2215,8 +2249,10 @@ class InfoExtractor(object):
http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
""" """
if mpd_doc.get('type') == 'dynamic': # TODO: param not yet implemented: default like previous yt-dl logic
return [] if not self.get_param('dynamic_mpd', False):
if mpd_doc.get('type') == 'dynamic':
return [], {}
namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None) namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
@ -2226,8 +2262,24 @@ class InfoExtractor(object):
def is_drm_protected(element): def is_drm_protected(element):
return element.find(_add_ns('ContentProtection')) is not None return element.find(_add_ns('ContentProtection')) is not None
from ..utils import YoutubeDLHandler
fix_path = YoutubeDLHandler._fix_path
def resolve_base_url(element, parent_base_url=None):
    """Return the effective base URL for `element`.

    The text of the element's own <BaseURL> child (if any) is joined
    onto `parent_base_url` and then passed through fix_path().  When
    the element has no usable <BaseURL> text, `parent_base_url` is
    returned unchanged.
    """
    # TODO: use native XML traversal when ready
    def base_url_text(node):
        # find() gives None when there is no <BaseURL> child; the
        # resulting AttributeError is left for traverse_obj to handle
        return node.find(_add_ns('BaseURL')).text

    own_url = traverse_obj(element, T(base_url_text))
    if not own_url:
        # Nothing declared at this level: inherit from the parent
        return parent_base_url

    if parent_base_url:
        # Make sure the parent is treated as a directory by urljoin,
        # unless it already ends with '/' or ':'
        if parent_base_url[-1] not in ('/', ':'):
            parent_base_url += '/'
        own_url = compat_urlparse.urljoin(parent_base_url, own_url)
    if own_url:
        own_url = fix_path(own_url)
    return own_url or parent_base_url
def extract_multisegment_info(element, ms_parent_info): def extract_multisegment_info(element, ms_parent_info):
ms_info = ms_parent_info.copy() ms_info = ms_parent_info.copy()
base_url = ms_info['base_url'] = resolve_base_url(element, ms_info.get('base_url'))
# As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
# common attributes and elements. We will only extract relevant # common attributes and elements. We will only extract relevant
@ -2261,15 +2313,27 @@ class InfoExtractor(object):
def extract_Initialization(source): def extract_Initialization(source):
initialization = source.find(_add_ns('Initialization')) initialization = source.find(_add_ns('Initialization'))
if initialization is not None: if initialization is not None:
ms_info['initialization_url'] = initialization.attrib['sourceURL'] ms_info['initialization_url'] = initialization.get('sourceURL') or base_url
initialization_url_range = initialization.get('range')
if initialization_url_range:
ms_info['initialization_url_range'] = initialization_url_range
segment_list = element.find(_add_ns('SegmentList')) segment_list = element.find(_add_ns('SegmentList'))
if segment_list is not None: if segment_list is not None:
extract_common(segment_list) extract_common(segment_list)
extract_Initialization(segment_list) extract_Initialization(segment_list)
segment_urls_e = segment_list.findall(_add_ns('SegmentURL')) segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
if segment_urls_e: segment_urls = traverse_obj(segment_urls_e, (
ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e] Ellipsis, T(lambda e: e.attrib), 'media'))
if segment_urls:
ms_info['segment_urls'] = segment_urls
segment_urls_range = traverse_obj(segment_urls_e, (
Ellipsis, T(lambda e: e.attrib), 'mediaRange',
T(lambda r: re.findall(r'^\d+-\d+$', r)), 0))
if segment_urls_range:
ms_info['segment_urls_range'] = segment_urls_range
if not segment_urls:
ms_info['segment_urls'] = [base_url for _ in segment_urls_range]
else: else:
segment_template = element.find(_add_ns('SegmentTemplate')) segment_template = element.find(_add_ns('SegmentTemplate'))
if segment_template is not None: if segment_template is not None:
@ -2285,17 +2349,20 @@ class InfoExtractor(object):
return ms_info return ms_info
mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration')) mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
formats = [] formats, subtitles = [], {}
stream_numbers = collections.defaultdict(int)
mpd_base_url = resolve_base_url(mpd_doc, mpd_base_url or mpd_url)
for period in mpd_doc.findall(_add_ns('Period')): for period in mpd_doc.findall(_add_ns('Period')):
period_duration = parse_duration(period.get('duration')) or mpd_duration period_duration = parse_duration(period.get('duration')) or mpd_duration
period_ms_info = extract_multisegment_info(period, { period_ms_info = extract_multisegment_info(period, {
'start_number': 1, 'start_number': 1,
'timescale': 1, 'timescale': 1,
'base_url': mpd_base_url,
}) })
for adaptation_set in period.findall(_add_ns('AdaptationSet')): for adaptation_set in period.findall(_add_ns('AdaptationSet')):
if is_drm_protected(adaptation_set): if is_drm_protected(adaptation_set):
continue continue
adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info) adaptation_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
for representation in adaptation_set.findall(_add_ns('Representation')): for representation in adaptation_set.findall(_add_ns('Representation')):
if is_drm_protected(representation): if is_drm_protected(representation):
continue continue
@ -2303,27 +2370,35 @@ class InfoExtractor(object):
representation_attrib.update(representation.attrib) representation_attrib.update(representation.attrib)
# According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
mime_type = representation_attrib['mimeType'] mime_type = representation_attrib['mimeType']
content_type = mime_type.split('/')[0] content_type = representation_attrib.get('contentType') or mime_type.split('/')[0]
if content_type == 'text': codec_str = representation_attrib.get('codecs', '')
# TODO implement WebVTT downloading # Some kind of binary subtitle found in some youtube livestreams
pass if mime_type == 'application/x-rawcc':
elif content_type in ('video', 'audio'): codecs = {'scodec': codec_str}
base_url = '' else:
for element in (representation, adaptation_set, period, mpd_doc): codecs = parse_codecs(codec_str)
base_url_e = element.find(_add_ns('BaseURL')) if content_type not in ('video', 'audio', 'text'):
if base_url_e is not None: if mime_type == 'image/jpeg':
base_url = base_url_e.text + base_url content_type = mime_type
if re.match(r'^https?://', base_url): elif codecs.get('vcodec', 'none') != 'none':
break content_type = 'video'
if mpd_base_url and not re.match(r'^https?://', base_url): elif codecs.get('acodec', 'none') != 'none':
if not mpd_base_url.endswith('/') and not base_url.startswith('/'): content_type = 'audio'
mpd_base_url += '/' elif codecs.get('scodec', 'none') != 'none':
base_url = mpd_base_url + base_url content_type = 'text'
representation_id = representation_attrib.get('id') elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
lang = representation_attrib.get('lang') content_type = 'text'
url_el = representation.find(_add_ns('BaseURL')) else:
filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None) self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
bandwidth = int_or_none(representation_attrib.get('bandwidth')) continue
representation_id = representation_attrib.get('id')
lang = representation_attrib.get('lang')
url_el = representation.find(_add_ns('BaseURL'))
filesize = int_or_none(url_el.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
bandwidth = int_or_none(representation_attrib.get('bandwidth'))
format_id = join_nonempty(representation_id or content_type, mpd_id)
if content_type in ('video', 'audio'):
f = { f = {
'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id, 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
'manifest_url': mpd_url, 'manifest_url': mpd_url,
@ -2338,104 +2413,130 @@ class InfoExtractor(object):
'filesize': filesize, 'filesize': filesize,
'container': mimetype2ext(mime_type) + '_dash', 'container': mimetype2ext(mime_type) + '_dash',
} }
f.update(parse_codecs(representation_attrib.get('codecs'))) f.update(codecs)
representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info) elif content_type == 'text':
f = {
def prepare_template(template_name, identifiers): 'ext': mimetype2ext(mime_type),
tmpl = representation_ms_info[template_name] 'manifest_url': mpd_url,
# First of, % characters outside $...$ templates 'filesize': filesize,
# must be escaped by doubling for proper processing }
# by % operator string formatting used further (see elif content_type == 'image/jpeg':
# https://github.com/ytdl-org/youtube-dl/issues/16867). # See test case in VikiIE
t = '' # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
in_template = False f = {
for c in tmpl: 'format_id': format_id,
'ext': 'mhtml',
'manifest_url': mpd_url,
'format_note': 'DASH storyboards (jpeg)',
'acodec': 'none',
'vcodec': 'none',
}
if is_drm_protected(adaptation_set) or is_drm_protected(representation):
f['has_drm'] = True
representation_ms_info = extract_multisegment_info(representation, adaptation_set_ms_info)
def prepare_template(template_name, identifiers):
tmpl = representation_ms_info[template_name]
# First of, % characters outside $...$ templates
# must be escaped by doubling for proper processing
# by % operator string formatting used further (see
# https://github.com/ytdl-org/youtube-dl/issues/16867).
t = ''
in_template = False
for c in tmpl:
t += c
if c == '$':
in_template = not in_template
elif c == '%' and not in_template:
t += c t += c
if c == '$': # Next, $...$ templates are translated to their
in_template = not in_template # %(...) counterparts to be used with % operator
elif c == '%' and not in_template: t = t.replace('$RepresentationID$', representation_id)
t += c t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
# Next, $...$ templates are translated to their t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
# %(...) counterparts to be used with % operator t.replace('$$', '$')
t = t.replace('$RepresentationID$', representation_id) return t
t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t) # @initialization is a regular template like @media one
t.replace('$$', '$') # so it should be handled just the same way (see
return t # https://github.com/ytdl-org/youtube-dl/issues/11605)
if 'initialization' in representation_ms_info:
# @initialization is a regular template like @media one initialization_template = prepare_template(
# so it should be handled just the same way (see 'initialization',
# https://github.com/ytdl-org/youtube-dl/issues/11605) # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
if 'initialization' in representation_ms_info: # $Time$ shall not be included for @initialization thus
initialization_template = prepare_template( # only $Bandwidth$ remains
'initialization', ('Bandwidth', ))
# As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and representation_ms_info['initialization_url'] = initialization_template % {
# $Time$ shall not be included for @initialization thus 'Bandwidth': bandwidth,
# only $Bandwidth$ remains }
('Bandwidth', ))
representation_ms_info['initialization_url'] = initialization_template % {
'Bandwidth': bandwidth,
}
def location_key(location):
return 'url' if re.match(r'^https?://', location) else 'path'
if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
media_location_key = location_key(media_template)
# As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
# can't be used at the same time
if '%(Number' in media_template and 's' not in representation_ms_info:
segment_duration = None
if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
representation_ms_info['fragments'] = [{
media_location_key: media_template % {
'Number': segment_number,
'Bandwidth': bandwidth,
},
'duration': segment_duration,
} for segment_number in range(
representation_ms_info['start_number'],
representation_ms_info['total_number'] + representation_ms_info['start_number'])]
else:
# $Number*$ or $Time$ in media template with S list available
# Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
# Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
representation_ms_info['fragments'] = []
segment_time = 0
segment_d = None
segment_number = representation_ms_info['start_number']
def add_segment_url():
segment_url = media_template % {
'Time': segment_time,
'Bandwidth': bandwidth,
'Number': segment_number,
}
representation_ms_info['fragments'].append({
media_location_key: segment_url,
'duration': float_or_none(segment_d, representation_ms_info['timescale']),
})
for num, s in enumerate(representation_ms_info['s']): def location_key(location):
segment_time = s.get('t') or segment_time return 'url' if re.match(r'^https?://', location) else 'path'
segment_d = s['d']
def calc_segment_duration():
return float_or_none(
representation_ms_info['segment_duration'],
representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
media_location_key = location_key(media_template)
# As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
# can't be used at the same time
if '%(Number' in media_template and 's' not in representation_ms_info:
segment_duration = None
if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
representation_ms_info['total_number'] = int(math.ceil(
float_or_none(period_duration, segment_duration, default=0)))
representation_ms_info['fragments'] = [{
media_location_key: media_template % {
'Number': segment_number,
'Bandwidth': bandwidth,
},
'duration': segment_duration,
} for segment_number in range(
representation_ms_info['start_number'],
representation_ms_info['total_number'] + representation_ms_info['start_number'])]
else:
# $Number*$ or $Time$ in media template with S list available
# Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
# Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
representation_ms_info['fragments'] = []
segment_time = 0
segment_d = None
segment_number = representation_ms_info['start_number']
def add_segment_url():
segment_url = media_template % {
'Time': segment_time,
'Bandwidth': bandwidth,
'Number': segment_number,
}
representation_ms_info['fragments'].append({
media_location_key: segment_url,
'duration': float_or_none(segment_d, representation_ms_info['timescale']),
})
for num, s in enumerate(representation_ms_info['s']):
segment_time = s.get('t') or segment_time
segment_d = s['d']
add_segment_url()
segment_number += 1
for r in range(s.get('r', 0)):
segment_time += segment_d
add_segment_url() add_segment_url()
segment_number += 1 segment_number += 1
for r in range(s.get('r', 0)): segment_time += segment_d
segment_time += segment_d elif 'segment_urls' in representation_ms_info:
add_segment_url() fragments = []
segment_number += 1 if 's' in representation_ms_info:
segment_time += segment_d
elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
# No media template # No media template
# Example: https://www.youtube.com/watch?v=iXZV5uAYMJI # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
# or any YouTube dashsegments video # or any YouTube dashsegments video
fragments = []
segment_index = 0 segment_index = 0
timescale = representation_ms_info['timescale'] timescale = representation_ms_info['timescale']
for s in representation_ms_info['s']: for s in representation_ms_info['s']:
@ -2447,48 +2548,78 @@ class InfoExtractor(object):
'duration': duration, 'duration': duration,
}) })
segment_index += 1 segment_index += 1
representation_ms_info['fragments'] = fragments elif 'segment_urls_range' in representation_ms_info:
elif 'segment_urls' in representation_ms_info: # Segment URLs with mediaRange
# Example: https://kinescope.io/200615537/master.mpd
# https://github.com/ytdl-org/youtube-dl/issues/30235
# or any mpd generated with Bento4 `mp4dash --no-split --use-segment-list`
segment_duration = calc_segment_duration()
for segment_url, segment_url_range in zip(
representation_ms_info['segment_urls'], representation_ms_info['segment_urls_range']):
fragments.append({
location_key(segment_url): segment_url,
'range': segment_url_range,
'duration': segment_duration,
})
else:
# Segment URLs with no SegmentTimeline # Segment URLs with no SegmentTimeline
# Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091 # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
# https://github.com/ytdl-org/youtube-dl/pull/14844 # https://github.com/ytdl-org/youtube-dl/pull/14844
fragments = [] segment_duration = calc_segment_duration()
segment_duration = float_or_none(
representation_ms_info['segment_duration'],
representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
for segment_url in representation_ms_info['segment_urls']: for segment_url in representation_ms_info['segment_urls']:
fragment = { fragments.append({
location_key(segment_url): segment_url, location_key(segment_url): segment_url,
} 'duration': segment_duration,
if segment_duration: })
fragment['duration'] = segment_duration representation_ms_info['fragments'] = fragments
fragments.append(fragment)
representation_ms_info['fragments'] = fragments # If there is a fragments key available then we correctly recognized fragmented media.
# If there is a fragments key available then we correctly recognized fragmented media. # Otherwise we will assume unfragmented media with direct access. Technically, such
# Otherwise we will assume unfragmented media with direct access. Technically, such # assumption is not necessarily correct since we may simply have no support for
# assumption is not necessarily correct since we may simply have no support for # some forms of fragmented media renditions yet, but for now we'll use this fallback.
# some forms of fragmented media renditions yet, but for now we'll use this fallback. if 'fragments' in representation_ms_info:
if 'fragments' in representation_ms_info: base_url = representation_ms_info['base_url']
f.update({ f.update({
# NB: mpd_url may be empty when MPD manifest is parsed from a string # NB: mpd_url may be empty when MPD manifest is parsed from a string
'url': mpd_url or base_url, 'url': mpd_url or base_url,
'fragment_base_url': base_url, 'fragment_base_url': base_url,
'fragments': [], 'fragments': [],
'protocol': 'http_dash_segments', 'protocol': 'http_dash_segments',
})
if 'initialization_url' in representation_ms_info and 'initialization_url_range' in representation_ms_info:
# Initialization URL with range (accompanied by Segment URLs with mediaRange above)
# https://github.com/ytdl-org/youtube-dl/issues/30235
initialization_url = representation_ms_info['initialization_url']
f['fragments'].append({
location_key(initialization_url): initialization_url,
'range': representation_ms_info['initialization_url_range'],
}) })
if 'initialization_url' in representation_ms_info: elif 'initialization_url' in representation_ms_info:
initialization_url = representation_ms_info['initialization_url'] initialization_url = representation_ms_info['initialization_url']
if not f.get('url'): if not f.get('url'):
f['url'] = initialization_url f['url'] = initialization_url
f['fragments'].append({location_key(initialization_url): initialization_url}) f['fragments'].append({location_key(initialization_url): initialization_url})
f['fragments'].extend(representation_ms_info['fragments']) elif 'initialization_url_range' in representation_ms_info:
else: # no Initialization URL but range (accompanied by no Segment URLs but mediaRange above)
# Assuming direct URL to unfragmented media. # https://github.com/ytdl-org/youtube-dl/issues/27575
f['url'] = base_url f['fragments'].append({
formats.append(f) location_key(base_url): base_url,
'range': representation_ms_info['initialization_url_range'],
})
f['fragments'].extend(representation_ms_info['fragments'])
if not period_duration:
period_duration = sum(traverse_obj(representation_ms_info, (
'fragments', Ellipsis, 'duration', T(float_or_none))))
else: else:
self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) # Assuming direct URL to unfragmented media.
return formats f['url'] = representation_ms_info['base_url']
if content_type in ('video', 'audio', 'image/jpeg'):
f['manifest_stream_number'] = stream_numbers[f['url']]
stream_numbers[f['url']] += 1
formats.append(f)
elif content_type == 'text':
subtitles.setdefault(lang or 'und', []).append(f)
return formats, subtitles
def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}): def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
res = self._download_xml_handle( res = self._download_xml_handle(

@ -2,9 +2,20 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import re import re
import time
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ExtractorError from ..compat import compat_kwargs
from ..utils import (
determine_ext,
ExtractorError,
float_or_none,
merge_dicts,
T,
traverse_obj,
txt_or_none,
url_or_none,
)
class Vbox7IE(InfoExtractor): class Vbox7IE(InfoExtractor):
@ -20,10 +31,12 @@ class Vbox7IE(InfoExtractor):
) )
(?P<id>[\da-fA-F]+) (?P<id>[\da-fA-F]+)
''' '''
_EMBED_REGEX = [r'<iframe[^>]+src=(?P<q>["\'])(?P<url>(?:https?:)?//vbox7\.com/emb/external\.php.+?)(?P=q)']
_GEO_COUNTRIES = ['BG'] _GEO_COUNTRIES = ['BG']
_GEO_BYPASS = False
_TESTS = [{ _TESTS = [{
'url': 'http://vbox7.com/play:0946fff23c', 'url': 'https://vbox7.com/play:0946fff23c',
'md5': 'a60f9ab3a3a2f013ef9a967d5f7be5bf', 'md5': '50ca1f78345a9c15391af47d8062d074',
'info_dict': { 'info_dict': {
'id': '0946fff23c', 'id': '0946fff23c',
'ext': 'mp4', 'ext': 'mp4',
@ -34,18 +47,21 @@ class Vbox7IE(InfoExtractor):
'upload_date': '20160812', 'upload_date': '20160812',
'uploader': 'zdraveibulgaria', 'uploader': 'zdraveibulgaria',
}, },
'params': { 'expected_warnings': [
'proxy': '127.0.0.1:8118', 'Unable to download webpage',
}, ],
}, { }, {
'url': 'http://vbox7.com/play:249bb972c2', 'url': 'http://vbox7.com/play:249bb972c2',
'md5': '99f65c0c9ef9b682b97313e052734c3f', 'md5': 'aaf19465e37ec0b30b918df83ec32c50',
'info_dict': { 'info_dict': {
'id': '249bb972c2', 'id': '249bb972c2',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Смях! Чудо - чист за секунди - Скрита камера', 'title': 'Смях! Чудо - чист за секунди - Скрита камера',
'description': 'Смях! Чудо - чист за секунди - Скрита камера',
'timestamp': 1360215023,
'upload_date': '20130207',
'uploader': 'svideteliat_ot_varshava',
}, },
'skip': 'georestricted',
}, { }, {
'url': 'http://vbox7.com/emb/external.php?vid=a240d20f9c&autoplay=1', 'url': 'http://vbox7.com/emb/external.php?vid=a240d20f9c&autoplay=1',
'only_matching': True, 'only_matching': True,
@ -54,52 +70,109 @@ class Vbox7IE(InfoExtractor):
'only_matching': True, 'only_matching': True,
}] }]
@classmethod
def _extract_url(cls, webpage):
    """Return the URL of the first vbox7 embed iframe found in `webpage`.

    The page is searched with the first pattern in `_EMBED_REGEX`; the
    value of its `url` group is returned, or None when nothing matches.
    """
    mobj = re.search(cls._EMBED_REGEX[0], webpage)
    if mobj:
        return mobj.group('url')
    return None
# transform_source=None, fatal=True
def _parse_json(self, json_string, video_id, *args, **kwargs):
    """Parse JSON, first repairing raw line breaks inside ld+json strings.

    When `json_string` looks like ld+json (`"@context"` occurs within its
    first 30 characters) and the caller supplied no `transform_source`
    (first extra positional argument, or keyword), a transform is
    installed that escapes raw line breaks inside string literals before
    the text reaches the parent class's `_parse_json`. All other
    arguments are passed through untouched.
    """
    if '"@context"' in json_string[:30]:
        # this is ld+json, or that's the way to bet
        transform_source = args[0] if len(args) > 0 else kwargs.get('transform_source')
        if not transform_source:

            def fix_chars(src):
                # fix malformed ld+json: replace raw CRLFs with escaped LFs.
                # The pattern matches complete string literals only: `*`
                # lets it consume an empty "" and `\\.` steps over escaped
                # characters such as \", so a match can never begin at a
                # closing quote and rewrite the line breaks *between* two
                # strings (which would corrupt otherwise valid JSON).
                return re.sub(
                    r'(?s)"(?:[^"\\]|\\.)*"',
                    lambda m: re.sub(r'\r?\n', r'\\n', m.group(0)), src)

            if len(args) > 0:
                # transform_source was passed positionally (as a false value)
                args = (fix_chars,) + args[1:]
            else:
                kwargs['transform_source'] = fix_chars
                kwargs = compat_kwargs(kwargs)

    return super(Vbox7IE, self)._parse_json(
        json_string, video_id, *args, **kwargs)
def _real_extract(self, url):
    """Extract formats and metadata for a single vbox7 video.

    The player options are fetched from the site's JSON endpoint, then the
    watch page is downloaded (non-fatally) and searched for ld+json and
    OpenGraph metadata.

    Returns the info dict. Raises ExtractorError when the endpoint reports
    an error or supplies no usable source URL, and the geo-restriction
    error when the source URL contains `/na.mp4`.
    """
    video_id = self._match_id(url)
    url = 'https://vbox7.com/play:%s' % (video_id,)

    now = time.time()
    response = self._download_json(
        'https://www.vbox7.com/aj/player/item/options?vid=%s' % (video_id,),
        video_id, headers={'Referer': url})
    # estimate time to which possible `ago` member is relative
    now = now + 0.5 * (time.time() - now)

    if 'error' in response:
        raise ExtractorError(
            '%s said: %s' % (self.IE_NAME, response['error']), expected=True)

    video_url = traverse_obj(response, ('options', 'src', T(url_or_none)))

    # `in` binds tighter than `or`, so the fallback must be parenthesised;
    # otherwise a missing URL (None) raises TypeError right here
    if '/na.mp4' in (video_url or ''):
        self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
    if not video_url:
        raise ExtractorError('Unable to extract video URL')

    ext = determine_ext(video_url)
    if ext == 'mpd':
        # In case MPD cannot be parsed, or anyway, get mp4 combined
        # formats usually provided to Safari, iOS, and old Windows
        try:
            formats, subtitles = self._extract_mpd_formats_and_subtitles(
                video_url, video_id, 'dash', fatal=False)
        except KeyError:
            self.report_warning('Failed to parse MPD manifest')
            formats, subtitles = [], {}

        video = response['options']
        resolutions = (1080, 720, 480, 240, 144)
        highest_res = traverse_obj(video, ('highestRes', T(int))) or resolutions[0]
        for res in traverse_obj(video, ('resolutions', lambda _, r: int(r) > 0)) or resolutions:
            if res > highest_res:
                continue
            formats.append({
                'url': video_url.replace('.mpd', '_%d.mp4' % res),
                'format_id': '%dp' % res,
                'height': res,
            })
        # if above formats are flaky, enable the line below
        # self._check_formats(formats, video_id)
    else:
        formats = [{
            'url': video_url,
        }]
        subtitles = {}
    self._sort_formats(formats)

    webpage = self._download_webpage(url, video_id, fatal=False) or ''

    info = self._search_json_ld(
        webpage.replace('"/*@context"', '"@context"'), video_id,
        fatal=False) if webpage else {}

    if not info.get('title'):
        info['title'] = traverse_obj(response, (
            'options', 'title', T(txt_or_none))) or self._og_search_title(webpage)

    def if_missing(k):
        # pass the value through only when `info` has no entry for `k` yet
        return lambda x: None if k in info else x

    info = merge_dicts(info, {
        'id': video_id,
        'formats': formats,
        'subtitles': subtitles or None,
    }, info, traverse_obj(response, ('options', {
        'uploader': ('uploader', T(txt_or_none)),
        # `ago` is subtracted from the request-time estimate and the
        # result rounded to a whole minute
        'timestamp': ('ago', T(if_missing('timestamp')), T(lambda t: int(round((now - t) / 60.0)) * 60)),
        'duration': ('duration', T(if_missing('duration')), T(float_or_none)),
    })))
    if 'thumbnail' not in info:
        # plain assignment: it must not end in a comma, which would store
        # a 1-tuple instead of the URL
        info['thumbnail'] = self._proto_relative_url(
            self._og_search_thumbnail(webpage), 'https:')

    return info

Loading…
Cancel
Save