Compare commits

...

6 Commits

Author SHA1 Message Date
dirkf 4416f82c80 [Vbox7IE] Sanitise ld+json containing unexpected characters 3 months ago
dirkf bdda6b81df [Vbox7IE] Improve extraction 3 months ago
dirkf 1fd8f802b8 [InfoExtractor] Correctly resolve BaseURL in DASH manifest 3 months ago
dirkf 4eaeb9b2c6 [InfoExtractor] Support byte range for DASH 3 months ago
dirkf bec9180e89 [downloader/dash] Support `range` in fragment (format f'{start}-{end}') 3 months ago
dirkf c58b655a9e [InfoExtractor] Support DASH subtitle extraction (yt-dlp back-port) 3 months ago
  1. 179
      test/test_InfoExtractor.py
  2. 35
      test/testdata/mpd/range_only.mpd
  3. 351
      test/testdata/mpd/subtitles.mpd
  4. 32
      test/testdata/mpd/url_and_range.mpd
  5. 9
      youtube_dl/downloader/dash.py
  6. 457
      youtube_dl/extractor/common.py
  7. 151
      youtube_dl/extractor/vbox7.py

@ -993,7 +993,8 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
'tbr': 5997.485,
'width': 1920,
'height': 1080,
}]
}],
{},
), (
# https://github.com/ytdl-org/youtube-dl/pull/14844
'urls_only',
@ -1076,7 +1077,8 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
'tbr': 4400,
'width': 1920,
'height': 1080,
}]
}],
{},
), (
# https://github.com/ytdl-org/youtube-dl/issues/20346
# Media considered unfragmented even though it contains
@ -1122,18 +1124,185 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
'width': 360,
'height': 360,
'fps': 30,
}]
}],
{},
), (
# https://github.com/ytdl-org/youtube-dl/issues/30235
# Bento4 generated test mpd
# mp4dash --mpd-name=manifest.mpd --no-split --use-segment-list mediafiles
'url_and_range',
'http://unknown/manifest.mpd', # mpd_url
'http://unknown/', # mpd_base_url
[{
'manifest_url': 'http://unknown/manifest.mpd',
'fragment_base_url': 'http://unknown/',
'ext': 'm4a',
'format_id': 'audio-und-mp4a.40.2',
'format_note': 'DASH audio',
'container': 'm4a_dash',
'protocol': 'http_dash_segments',
'acodec': 'mp4a.40.2',
'vcodec': 'none',
'tbr': 98.808,
}, {
'manifest_url': 'http://unknown/manifest.mpd',
'fragment_base_url': 'http://unknown/',
'ext': 'mp4',
'format_id': 'video-avc1',
'format_note': 'DASH video',
'container': 'mp4_dash',
'protocol': 'http_dash_segments',
'acodec': 'none',
'vcodec': 'avc1.4D401E',
'tbr': 699.597,
'width': 768,
'height': 432
}],
{},
), (
# https://github.com/ytdl-org/youtube-dl/issues/27575
# GPAC generated test mpd
# MP4Box -dash 10000 -single-file -out manifest.mpd mediafiles
'range_only',
'http://unknown/manifest.mpd', # mpd_url
'http://unknown/', # mpd_base_url
[{
'manifest_url': 'http://unknown/manifest.mpd',
'fragment_base_url': 'http://unknown/audio_dashinit.mp4',
'ext': 'm4a',
'format_id': '2',
'format_note': 'DASH audio',
'container': 'm4a_dash',
'protocol': 'http_dash_segments',
'acodec': 'mp4a.40.2',
'vcodec': 'none',
'tbr': 98.096,
}, {
'manifest_url': 'http://unknown/manifest.mpd',
'fragment_base_url': 'http://unknown/video_dashinit.mp4',
'ext': 'mp4',
'format_id': '1',
'format_note': 'DASH video',
'container': 'mp4_dash',
'protocol': 'http_dash_segments',
'acodec': 'none',
'vcodec': 'avc1.4D401E',
'tbr': 526.987,
'width': 768,
'height': 432
}],
{},
), (
'subtitles',
'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/',
[{
'format_id': 'audio=128001',
'manifest_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'ext': 'm4a',
'tbr': 128.001,
'asr': 48000,
'format_note': 'DASH audio',
'container': 'm4a_dash',
'vcodec': 'none',
'acodec': 'mp4a.40.2',
'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'fragment_base_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/',
'protocol': 'http_dash_segments',
}, {
'format_id': 'video=100000',
'manifest_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'ext': 'mp4',
'width': 336,
'height': 144,
'tbr': 100,
'format_note': 'DASH video',
'container': 'mp4_dash',
'vcodec': 'avc1.4D401F',
'acodec': 'none',
'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'fragment_base_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/',
'protocol': 'http_dash_segments',
}, {
'format_id': 'video=326000',
'manifest_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'ext': 'mp4',
'width': 562,
'height': 240,
'tbr': 326,
'format_note': 'DASH video',
'container': 'mp4_dash',
'vcodec': 'avc1.4D401F',
'acodec': 'none',
'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'fragment_base_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/',
'protocol': 'http_dash_segments',
}, {
'format_id': 'video=698000',
'manifest_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'ext': 'mp4',
'width': 844,
'height': 360,
'tbr': 698,
'format_note': 'DASH video',
'container': 'mp4_dash',
'vcodec': 'avc1.4D401F',
'acodec': 'none',
'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'fragment_base_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/',
'protocol': 'http_dash_segments',
}, {
'format_id': 'video=1493000',
'manifest_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'ext': 'mp4',
'width': 1126,
'height': 480,
'tbr': 1493,
'format_note': 'DASH video',
'container': 'mp4_dash',
'vcodec': 'avc1.4D401F',
'acodec': 'none',
'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'fragment_base_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/',
'protocol': 'http_dash_segments',
}, {
'format_id': 'video=4482000',
'manifest_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'ext': 'mp4',
'width': 1688,
'height': 720,
'tbr': 4482,
'format_note': 'DASH video',
'container': 'mp4_dash',
'vcodec': 'avc1.4D401F',
'acodec': 'none',
'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'fragment_base_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/',
'protocol': 'http_dash_segments',
}],
{
'en': [
{
'ext': 'mp4',
'manifest_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'fragment_base_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/',
'protocol': 'http_dash_segments',
}
]
},
)
]
for mpd_file, mpd_url, mpd_base_url, expected_formats in _TEST_CASES:
for mpd_file, mpd_url, mpd_base_url, expected_formats, expected_subtitles in _TEST_CASES:
with open('./test/testdata/mpd/%s.mpd' % mpd_file,
mode='r', encoding='utf-8') as f:
formats = self.ie._parse_mpd_formats(
formats, subtitles = self.ie._parse_mpd_formats_and_subtitles(
compat_etree_fromstring(f.read().encode('utf-8')),
mpd_base_url=mpd_base_url, mpd_url=mpd_url)
self.ie._sort_formats(formats)
expect_value(self, formats, expected_formats, None)
expect_value(self, subtitles, expected_subtitles, None)
def test_parse_f4m_formats(self):
_TEST_CASES = [

@ -0,0 +1,35 @@
<?xml version="1.0"?>
<!-- MPD file Generated with GPAC version 1.0.1-revrelease at 2021-11-27T20:53:11.690Z -->
<MPD xmlns="urn:mpeg:dash:schema:mpd:2011" minBufferTime="PT1.500S" type="static" mediaPresentationDuration="PT0H0M30.196S" maxSegmentDuration="PT0H0M10.027S" profiles="urn:mpeg:dash:profile:full:2011">
<ProgramInformation moreInformationURL="http://gpac.io">
<Title>manifest.mpd generated by GPAC</Title>
</ProgramInformation>
<Period duration="PT0H0M30.196S">
<AdaptationSet segmentAlignment="true" maxWidth="768" maxHeight="432" maxFrameRate="30000/1001" par="16:9" lang="und" startWithSAP="1">
<Representation id="1" mimeType="video/mp4" codecs="avc1.4D401E" width="768" height="432" frameRate="30000/1001" sar="1:1" bandwidth="526987">
<BaseURL>video_dashinit.mp4</BaseURL>
<SegmentList timescale="90000" duration="900000">
<Initialization range="0-881"/>
<SegmentURL mediaRange="882-876094" indexRange="882-925"/>
<SegmentURL mediaRange="876095-1466732" indexRange="876095-876138"/>
<SegmentURL mediaRange="1466733-1953615" indexRange="1466733-1466776"/>
<SegmentURL mediaRange="1953616-1994211" indexRange="1953616-1953659"/>
</SegmentList>
</Representation>
</AdaptationSet>
<AdaptationSet segmentAlignment="true" lang="und" startWithSAP="1">
<Representation id="2" mimeType="audio/mp4" codecs="mp4a.40.2" audioSamplingRate="48000" bandwidth="98096">
<AudioChannelConfiguration schemeIdUri="urn:mpeg:dash:23003:3:audio_channel_configuration:2011" value="2"/>
<BaseURL>audio_dashinit.mp4</BaseURL>
<SegmentList timescale="48000" duration="480000">
<Initialization range="0-752"/>
<SegmentURL mediaRange="753-124129" indexRange="753-796"/>
<SegmentURL mediaRange="124130-250544" indexRange="124130-124173"/>
<SegmentURL mediaRange="250545-374929" indexRange="250545-250588"/>
</SegmentList>
</Representation>
</AdaptationSet>
</Period>
</MPD>

@ -0,0 +1,351 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- Created with Unified Streaming Platform (version=1.10.18-20255) -->
<MPD
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns="urn:mpeg:dash:schema:mpd:2011"
xsi:schemaLocation="urn:mpeg:dash:schema:mpd:2011 http://standards.iso.org/ittf/PubliclyAvailableStandards/MPEG-DASH_schema_files/DASH-MPD.xsd"
type="static"
mediaPresentationDuration="PT14M48S"
maxSegmentDuration="PT1M"
minBufferTime="PT10S"
profiles="urn:mpeg:dash:profile:isoff-live:2011">
<Period
id="1"
duration="PT14M48S">
<BaseURL>dash/</BaseURL>
<AdaptationSet
id="1"
group="1"
contentType="audio"
segmentAlignment="true"
audioSamplingRate="48000"
mimeType="audio/mp4"
codecs="mp4a.40.2"
startWithSAP="1">
<AudioChannelConfiguration
schemeIdUri="urn:mpeg:dash:23003:3:audio_channel_configuration:2011"
value="2" />
<Role schemeIdUri="urn:mpeg:dash:role:2011" value="main" />
<SegmentTemplate
timescale="48000"
initialization="3144-kZT4LWMQw6Rh7Kpd-$RepresentationID$.dash"
media="3144-kZT4LWMQw6Rh7Kpd-$RepresentationID$-$Time$.dash">
<SegmentTimeline>
<S t="0" d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="3584" />
</SegmentTimeline>
</SegmentTemplate>
<Representation
id="audio=128001"
bandwidth="128001">
</Representation>
</AdaptationSet>
<AdaptationSet
id="2"
group="3"
contentType="text"
lang="en"
mimeType="application/mp4"
codecs="stpp"
startWithSAP="1">
<Role schemeIdUri="urn:mpeg:dash:role:2011" value="subtitle" />
<SegmentTemplate
timescale="1000"
initialization="3144-kZT4LWMQw6Rh7Kpd-$RepresentationID$.dash"
media="3144-kZT4LWMQw6Rh7Kpd-$RepresentationID$-$Time$.dash">
<SegmentTimeline>
<S t="0" d="60000" r="9" />
<S d="24000" />
</SegmentTimeline>
</SegmentTemplate>
<Representation
id="textstream_eng=1000"
bandwidth="1000">
</Representation>
</AdaptationSet>
<AdaptationSet
id="3"
group="2"
contentType="video"
par="960:409"
minBandwidth="100000"
maxBandwidth="4482000"
maxWidth="1689"
maxHeight="720"
segmentAlignment="true"
mimeType="video/mp4"
codecs="avc1.4D401F"
startWithSAP="1">
<Role schemeIdUri="urn:mpeg:dash:role:2011" value="main" />
<SegmentTemplate
timescale="12288"
initialization="3144-kZT4LWMQw6Rh7Kpd-$RepresentationID$.dash"
media="3144-kZT4LWMQw6Rh7Kpd-$RepresentationID$-$Time$.dash">
<SegmentTimeline>
<S t="0" d="24576" r="443" />
</SegmentTimeline>
</SegmentTemplate>
<Representation
id="video=100000"
bandwidth="100000"
width="336"
height="144"
sar="2880:2863"
scanType="progressive">
</Representation>
<Representation
id="video=326000"
bandwidth="326000"
width="562"
height="240"
sar="115200:114929"
scanType="progressive">
</Representation>
<Representation
id="video=698000"
bandwidth="698000"
width="844"
height="360"
sar="86400:86299"
scanType="progressive">
</Representation>
<Representation
id="video=1493000"
bandwidth="1493000"
width="1126"
height="480"
sar="230400:230267"
scanType="progressive">
</Representation>
<Representation
id="video=4482000"
bandwidth="4482000"
width="1688"
height="720"
sar="86400:86299"
scanType="progressive">
</Representation>
</AdaptationSet>
</Period>
</MPD>

@ -0,0 +1,32 @@
<?xml version="1.0" ?>
<MPD xmlns="urn:mpeg:dash:schema:mpd:2011" profiles="urn:mpeg:dash:profile:isoff-live:2011" minBufferTime="PT10.01S" mediaPresentationDuration="PT30.097S" type="static">
<!-- Created with Bento4 mp4-dash.py, VERSION=2.0.0-639 -->
<Period>
<!-- Video -->
<AdaptationSet mimeType="video/mp4" segmentAlignment="true" startWithSAP="1" maxWidth="768" maxHeight="432">
<Representation id="video-avc1" codecs="avc1.4D401E" width="768" height="432" scanType="progressive" frameRate="30000/1001" bandwidth="699597">
<SegmentList timescale="1000" duration="10010">
<Initialization sourceURL="video-frag.mp4" range="36-746"/>
<SegmentURL media="video-frag.mp4" mediaRange="747-876117"/>
<SegmentURL media="video-frag.mp4" mediaRange="876118-1466913"/>
<SegmentURL media="video-frag.mp4" mediaRange="1466914-1953954"/>
<SegmentURL media="video-frag.mp4" mediaRange="1953955-1994652"/>
</SegmentList>
</Representation>
</AdaptationSet>
<!-- Audio -->
<AdaptationSet mimeType="audio/mp4" startWithSAP="1" segmentAlignment="true">
<Representation id="audio-und-mp4a.40.2" codecs="mp4a.40.2" bandwidth="98808" audioSamplingRate="48000">
<AudioChannelConfiguration schemeIdUri="urn:mpeg:mpegB:cicp:ChannelConfiguration" value="2"/>
<SegmentList timescale="1000" duration="10010">
<Initialization sourceURL="audio-frag.mp4" range="32-623"/>
<SegmentURL media="audio-frag.mp4" mediaRange="624-124199"/>
<SegmentURL media="audio-frag.mp4" mediaRange="124200-250303"/>
<SegmentURL media="audio-frag.mp4" mediaRange="250304-374365"/>
<SegmentURL media="audio-frag.mp4" mediaRange="374366-374836"/>
</SegmentList>
</Representation>
</AdaptationSet>
</Period>
</MPD>

@ -35,6 +35,7 @@ class DashSegmentsFD(FragmentFD):
for frag_index, fragment in enumerate(fragments, 1):
if frag_index <= ctx['fragment_index']:
continue
success = False
# In DASH, the first segment contains necessary headers to
# generate a valid MP4 file, so always abort for the first segment
fatal = frag_index == 1 or not skip_unavailable_fragments
@ -42,10 +43,14 @@ class DashSegmentsFD(FragmentFD):
if not fragment_url:
assert fragment_base_url
fragment_url = urljoin(fragment_base_url, fragment['path'])
success = False
headers = info_dict.get('http_headers')
fragment_range = fragment.get('range')
if fragment_range:
headers = headers.copy() if headers else {}
headers['Range'] = 'bytes=%s' % (fragment_range,)
for count in itertools.count():
try:
success, frag_content = self._download_fragment(ctx, fragment_url, info_dict)
success, frag_content = self._download_fragment(ctx, fragment_url, info_dict, headers)
if not success:
return False
self._append_fragment(ctx, frag_content)

@ -2,6 +2,7 @@
from __future__ import unicode_literals
import base64
import collections
import datetime
import functools
import hashlib
@ -58,6 +59,7 @@ from ..utils import (
GeoRestrictedError,
GeoUtils,
int_or_none,
join_nonempty,
js_to_json,
JSON_LD_RE,
mimetype2ext,
@ -74,6 +76,7 @@ from ..utils import (
str_or_none,
str_to_int,
strip_or_none,
T,
traverse_obj,
try_get,
unescapeHTML,
@ -180,6 +183,8 @@ class InfoExtractor(object):
fragment_base_url
* "duration" (optional, int or float)
* "filesize" (optional, int)
* "range" (optional, str of the form "start-end"
to use in HTTP Range header)
* preference Order number of this format. If this field is
present and not None, the formats get sorted
by this field, regardless of all other values.
@ -1751,6 +1756,12 @@ class InfoExtractor(object):
'format_note': 'Quality selection URL',
}
def _report_ignoring_subs(self, name):
self.report_warning(bug_reports_message(
'Ignoring subtitle tracks found in the {0} manifest; '
'if any subtitle tracks are missing,'.format(name)
), only_once=True)
def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
entry_protocol='m3u8', preference=None,
m3u8_id=None, note=None, errnote=None,
@ -2191,23 +2202,46 @@ class InfoExtractor(object):
})
return entries
def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
def _extract_mpd_formats(self, *args, **kwargs):
fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
if subs:
self._report_ignoring_subs('DASH')
return fmts
def _extract_mpd_formats_and_subtitles(
self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
fatal=True, data=None, headers=None, query=None):
# TODO: or not? param not yet implemented
if self.get_param('ignore_no_formats_error'):
fatal = False
res = self._download_xml_handle(
mpd_url, video_id,
note=note or 'Downloading MPD manifest',
errnote=errnote or 'Failed to download MPD manifest',
fatal=fatal, data=data, headers=headers, query=query)
note='Downloading MPD manifest' if note is None else note,
errnote='Failed to download MPD manifest' if errnote is None else errnote,
fatal=fatal, data=data, headers=headers or {}, query=query or {})
if res is False:
return []
return [], {}
mpd_doc, urlh = res
if mpd_doc is None:
return []
mpd_base_url = base_url(urlh.geturl())
return [], {}
# We could have been redirected to a new url when we retrieved our mpd file.
mpd_url = urlh.geturl()
mpd_base_url = base_url(mpd_url)
return self._parse_mpd_formats(
return self._parse_mpd_formats_and_subtitles(
mpd_doc, mpd_id, mpd_base_url, mpd_url)
def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
def _parse_mpd_formats(self, *args, **kwargs):
fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
if subs:
self._report_ignoring_subs('DASH')
return fmts
def _parse_mpd_formats_and_subtitles(
self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
"""
Parse formats from MPD manifest.
References:
@ -2215,8 +2249,10 @@ class InfoExtractor(object):
http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
"""
if mpd_doc.get('type') == 'dynamic':
return []
# TODO: param not yet implemented: default like previous yt-dl logic
if not self.get_param('dynamic_mpd', False):
if mpd_doc.get('type') == 'dynamic':
return [], {}
namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
@ -2226,8 +2262,24 @@ class InfoExtractor(object):
def is_drm_protected(element):
return element.find(_add_ns('ContentProtection')) is not None
from ..utils import YoutubeDLHandler
fix_path = YoutubeDLHandler._fix_path
def resolve_base_url(element, parent_base_url=None):
# TODO: use native XML traversal when ready
b_url = traverse_obj(element, (
T(lambda e: e.find(_add_ns('BaseURL')).text)))
if parent_base_url and b_url:
if not parent_base_url[-1] in ('/', ':'):
parent_base_url += '/'
b_url = compat_urlparse.urljoin(parent_base_url, b_url)
if b_url:
b_url = fix_path(b_url)
return b_url or parent_base_url
def extract_multisegment_info(element, ms_parent_info):
ms_info = ms_parent_info.copy()
base_url = ms_info['base_url'] = resolve_base_url(element, ms_info.get('base_url'))
# As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
# common attributes and elements. We will only extract relevant
@ -2261,15 +2313,27 @@ class InfoExtractor(object):
def extract_Initialization(source):
initialization = source.find(_add_ns('Initialization'))
if initialization is not None:
ms_info['initialization_url'] = initialization.attrib['sourceURL']
ms_info['initialization_url'] = initialization.get('sourceURL') or base_url
initialization_url_range = initialization.get('range')
if initialization_url_range:
ms_info['initialization_url_range'] = initialization_url_range
segment_list = element.find(_add_ns('SegmentList'))
if segment_list is not None:
extract_common(segment_list)
extract_Initialization(segment_list)
segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
if segment_urls_e:
ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
segment_urls = traverse_obj(segment_urls_e, (
Ellipsis, T(lambda e: e.attrib), 'media'))
if segment_urls:
ms_info['segment_urls'] = segment_urls
segment_urls_range = traverse_obj(segment_urls_e, (
Ellipsis, T(lambda e: e.attrib), 'mediaRange',
T(lambda r: re.findall(r'^\d+-\d+$', r)), 0))
if segment_urls_range:
ms_info['segment_urls_range'] = segment_urls_range
if not segment_urls:
ms_info['segment_urls'] = [base_url for _ in segment_urls_range]
else:
segment_template = element.find(_add_ns('SegmentTemplate'))
if segment_template is not None:
@ -2285,17 +2349,20 @@ class InfoExtractor(object):
return ms_info
mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
formats = []
formats, subtitles = [], {}
stream_numbers = collections.defaultdict(int)
mpd_base_url = resolve_base_url(mpd_doc, mpd_base_url or mpd_url)
for period in mpd_doc.findall(_add_ns('Period')):
period_duration = parse_duration(period.get('duration')) or mpd_duration
period_ms_info = extract_multisegment_info(period, {
'start_number': 1,
'timescale': 1,
'base_url': mpd_base_url,
})
for adaptation_set in period.findall(_add_ns('AdaptationSet')):
if is_drm_protected(adaptation_set):
continue
adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
adaptation_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
for representation in adaptation_set.findall(_add_ns('Representation')):
if is_drm_protected(representation):
continue
@ -2303,27 +2370,35 @@ class InfoExtractor(object):
representation_attrib.update(representation.attrib)
# According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
mime_type = representation_attrib['mimeType']
content_type = mime_type.split('/')[0]
if content_type == 'text':
# TODO implement WebVTT downloading
pass
elif content_type in ('video', 'audio'):
base_url = ''
for element in (representation, adaptation_set, period, mpd_doc):
base_url_e = element.find(_add_ns('BaseURL'))
if base_url_e is not None:
base_url = base_url_e.text + base_url
if re.match(r'^https?://', base_url):
break
if mpd_base_url and not re.match(r'^https?://', base_url):
if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
mpd_base_url += '/'
base_url = mpd_base_url + base_url
representation_id = representation_attrib.get('id')
lang = representation_attrib.get('lang')
url_el = representation.find(_add_ns('BaseURL'))
filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
bandwidth = int_or_none(representation_attrib.get('bandwidth'))
content_type = representation_attrib.get('contentType') or mime_type.split('/')[0]
codec_str = representation_attrib.get('codecs', '')
# Some kind of binary subtitle found in some youtube livestreams
if mime_type == 'application/x-rawcc':
codecs = {'scodec': codec_str}
else:
codecs = parse_codecs(codec_str)
if content_type not in ('video', 'audio', 'text'):
if mime_type == 'image/jpeg':
content_type = mime_type
elif codecs.get('vcodec', 'none') != 'none':
content_type = 'video'
elif codecs.get('acodec', 'none') != 'none':
content_type = 'audio'
elif codecs.get('scodec', 'none') != 'none':
content_type = 'text'
elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
content_type = 'text'
else:
self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
continue
representation_id = representation_attrib.get('id')
lang = representation_attrib.get('lang')
url_el = representation.find(_add_ns('BaseURL'))
filesize = int_or_none(url_el.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
bandwidth = int_or_none(representation_attrib.get('bandwidth'))
format_id = join_nonempty(representation_id or content_type, mpd_id)
if content_type in ('video', 'audio'):
f = {
'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
'manifest_url': mpd_url,
@ -2338,104 +2413,130 @@ class InfoExtractor(object):
'filesize': filesize,
'container': mimetype2ext(mime_type) + '_dash',
}
f.update(parse_codecs(representation_attrib.get('codecs')))
representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
def prepare_template(template_name, identifiers):
tmpl = representation_ms_info[template_name]
# First of, % characters outside $...$ templates
# must be escaped by doubling for proper processing
# by % operator string formatting used further (see
# https://github.com/ytdl-org/youtube-dl/issues/16867).
t = ''
in_template = False
for c in tmpl:
f.update(codecs)
elif content_type == 'text':
f = {
'ext': mimetype2ext(mime_type),
'manifest_url': mpd_url,
'filesize': filesize,
}
elif content_type == 'image/jpeg':
# See test case in VikiIE
# https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
f = {
'format_id': format_id,
'ext': 'mhtml',
'manifest_url': mpd_url,
'format_note': 'DASH storyboards (jpeg)',
'acodec': 'none',
'vcodec': 'none',
}
if is_drm_protected(adaptation_set) or is_drm_protected(representation):
f['has_drm'] = True
representation_ms_info = extract_multisegment_info(representation, adaptation_set_ms_info)
def prepare_template(template_name, identifiers):
tmpl = representation_ms_info[template_name]
# First of, % characters outside $...$ templates
# must be escaped by doubling for proper processing
# by % operator string formatting used further (see
# https://github.com/ytdl-org/youtube-dl/issues/16867).
t = ''
in_template = False
for c in tmpl:
t += c
if c == '$':
in_template = not in_template
elif c == '%' and not in_template:
t += c
if c == '$':
in_template = not in_template
elif c == '%' and not in_template:
t += c
# Next, $...$ templates are translated to their
# %(...) counterparts to be used with % operator
t = t.replace('$RepresentationID$', representation_id)
t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
t.replace('$$', '$')
return t
# @initialization is a regular template like @media one
# so it should be handled just the same way (see
# https://github.com/ytdl-org/youtube-dl/issues/11605)
if 'initialization' in representation_ms_info:
initialization_template = prepare_template(
'initialization',
# As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
# $Time$ shall not be included for @initialization thus
# only $Bandwidth$ remains
('Bandwidth', ))
representation_ms_info['initialization_url'] = initialization_template % {
'Bandwidth': bandwidth,
}
def location_key(location):
return 'url' if re.match(r'^https?://', location) else 'path'
if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
media_location_key = location_key(media_template)
# As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
# can't be used at the same time
if '%(Number' in media_template and 's' not in representation_ms_info:
segment_duration = None
if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
representation_ms_info['fragments'] = [{
media_location_key: media_template % {
'Number': segment_number,
'Bandwidth': bandwidth,
},
'duration': segment_duration,
} for segment_number in range(
representation_ms_info['start_number'],
representation_ms_info['total_number'] + representation_ms_info['start_number'])]
else:
# $Number*$ or $Time$ in media template with S list available
# Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
# Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
representation_ms_info['fragments'] = []
segment_time = 0
segment_d = None
segment_number = representation_ms_info['start_number']
def add_segment_url():
segment_url = media_template % {
'Time': segment_time,
'Bandwidth': bandwidth,
'Number': segment_number,
}
representation_ms_info['fragments'].append({
media_location_key: segment_url,
'duration': float_or_none(segment_d, representation_ms_info['timescale']),
})
# Next, $...$ templates are translated to their
# %(...) counterparts to be used with % operator
t = t.replace('$RepresentationID$', representation_id)
t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
t.replace('$$', '$')
return t
# @initialization is a regular template like @media one
# so it should be handled just the same way (see
# https://github.com/ytdl-org/youtube-dl/issues/11605)
if 'initialization' in representation_ms_info:
initialization_template = prepare_template(
'initialization',
# As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
# $Time$ shall not be included for @initialization thus
# only $Bandwidth$ remains
('Bandwidth', ))
representation_ms_info['initialization_url'] = initialization_template % {
'Bandwidth': bandwidth,
}
for num, s in enumerate(representation_ms_info['s']):
segment_time = s.get('t') or segment_time
segment_d = s['d']
def location_key(location):
return 'url' if re.match(r'^https?://', location) else 'path'
def calc_segment_duration():
return float_or_none(
representation_ms_info['segment_duration'],
representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
media_location_key = location_key(media_template)
# As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
# can't be used at the same time
if '%(Number' in media_template and 's' not in representation_ms_info:
segment_duration = None
if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
representation_ms_info['total_number'] = int(math.ceil(
float_or_none(period_duration, segment_duration, default=0)))
representation_ms_info['fragments'] = [{
media_location_key: media_template % {
'Number': segment_number,
'Bandwidth': bandwidth,
},
'duration': segment_duration,
} for segment_number in range(
representation_ms_info['start_number'],
representation_ms_info['total_number'] + representation_ms_info['start_number'])]
else:
# $Number*$ or $Time$ in media template with S list available
# Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
# Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
representation_ms_info['fragments'] = []
segment_time = 0
segment_d = None
segment_number = representation_ms_info['start_number']
def add_segment_url():
segment_url = media_template % {
'Time': segment_time,
'Bandwidth': bandwidth,
'Number': segment_number,
}
representation_ms_info['fragments'].append({
media_location_key: segment_url,
'duration': float_or_none(segment_d, representation_ms_info['timescale']),
})
for num, s in enumerate(representation_ms_info['s']):
segment_time = s.get('t') or segment_time
segment_d = s['d']
add_segment_url()
segment_number += 1
for r in range(s.get('r', 0)):
segment_time += segment_d
add_segment_url()
segment_number += 1
for r in range(s.get('r', 0)):
segment_time += segment_d
add_segment_url()
segment_number += 1
segment_time += segment_d
elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
segment_time += segment_d
elif 'segment_urls' in representation_ms_info:
fragments = []
if 's' in representation_ms_info:
# No media template
# Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
# or any YouTube dashsegments video
fragments = []
segment_index = 0
timescale = representation_ms_info['timescale']
for s in representation_ms_info['s']:
@ -2447,48 +2548,78 @@ class InfoExtractor(object):
'duration': duration,
})
segment_index += 1
representation_ms_info['fragments'] = fragments
elif 'segment_urls' in representation_ms_info:
elif 'segment_urls_range' in representation_ms_info:
# Segment URLs with mediaRange
# Example: https://kinescope.io/200615537/master.mpd
# https://github.com/ytdl-org/youtube-dl/issues/30235
# or any mpd generated with Bento4 `mp4dash --no-split --use-segment-list`
segment_duration = calc_segment_duration()
for segment_url, segment_url_range in zip(
representation_ms_info['segment_urls'], representation_ms_info['segment_urls_range']):
fragments.append({
location_key(segment_url): segment_url,
'range': segment_url_range,
'duration': segment_duration,
})
else:
# Segment URLs with no SegmentTimeline
# Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
# https://github.com/ytdl-org/youtube-dl/pull/14844
fragments = []
segment_duration = float_or_none(
representation_ms_info['segment_duration'],
representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
segment_duration = calc_segment_duration()
for segment_url in representation_ms_info['segment_urls']:
fragment = {
fragments.append({
location_key(segment_url): segment_url,
}
if segment_duration:
fragment['duration'] = segment_duration
fragments.append(fragment)
representation_ms_info['fragments'] = fragments
# If there is a fragments key available then we correctly recognized fragmented media.
# Otherwise we will assume unfragmented media with direct access. Technically, such
# assumption is not necessarily correct since we may simply have no support for
# some forms of fragmented media renditions yet, but for now we'll use this fallback.
if 'fragments' in representation_ms_info:
f.update({
# NB: mpd_url may be empty when MPD manifest is parsed from a string
'url': mpd_url or base_url,
'fragment_base_url': base_url,
'fragments': [],
'protocol': 'http_dash_segments',
'duration': segment_duration,
})
representation_ms_info['fragments'] = fragments
# If there is a fragments key available then we correctly recognized fragmented media.
# Otherwise we will assume unfragmented media with direct access. Technically, such
# assumption is not necessarily correct since we may simply have no support for
# some forms of fragmented media renditions yet, but for now we'll use this fallback.
if 'fragments' in representation_ms_info:
base_url = representation_ms_info['base_url']
f.update({
# NB: mpd_url may be empty when MPD manifest is parsed from a string
'url': mpd_url or base_url,
'fragment_base_url': base_url,
'fragments': [],
'protocol': 'http_dash_segments',
})
if 'initialization_url' in representation_ms_info and 'initialization_url_range' in representation_ms_info:
# Initialization URL with range (accompanied by Segment URLs with mediaRange above)
# https://github.com/ytdl-org/youtube-dl/issues/30235
initialization_url = representation_ms_info['initialization_url']
f['fragments'].append({
location_key(initialization_url): initialization_url,
'range': representation_ms_info['initialization_url_range'],
})
if 'initialization_url' in representation_ms_info:
initialization_url = representation_ms_info['initialization_url']
if not f.get('url'):
f['url'] = initialization_url
f['fragments'].append({location_key(initialization_url): initialization_url})
f['fragments'].extend(representation_ms_info['fragments'])
else:
# Assuming direct URL to unfragmented media.
f['url'] = base_url
formats.append(f)
elif 'initialization_url' in representation_ms_info:
initialization_url = representation_ms_info['initialization_url']
if not f.get('url'):
f['url'] = initialization_url
f['fragments'].append({location_key(initialization_url): initialization_url})
elif 'initialization_url_range' in representation_ms_info:
# no Initialization URL but range (accompanied by no Segment URLs but mediaRange above)
# https://github.com/ytdl-org/youtube-dl/issues/27575
f['fragments'].append({
location_key(base_url): base_url,
'range': representation_ms_info['initialization_url_range'],
})
f['fragments'].extend(representation_ms_info['fragments'])
if not period_duration:
period_duration = sum(traverse_obj(representation_ms_info, (
'fragments', Ellipsis, 'duration', T(float_or_none))))
else:
self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
return formats
# Assuming direct URL to unfragmented media.
f['url'] = representation_ms_info['base_url']
if content_type in ('video', 'audio', 'image/jpeg'):
f['manifest_stream_number'] = stream_numbers[f['url']]
stream_numbers[f['url']] += 1
formats.append(f)
elif content_type == 'text':
subtitles.setdefault(lang or 'und', []).append(f)
return formats, subtitles
def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
res = self._download_xml_handle(

@ -2,9 +2,20 @@
from __future__ import unicode_literals
import re
import time
from .common import InfoExtractor
from ..utils import ExtractorError
from ..compat import compat_kwargs
from ..utils import (
determine_ext,
ExtractorError,
float_or_none,
merge_dicts,
T,
traverse_obj,
txt_or_none,
url_or_none,
)
class Vbox7IE(InfoExtractor):
@ -20,10 +31,12 @@ class Vbox7IE(InfoExtractor):
)
(?P<id>[\da-fA-F]+)
'''
_EMBED_REGEX = [r'<iframe[^>]+src=(?P<q>["\'])(?P<url>(?:https?:)?//vbox7\.com/emb/external\.php.+?)(?P=q)']
_GEO_COUNTRIES = ['BG']
_GEO_BYPASS = False
_TESTS = [{
'url': 'http://vbox7.com/play:0946fff23c',
'md5': 'a60f9ab3a3a2f013ef9a967d5f7be5bf',
'url': 'https://vbox7.com/play:0946fff23c',
'md5': '50ca1f78345a9c15391af47d8062d074',
'info_dict': {
'id': '0946fff23c',
'ext': 'mp4',
@ -34,18 +47,21 @@ class Vbox7IE(InfoExtractor):
'upload_date': '20160812',
'uploader': 'zdraveibulgaria',
},
'params': {
'proxy': '127.0.0.1:8118',
},
'expected_warnings': [
'Unable to download webpage',
],
}, {
'url': 'http://vbox7.com/play:249bb972c2',
'md5': '99f65c0c9ef9b682b97313e052734c3f',
'md5': 'aaf19465e37ec0b30b918df83ec32c50',
'info_dict': {
'id': '249bb972c2',
'ext': 'mp4',
'title': 'Смях! Чудо - чист за секунди - Скрита камера',
'description': 'Смях! Чудо - чист за секунди - Скрита камера',
'timestamp': 1360215023,
'upload_date': '20130207',
'uploader': 'svideteliat_ot_varshava',
},
'skip': 'georestricted',
}, {
'url': 'http://vbox7.com/emb/external.php?vid=a240d20f9c&autoplay=1',
'only_matching': True,
@ -54,52 +70,109 @@ class Vbox7IE(InfoExtractor):
'only_matching': True,
}]
@staticmethod
def _extract_url(webpage):
mobj = re.search(
r'<iframe[^>]+src=(?P<q>["\'])(?P<url>(?:https?:)?//vbox7\.com/emb/external\.php.+?)(?P=q)',
webpage)
@classmethod
def _extract_url(cls, webpage):
mobj = re.search(cls._EMBED_REGEX[0], webpage)
if mobj:
return mobj.group('url')
# transform_source=None, fatal=True
def _parse_json(self, json_string, video_id, *args, **kwargs):
if '"@context"' in json_string[:30]:
# this is ld+json, or that's the way to bet
transform_source = args[0] if len(args) > 0 else kwargs.get('transform_source')
if not transform_source:
def fix_chars(src):
# fix malformed ld+json: replace raw CRLFs with escaped LFs
return re.sub(
r'"[^"]+"', lambda m: re.sub(r'\r?\n', r'\\n', m.group(0)), src)
if len(args) > 0:
args = (fix_chars,) + args[1:]
else:
kwargs['transform_source'] = fix_chars
kwargs = compat_kwargs(kwargs)
return super(Vbox7IE, self)._parse_json(
json_string, video_id, *args, **kwargs)
def _real_extract(self, url):
video_id = self._match_id(url)
url = 'https://vbox7.com/play:%s' % (video_id,)
now = time.time()
response = self._download_json(
'https://www.vbox7.com/ajax/video/nextvideo.php?vid=%s' % video_id,
video_id)
'https://www.vbox7.com/aj/player/item/options?vid=%s' % (video_id,),
video_id, headers={'Referer': url})
# estimate time to which possible `ago` member is relative
now = now + 0.5 * (time.time() - now)
if 'error' in response:
raise ExtractorError(
'%s said: %s' % (self.IE_NAME, response['error']), expected=True)
video = response['options']
title = video['title']
video_url = video['src']
video_url = traverse_obj(response, ('options', 'src', T(url_or_none)))
if '/na.mp4' in video_url:
if '/na.mp4' in video_url or '':
self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
uploader = video.get('uploader')
webpage = self._download_webpage(
'http://vbox7.com/play:%s' % video_id, video_id, fatal=None)
info = {}
if webpage:
info = self._search_json_ld(
webpage.replace('"/*@context"', '"@context"'), video_id,
fatal=False)
info.update({
ext = determine_ext(video_url)
if ext == 'mpd':
# In case MPD cannot be parsed, or anyway, get mp4 combined
# formats usually provided to Safari, iOS, and old Windows
try:
formats, subtitles = self._extract_mpd_formats_and_subtitles(
video_url, video_id, 'dash', fatal=False)
except KeyError:
self.report_warning('Failed to parse MPD manifest')
formats, subtitles = [], {}
video = response['options']
resolutions = (1080, 720, 480, 240, 144)
highest_res = traverse_obj(video, ('highestRes', T(int))) or resolutions[0]
for res in traverse_obj(video, ('resolutions', lambda _, r: int(r) > 0)) or resolutions:
if res > highest_res:
continue
formats.append({
'url': video_url.replace('.mpd', '_%d.mp4' % res),
'format_id': '%dp' % res,
'height': res,
})
# if above formats are flaky, enable the line below
# self._check_formats(formats, video_id)
else:
formats = [{
'url': video_url,
}]
subtitles = {}
self._sort_formats(formats)
webpage = self._download_webpage(url, video_id, fatal=False) or ''
info = self._search_json_ld(
webpage.replace('"/*@context"', '"@context"'), video_id,
fatal=False) if webpage else {}
if not info.get('title'):
info['title'] = traverse_obj(response, (
'options', 'title', T(txt_or_none))) or self._og_search_title(webpage)
def if_missing(k):
return lambda x: None if k in info else x
info = merge_dicts(info, {
'id': video_id,
'title': title,
'url': video_url,
'uploader': uploader,
'thumbnail': self._proto_relative_url(
'formats': formats,
'subtitles': subtitles or None,
}, info, traverse_obj(response, ('options', {
'uploader': ('uploader', T(txt_or_none)),
'timestamp': ('ago', T(if_missing('timestamp')), T(lambda t: int(round((now - t) / 60.0)) * 60)),
'duration': ('duration', T(if_missing('duration')), T(float_or_none)),
})))
if 'thumbnail' not in info:
info['thumbnail'] = self._proto_relative_url(
info.get('thumbnail') or self._og_search_thumbnail(webpage),
'http:'),
})
'https:'),
return info

Loading…
Cancel
Save