|
|
|
@ -3,6 +3,7 @@ from __future__ import unicode_literals |
|
|
|
|
|
|
|
|
|
import base64 |
|
|
|
|
import datetime |
|
|
|
|
import functools |
|
|
|
|
import hashlib |
|
|
|
|
import json |
|
|
|
|
import netrc |
|
|
|
@ -23,6 +24,7 @@ from ..compat import ( |
|
|
|
|
compat_getpass, |
|
|
|
|
compat_integer_types, |
|
|
|
|
compat_http_client, |
|
|
|
|
compat_map as map, |
|
|
|
|
compat_os_name, |
|
|
|
|
compat_str, |
|
|
|
|
compat_urllib_error, |
|
|
|
@ -31,6 +33,7 @@ from ..compat import ( |
|
|
|
|
compat_urllib_request, |
|
|
|
|
compat_urlparse, |
|
|
|
|
compat_xml_parse_error, |
|
|
|
|
compat_zip as zip, |
|
|
|
|
) |
|
|
|
|
from ..downloader.f4m import ( |
|
|
|
|
get_base_url, |
|
|
|
@ -70,6 +73,7 @@ from ..utils import ( |
|
|
|
|
str_or_none, |
|
|
|
|
str_to_int, |
|
|
|
|
strip_or_none, |
|
|
|
|
traverse_obj, |
|
|
|
|
try_get, |
|
|
|
|
unescapeHTML, |
|
|
|
|
unified_strdate, |
|
|
|
@ -1349,6 +1353,44 @@ class InfoExtractor(object): |
|
|
|
|
break |
|
|
|
|
return dict((k, v) for k, v in info.items() if v is not None) |
|
|
|
|
|
|
|
|
|
def _search_nextjs_data(self, webpage, video_id, **kw):
    """Extract and parse the Next.js `__NEXT_DATA__` JSON blob from a page.

    Looks for a <script> element whose id attribute is `__NEXT_DATA__`
    and parses its text content as JSON.

    Keyword arguments:
    transform_source -- passed to _parse_json() only
    fatal            -- passed to both _search_regex() and _parse_json()
    anything else    -- passed to _search_regex() (e.g. default)

    Returns the parsed data. An empty dict is returned when the script
    element is not found or when parsing does not yield any data, so
    that non-fatal callers always get the same "nothing found" value.
    """
    # Only these two options are understood by _parse_json()
    parse_kw = dict((k, v) for k, v in kw.items() if k in ('transform_source', 'fatal'))
    # _search_regex() must not be given transform_source
    search_kw = dict((k, v) for k, v in kw.items() if k != 'transform_source')

    next_data = self._search_regex(
        r'''<script[^>]+\bid\s*=\s*('|")__NEXT_DATA__\1[^>]*>(?P<nd>[^<]+)</script>''',
        webpage, 'next.js data', group='nd', **search_kw)
    if not next_data:
        return {}

    # A non-fatal _parse_json() may give None on malformed JSON; normalise
    # that to {} to match the not-found case above
    return self._parse_json(next_data, video_id, **parse_kw) or {}
|
|
|
|
|
|
|
|
|
def _search_nuxt_data(self, webpage, video_id, *args, **kwargs):
    """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function

    Emulated signature (keyword-only arguments are spelled out by hand):
    self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)

    context_name -- name of the JS variable/function holding the data;
                    may be given as the first extra positional argument
    fatal        -- when false, a failed search returns {} and the value
                    is also passed as `fatal` to _parse_json()
    traverse     -- path applied to the parsed object with traverse_obj()

    Returns the traversed data, or {} when nothing (or a falsy value) was
    obtained.
    """
    context_name = args[0] if len(args) > 0 else kwargs.get('context_name', '__NUXT__')
    fatal = kwargs.get('fatal', True)
    traverse = kwargs.get('traverse', ('data', 0))

    re_ctx = re.escape(context_name)

    # Matches `(function(<arg_keys>){return <js>}(<arg_vals>)`
    FUNCTION_RE = (r'\(\s*function\s*\((?P<arg_keys>[\s\S]*?)\)\s*\{\s*'
                   r'return\s+(?P<js>\{[\s\S]*?})\s*;?\s*}\s*\((?P<arg_vals>[\s\S]*?)\)')

    js, arg_keys, arg_vals = self._search_regex(
        (p.format(re_ctx, FUNCTION_RE) for p in
         (r'<script>\s*window\s*\.\s*{0}\s*=\s*{1}\s*\)\s*;?\s*</script>',
          r'{0}\s*\([\s\S]*?{1}')),
        webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
        default=NO_DEFAULT if fatal else (None, None, None))
    if js is None:
        return {}

    # The captured parameter list may contain whitespace around the commas
    # (e.g. `function(a, b)`); strip it so that each name can be matched
    # as an identifier when the values are substituted below
    arg_names = [name.strip() for name in arg_keys.split(',')]
    arg_values = self._parse_json(
        '[{0}]'.format(arg_vals), video_id, transform_source=js_to_json, fatal=fatal) or ()
    # Each value is re-serialised as JSON text so that js_to_json can
    # substitute it for the corresponding parameter name
    js_vars = dict(zip(arg_names, map(json.dumps, arg_values)))

    ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=js_vars), fatal=fatal)
    return traverse_obj(ret, traverse) or {}
|
|
|
|
|
|
|
|
|
@staticmethod |
|
|
|
|
def _hidden_inputs(html): |
|
|
|
|
html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html) |
|
|
|
@ -2496,7 +2538,8 @@ class InfoExtractor(object): |
|
|
|
|
return f |
|
|
|
|
return {} |
|
|
|
|
|
|
|
|
|
def _media_formats(src, cur_media_type, type_info={}): |
|
|
|
|
def _media_formats(src, cur_media_type, type_info=None): |
|
|
|
|
type_info = type_info or {} |
|
|
|
|
full_url = absolute_url(src) |
|
|
|
|
ext = type_info.get('ext') or determine_ext(full_url) |
|
|
|
|
if ext == 'm3u8': |
|
|
|
@ -2514,6 +2557,7 @@ class InfoExtractor(object): |
|
|
|
|
formats = [{ |
|
|
|
|
'url': full_url, |
|
|
|
|
'vcodec': 'none' if cur_media_type == 'audio' else None, |
|
|
|
|
'ext': ext, |
|
|
|
|
}] |
|
|
|
|
return is_plain_url, formats |
|
|
|
|
|
|
|
|
@ -2522,7 +2566,7 @@ class InfoExtractor(object): |
|
|
|
|
# so we will include them right here (see |
|
|
|
|
# https://www.ampproject.org/docs/reference/components/amp-video) |
|
|
|
|
# For dl8-* tags see https://delight-vr.com/documentation/dl8-video/ |
|
|
|
|
_MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)' |
|
|
|
|
_MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video(?:-js)?|audio)' |
|
|
|
|
media_tags = [(media_tag, media_tag_name, media_type, '') |
|
|
|
|
for media_tag, media_tag_name, media_type |
|
|
|
|
in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)] |
|
|
|
@ -2540,7 +2584,8 @@ class InfoExtractor(object): |
|
|
|
|
media_attributes = extract_attributes(media_tag) |
|
|
|
|
src = strip_or_none(media_attributes.get('src')) |
|
|
|
|
if src: |
|
|
|
|
_, formats = _media_formats(src, media_type) |
|
|
|
|
f = parse_content_type(media_attributes.get('type')) |
|
|
|
|
_, formats = _media_formats(src, media_type, f) |
|
|
|
|
media_info['formats'].extend(formats) |
|
|
|
|
media_info['thumbnail'] = absolute_url(media_attributes.get('poster')) |
|
|
|
|
if media_content: |
|
|
|
|