From 6d0d4fc26d45c55ef6e99b31892047b0bdfed0e0 Mon Sep 17 00:00:00 2001
From: Yen Chi Hsuan
Date: Sat, 18 Jun 2016 13:40:55 +0800
Subject: [PATCH] [wdr] Add WDRBaseIE, for Sportschau (#9799)

---
 youtube_dl/extractor/wdr.py | 186 ++++++++++++++++++++----------------
 1 file changed, 102 insertions(+), 84 deletions(-)

diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py
index 6b83a2a04..390f9e830 100644
--- a/youtube_dl/extractor/wdr.py
+++ b/youtube_dl/extractor/wdr.py
@@ -15,7 +15,92 @@ from ..utils import (
 )
 
 
-class WDRIE(InfoExtractor):
+class WDRBaseIE(InfoExtractor):
+    def _extract_wdr_video(self, webpage, display_id):
+        """Build an info dict from the WDR player metadata linked in webpage.
+
+        Returns None when webpage has no data-extension media link, so
+        that callers can fall back to other handling (e.g. playlists).
+        """
+        # for wdr.de the data-extension is in a tag with the class "mediaLink"
+        # for wdr.de radio players, in a tag with the class "wdrrPlayerPlayBtn"
+        # for wdrmaus it's in a link to the page in a multiline "videoLink"-tag
+        json_metadata = self._html_search_regex(
+            r'class=(?:"(?:mediaLink|wdrrPlayerPlayBtn)\b[^"]*"[^>]+|"videoLink\b[^"]*"[\s]*>\n[^\n]*)data-extension="([^"]+)"',
+            webpage, 'media link', default=None, flags=re.MULTILINE)
+
+        if not json_metadata:
+            return
+
+        media_link_obj = self._parse_json(json_metadata, display_id,
+                                          transform_source=js_to_json)
+        jsonp_url = media_link_obj['mediaObj']['url']
+
+        metadata = self._download_json(
+            jsonp_url, display_id, transform_source=strip_jsonp)
+
+        metadata_tracker_data = metadata['trackerData']
+        metadata_media_resource = metadata['mediaResource']
+
+        formats = []
+
+        # check if the metadata contains a direct URL to a file
+        for kind, media_resource in metadata_media_resource.items():
+            if kind not in ('dflt', 'alt'):
+                continue
+
+            for tag_name, medium_url in media_resource.items():
+                if tag_name not in ('videoURL', 'audioURL'):
+                    continue
+
+                ext = determine_ext(medium_url)
+                if ext == 'm3u8':
+                    formats.extend(self._extract_m3u8_formats(
+                        medium_url, display_id, 'mp4', 'm3u8_native',
+                        m3u8_id='hls'))
+                elif ext == 'f4m':
+                    manifest_url = update_url_query(
+                        medium_url, {'hdcore': '3.2.0', 'plugin': 'aasp-3.2.0.77.18'})
+                    formats.extend(self._extract_f4m_formats(
+                        manifest_url, display_id, f4m_id='hds', fatal=False))
+                elif ext == 'smil':
+                    formats.extend(self._extract_smil_formats(
+                        medium_url, display_id, fatal=False))
+                else:
+                    a_format = {
+                        'url': medium_url
+                    }
+                    if ext == 'unknown_video':
+                        urlh = self._request_webpage(
+                            medium_url, display_id, note='Determining extension')
+                        ext = urlhandle_detect_ext(urlh)
+                        a_format['ext'] = ext
+                    formats.append(a_format)
+
+        self._sort_formats(formats)
+
+        subtitles = {}
+        caption_url = metadata_media_resource.get('captionURL')
+        if caption_url:
+            subtitles['de'] = [{
+                'url': caption_url,
+                'ext': 'ttml',
+            }]
+
+        title = metadata_tracker_data['trackerClipTitle']
+
+        return {
+            'id': metadata_tracker_data.get('trackerClipId', display_id),
+            'display_id': display_id,
+            'title': title,
+            'alt_title': metadata_tracker_data.get('trackerClipSubcategory'),
+            'formats': formats,
+            'subtitles': subtitles,
+            'upload_date': unified_strdate(metadata_tracker_data.get('trackerClipAirTime')),
+        }
+
+
+class WDRIE(WDRBaseIE):
     _CURRENT_MAUS_URL = r'https?://(?:www\.)wdrmaus.de/(?:[^/]+/){1,2}[^/?#]+\.php5'
     _PAGE_REGEX = r'/(?:mediathek/)?[^/]+/(?P[^/]+)/(?P.+)\.html'
     _VALID_URL = r'(?Phttps?://(?:www\d\.)?wdr\d?\.de)' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL
@@ -91,10 +176,10 @@ class WDRIE(InfoExtractor):
         },
         {
             'url': 'http://www.wdrmaus.de/sachgeschichten/sachgeschichten/achterbahn.php5',
-            # HDS download, MD5 is unstable
+            'md5': '803138901f6368ee497b4d195bb164f2',
             'info_dict': {
                 'id': 'mdb-186083',
-                'ext': 'flv',
+                'ext': 'mp4',
                 'upload_date': '20130919',
                 'title': 'Sachgeschichte - Achterbahn ',
                 'description': '- Die Sendung mit der Maus -',
@@ -120,14 +205,9 @@ class WDRIE(InfoExtractor):
         display_id = mobj.group('display_id')
         webpage = self._download_webpage(url, display_id)
 
-        # for wdr.de the data-extension is in a tag with the class "mediaLink"
-        # for wdr.de radio players, in a tag with the class "wdrrPlayerPlayBtn"
-        # for wdrmaus its in a link to the page in a multiline "videoLink"-tag
-        json_metadata = self._html_search_regex(
-            r'class=(?:"(?:mediaLink|wdrrPlayerPlayBtn)\b[^"]*"[^>]+|"videoLink\b[^"]*"[\s]*>\n[^\n]*)data-extension="([^"]+)"',
-            webpage, 'media link', default=None, flags=re.MULTILINE)
+        info_dict = self._extract_wdr_video(webpage, display_id)
 
-        if not json_metadata:
+        if not info_dict:
             entries = [
                 self.url_result(page_url + href[0], 'WDR')
                 for href in re.findall(
@@ -140,86 +220,24 @@ class WDRIE(InfoExtractor):
 
             raise ExtractorError('No downloadable streams found', expected=True)
 
-        media_link_obj = self._parse_json(json_metadata, display_id,
-                                          transform_source=js_to_json)
-        jsonp_url = media_link_obj['mediaObj']['url']
-
-        metadata = self._download_json(
-            jsonp_url, 'metadata', transform_source=strip_jsonp)
-
-        metadata_tracker_data = metadata['trackerData']
-        metadata_media_resource = metadata['mediaResource']
-
-        formats = []
-
-        # check if the metadata contains a direct URL to a file
-        for kind, media_resource in metadata_media_resource.items():
-            if kind not in ('dflt', 'alt'):
-                continue
-
-            for tag_name, medium_url in media_resource.items():
-                if tag_name not in ('videoURL', 'audioURL'):
-                    continue
-
-                ext = determine_ext(medium_url)
-                if ext == 'm3u8':
-                    formats.extend(self._extract_m3u8_formats(
-                        medium_url, display_id, 'mp4', 'm3u8_native',
-                        m3u8_id='hls'))
-                elif ext == 'f4m':
-                    manifest_url = update_url_query(
-                        medium_url, {'hdcore': '3.2.0', 'plugin': 'aasp-3.2.0.77.18'})
-                    formats.extend(self._extract_f4m_formats(
-                        manifest_url, display_id, f4m_id='hds', fatal=False))
-                elif ext == 'smil':
-                    formats.extend(self._extract_smil_formats(
-                        medium_url, 'stream', fatal=False))
-                else:
-                    a_format = {
-                        'url': medium_url
-                    }
-                    if ext == 'unknown_video':
-                        urlh = self._request_webpage(
-                            medium_url, display_id, note='Determining extension')
-                        ext = urlhandle_detect_ext(urlh)
-                        a_format['ext'] = ext
-                    formats.append(a_format)
-
-        self._sort_formats(formats)
-
-        subtitles = {}
-        caption_url = metadata_media_resource.get('captionURL')
-        if caption_url:
-            subtitles['de'] = [{
-                'url': caption_url,
-                'ext': 'ttml',
-            }]
-
-        title = metadata_tracker_data.get('trackerClipTitle')
         is_live = url_type == 'live'
 
         if is_live:
-            title = self._live_title(title)
-            upload_date = None
-        elif 'trackerClipAirTime' in metadata_tracker_data:
-            upload_date = metadata_tracker_data['trackerClipAirTime']
-        else:
-            upload_date = self._html_search_meta('DC.Date', webpage, 'upload date')
+            info_dict.update({
+                'title': self._live_title(info_dict['title']),
+                'upload_date': None,
+            })
+        elif not info_dict.get('upload_date'):
+            # _extract_wdr_video always sets the 'upload_date' key, so test
+            # its value rather than its presence
+            info_dict['upload_date'] = unified_strdate(self._html_search_meta('DC.Date', webpage, 'upload date'))
 
-        if upload_date:
-            upload_date = unified_strdate(upload_date)
-
-        return {
-            'id': metadata_tracker_data.get('trackerClipId', display_id),
-            'display_id': display_id,
-            'title': title,
-            'alt_title': metadata_tracker_data.get('trackerClipSubcategory'),
-            'formats': formats,
-            'upload_date': upload_date,
+        info_dict.update({
             'description': self._html_search_meta('Description', webpage),
             'is_live': is_live,
-            'subtitles': subtitles,
-        }
+        })
+
+        return info_dict
 
 
 class WDRMobileIE(InfoExtractor):