From 5d73273f6f458970b34b3c6f4c8bd18fbad9c1ca Mon Sep 17 00:00:00 2001
From: Philipp Hagemeister
Date: Mon, 6 Jan 2014 17:15:27 +0100
Subject: [PATCH] [orf] Use new extraction method (Fixes #2057)

---
 youtube_dl/extractor/common.py |   5 ++
 youtube_dl/extractor/orf.py    | 116 +++++++++++++++++++++++----------
 youtube_dl/utils.py            |   1 +
 3 files changed, 86 insertions(+), 36 deletions(-)

diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index f498bcf6f..2a5e8076c 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -73,6 +73,10 @@ class InfoExtractor(object):
                                  by this field.
                                  -1 for default (order by other properties),
                                  -2 or smaller for less than default.
+                    * quality    Order number of the video quality of this
+                                 format, irrespective of the file format.
+                                 -1 for default (order by other properties),
+                                 -2 or smaller for less than default.
     url:            Final video URL.
     ext:            Video filename extension.
     format:         The video format, defaults to ext (used for --get-format)
@@ -483,6 +487,7 @@ class InfoExtractor(object):
 
             return (
                 preference,
+                f.get('quality') if f.get('quality') is not None else -1,
                 f.get('height') if f.get('height') is not None else -1,
                 f.get('width') if f.get('width') is not None else -1,
                 ext_preference,
diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py
index b42eae89a..88f03608b 100644
--- a/youtube_dl/extractor/orf.py
+++ b/youtube_dl/extractor/orf.py
@@ -1,54 +1,98 @@
 # coding: utf-8
+from __future__ import unicode_literals
 
-import re
-import xml.etree.ElementTree
 import json
+import re
 
 from .common import InfoExtractor
 from ..utils import (
-    compat_urlparse,
-    ExtractorError,
-    find_xpath_attr,
+    HEADRequest,
+    unified_strdate,
 )
 
+
 class ORFIE(InfoExtractor):
-    _VALID_URL = r'https?://tvthek\.orf\.at/(programs/.+?/episodes|topics/.+?)/(?P<id>\d+)'
+    _VALID_URL = r'https?://tvthek\.orf\.at/(?:programs/.+?/episodes|topics/.+?|program/[^/]+)/(?P<id>\d+)'
+
+    _TEST = {
+        'url': 'http://tvthek.orf.at/program/matinee-Was-Sie-schon-immer-ueber-Klassik-wissen-wollten/7317210/Was-Sie-schon-immer-ueber-Klassik-wissen-wollten/7319746/Was-Sie-schon-immer-ueber-Klassik-wissen-wollten/7319747',
+        'file': '7319747.mp4',
+        'md5': 'bd803c5d8c32d3c64a0ea4b4eeddf375',
+        'info_dict': {
+            'title': 'Was Sie schon immer über Klassik wissen wollten',
+            'description': 'md5:0ddf0d5f0060bd53f744edaa5c2e04a4',
+            'duration': 3508,
+            'upload_date': '20140105',
+        },
+        'skip': 'Blocked outside of Austria',
+    }
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         playlist_id = mobj.group('id')
         webpage = self._download_webpage(url, playlist_id)
 
-        flash_xml = self._search_regex('ORF.flashXML = \'(.+?)\'', webpage, u'flash xml')
-        flash_xml = compat_urlparse.parse_qs('xml='+flash_xml)['xml'][0]
-        flash_config = xml.etree.ElementTree.fromstring(flash_xml.encode('utf-8'))
-        playlist_json = self._search_regex(r'playlist\': \'(\[.*?\])\'', webpage, u'playlist').replace(r'\"','"')
-        playlist = json.loads(playlist_json)
+        data_json = self._search_regex(
+            r'initializeAdworx\((.+?)\);\n', webpage, 'video info')
+        all_data = json.loads(data_json)
+        sdata = all_data[0]['values']['segments']
 
-        videos = []
-        ns = '{http://tempuri.org/XMLSchema.xsd}'
-        xpath = '%(ns)sPlaylist/%(ns)sItems/%(ns)sItem' % {'ns': ns}
-        webpage_description = self._og_search_description(webpage)
-        for (i, (item, info)) in enumerate(zip(flash_config.findall(xpath), playlist), 1):
-            # Get best quality url
-            rtmp_url = None
-            for q in ['Q6A', 'Q4A', 'Q1A']:
-                video_url = find_xpath_attr(item, '%sVideoUrl' % ns, 'quality', q)
-                if video_url is not None:
-                    rtmp_url = video_url.text
-                    break
-            if rtmp_url is None:
-                raise ExtractorError(u'Couldn\'t get video url: %s' % info['id'])
-            description = self._html_search_regex(
-                r'id="playlist_entry_%s".*?<p>(.*?)</p>' % i, webpage,
-                u'description', default=webpage_description, flags=re.DOTALL)
-            videos.append({
+        def quality_to_int(s):
+            m = re.search('([0-9]+)', s)
+            if m is None:
+                return -1
+            return int(m.group(1))
+
+        entries = []
+        for sd in sdata:
+            video_id = sd['id']
+            formats = [{
+                'preference': -10 if fd['delivery'] == 'hls' else None,
+                'format_id': '%s-%s-%s' % (
+                    fd['delivery'], fd['quality'], fd['quality_string']),
+                'url': fd['src'],
+                'protocol': fd['protocol'],
+                'quality': quality_to_int(fd['quality']),
+            } for fd in sd['playlist_item_array']['sources']]
+
+            # Check for geoblocking.
+            # There is a property is_geoprotection, but that's always false
+            geo_str = sd.get('geoprotection_string')
+            if geo_str:
+                try:
+                    http_url = next(
+                        f['url']
+                        for f in formats
+                        if re.match(r'^https?://.*\.mp4$', f['url']))
+                except StopIteration:
+                    pass
+                else:
+                    req = HEADRequest(http_url)
+                    response = self._request_webpage(
+                        req, video_id,
+                        note='Testing for geoblocking',
+                        errnote=((
+                            'This video seems to be blocked outside of %s. '
+                            'You may want to try the streaming-* formats.')
+                            % geo_str),
+                        fatal=False)
+
+            self._sort_formats(formats)
+
+            upload_date = unified_strdate(sd['created_date'])
+            entries.append({
                 '_type': 'video',
-                'id': info['id'],
-                'title': info['title'],
-                'url': rtmp_url,
-                'ext': 'flv',
-                'description': description,
-                })
+                'id': video_id,
+                'title': sd['header'],
+                'formats': formats,
+                'description': sd.get('description'),
+                'duration': int(sd['duration_in_seconds']),
+                'upload_date': upload_date,
+                'thumbnail': sd.get('image_full_url'),
+            })
 
-        return videos
+        return {
+            '_type': 'playlist',
+            'entries': entries,
+            'id': playlist_id,
+        }
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 536504e7e..918a127ca 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -764,6 +764,7 @@ def unified_strdate(date_str):
         '%Y-%m-%d',
         '%d/%m/%Y',
         '%Y/%m/%d %H:%M:%S',
+        '%Y-%m-%d %H:%M:%S',
         '%d.%m.%Y %H:%M',
         '%Y-%m-%dT%H:%M:%SZ',
         '%Y-%m-%dT%H:%M:%S.%fZ',