[tagesschau] Separate player extractor

This commit is contained in:
Sergey M․ 2016-05-01 06:44:59 +06:00
parent 9cf79e8f4b
commit 6a0f9a24d0
No known key found for this signature in database
GPG Key ID: 2C393E0F18A9236D
2 changed files with 168 additions and 72 deletions

View File

@ -724,7 +724,10 @@ from .svt import (
from .swrmediathek import SWRMediathekIE from .swrmediathek import SWRMediathekIE
from .syfy import SyfyIE from .syfy import SyfyIE
from .sztvhu import SztvHuIE from .sztvhu import SztvHuIE
from .tagesschau import TagesschauIE from .tagesschau import (
TagesschauPlayerIE,
TagesschauIE,
)
from .tapely import TapelyIE from .tapely import TapelyIE
from .tass import TassIE from .tass import TassIE
from .tdslifeway import TDSLifewayIE from .tdslifeway import TDSLifewayIE

View File

@ -6,10 +6,124 @@ import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
determine_ext, determine_ext,
js_to_json,
parse_iso8601,
parse_filesize, parse_filesize,
) )
class TagesschauPlayerIE(InfoExtractor):
    """Extractor for tagesschau.de ``~player`` pages (audio and video)."""

    IE_NAME = 'tagesschau:player'
    # The ``kind`` group must appear twice in the path (e.g. ``video/video-123``),
    # which is enforced by the ``(?P=kind)`` back-reference.
    _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?P<kind>audio|video)/(?P=kind)-(?P<id>\d+)~player(?:_[^/?#&]+)?\.html'

    _TESTS = [{
        'url': 'http://www.tagesschau.de/multimedia/video/video-179517~player.html',
        'md5': '8d09548d5c15debad38bee3a4d15ca21',
        'info_dict': {
            'id': '179517',
            'ext': 'mp4',
            'title': 'Marie Kristin Boese, ARD Berlin, über den zukünftigen Kurs der AfD',
            'thumbnail': r're:^https?:.*\.jpg$',
            'formats': 'mincount:6',
        },
    }, {
        'url': 'https://www.tagesschau.de/multimedia/audio/audio-29417~player.html',
        'md5': '76e6eec6ebd40740671cf0a2c88617e5',
        'info_dict': {
            'id': '29417',
            'ext': 'mp3',
            'title': 'Trabi - Bye, bye Rennpappe',
            'thumbnail': r're:^https?:.*\.jpg$',
            'formats': 'mincount:2',
        },
    }, {
        'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417~player_autoplay-true.html',
        'only_matching': True,
    }]

    # Extra format fields keyed by the ``quality`` label found in the player
    # page's media entries; merged into each format dict in _real_extract.
    _FORMATS = {
        'xs': {'quality': 0},
        's': {'width': 320, 'height': 180, 'quality': 1},
        'm': {'width': 512, 'height': 288, 'quality': 2},
        'l': {'width': 960, 'height': 540, 'quality': 3},
        'xl': {'width': 1280, 'height': 720, 'quality': 4},
        'xxl': {'quality': 5},
    }

    def _extract_via_api(self, kind, video_id):
        """Build an info dict from the tagesschau.de multimedia JSON API.

        ``kind`` is the media kind used in the API path ('audio' or 'video'
        per _VALID_URL) and ``video_id`` the numeric id.  The title is read
        from the mandatory ``headline`` key and the formats from every
        ``mediadata`` entry; a missing key raises KeyError.

        NOTE: currently not called by _real_extract (see the comment there).
        """
        info = self._download_json(
            'https://www.tagesschau.de/api/multimedia/{0}/{0}-{1}.json'.format(kind, video_id),
            video_id)

        title = info['headline']

        formats = []
        for media in info['mediadata']:
            for format_id, format_url in media.items():
                if determine_ext(format_url) == 'm3u8':
                    # HLS manifests expand into several formats.
                    formats.extend(self._extract_m3u8_formats(
                        format_url, video_id, 'mp4',
                        entry_protocol='m3u8_native', m3u8_id='hls'))
                else:
                    formats.append({
                        'url': format_url,
                        'format_id': format_id,
                        'vcodec': 'none' if kind == 'audio' else None,
                    })
        self._sort_formats(formats)

        timestamp = parse_iso8601(info.get('date'))

        return {
            'id': video_id,
            'title': title,
            'timestamp': timestamp,
            'formats': formats,
        }

    def _real_extract(self, url):
        """Extract title, thumbnail and formats by scraping the player page.

        Media entries are the ``{src: ..., type: ...}`` JavaScript object
        literals embedded in the page.  Entries that cannot be parsed or that
        carry no ``src`` are skipped.
        """
        video_id = self._match_id(url)

        # The JSON API (_extract_via_api) is deliberately not used here: it
        # does not provide some audio formats (e.g. ogg), thus everything is
        # extracted via the webpage.
        webpage = self._download_webpage(url, video_id)

        title = self._og_search_title(webpage).strip()

        formats = []
        for media_json in re.findall(r'({src\s*:\s*["\']http[^}]+type\s*:[^}]+})', webpage):
            media = self._parse_json(js_to_json(media_json), video_id, fatal=False)
            if not media:
                continue
            src = media.get('src')
            if not src:
                # Skip only this entry; returning here would make the whole
                # extraction yield None and drop formats already collected.
                continue
            quality = media.get('quality')
            # MIME major type, e.g. 'audio' from 'audio/mp3'.
            kind = media.get('type', '').split('/')[0]
            ext = determine_ext(src)
            f = {
                'url': src,
                'format_id': '%s_%s' % (quality, ext) if quality else ext,
                'ext': ext,
                'vcodec': 'none' if kind == 'audio' else None,
            }
            # Add width/height/quality for known quality labels.
            f.update(self._FORMATS.get(quality, {}))
            formats.append(f)
        self._sort_formats(formats)

        thumbnail = self._og_search_thumbnail(webpage)

        return {
            'id': video_id,
            'title': title,
            'thumbnail': thumbnail,
            'formats': formats,
        }
class TagesschauIE(InfoExtractor): class TagesschauIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?tagesschau\.de/[^/]+/(?:[^/]+/)*?[^/#?]+?(?P<id>-?[0-9]+)(?:~_?[^/#?]+?)?\.html' _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/[^/]+/(?:[^/]+/)*?[^/#?]+?(?P<id>-?[0-9]+)(?:~_?[^/#?]+?)?\.html'
@ -20,7 +134,7 @@ class TagesschauIE(InfoExtractor):
'id': '102143', 'id': '102143',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt', 'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt',
'description': 'md5:171feccd9d9b3dd54d05d501568f6359', 'description': '18.07.2015 20:10 Uhr',
'thumbnail': 're:^https?:.*\.jpg$', 'thumbnail': 're:^https?:.*\.jpg$',
}, },
}, { }, {
@ -29,18 +143,30 @@ class TagesschauIE(InfoExtractor):
'info_dict': { 'info_dict': {
'id': '5727', 'id': '5727',
'ext': 'mp4', 'ext': 'mp4',
'description': 'md5:695c01bfd98b7e313c501386327aea59',
'title': 'Sendung: tagesschau \t04.12.2014 20:00 Uhr', 'title': 'Sendung: tagesschau \t04.12.2014 20:00 Uhr',
'description': 'md5:695c01bfd98b7e313c501386327aea59',
'thumbnail': 're:^https?:.*\.jpg$', 'thumbnail': 're:^https?:.*\.jpg$',
}, },
}, { }, {
'url': 'http://www.tagesschau.de/multimedia/politikimradio/audio-18407.html', # exclusive audio
'md5': 'aef45de271c4bf0a5db834aa40bf774c', 'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417.html',
'md5': '76e6eec6ebd40740671cf0a2c88617e5',
'info_dict': { 'info_dict': {
'id': '18407', 'id': '29417',
'ext': 'mp3', 'ext': 'mp3',
'title': 'Flüchtlingsdebatte: Hitzig, aber wenig hilfreich', 'title': 'Trabi - Bye, bye Rennpappe',
'description': 'Flüchtlingsdebatte: Hitzig, aber wenig hilfreich', 'description': 'md5:8687dda862cbbe2cfb2df09b56341317',
'thumbnail': 're:^https?:.*\.jpg$',
},
}, {
# audio in article
'url': 'http://www.tagesschau.de/inland/bnd-303.html',
'md5': 'e0916c623e85fc1d2b26b78f299d3958',
'info_dict': {
'id': '303',
'ext': 'mp3',
'title': 'Viele Baustellen für neuen BND-Chef',
'description': 'md5:1e69a54be3e1255b2b07cdbce5bcd8b4',
'thumbnail': 're:^https?:.*\.jpg$', 'thumbnail': 're:^https?:.*\.jpg$',
}, },
}, { }, {
@ -71,19 +197,11 @@ class TagesschauIE(InfoExtractor):
}, { }, {
'url': 'http://www.tagesschau.de/multimedia/video/video-102303~_bab-sendung-211.html', 'url': 'http://www.tagesschau.de/multimedia/video/video-102303~_bab-sendung-211.html',
'only_matching': True, 'only_matching': True,
}, {
'url': 'http://www.tagesschau.de/multimedia/video/video-179517~player.html',
'only_matching': True,
}] }]
_FORMATS = { @classmethod
'xs': {'quality': 0}, def suitable(cls, url):
's': {'width': 320, 'height': 180, 'quality': 1}, return False if TagesschauPlayerIE.suitable(url) else super(TagesschauIE, cls).suitable(url)
'm': {'width': 512, 'height': 288, 'quality': 2},
'l': {'width': 960, 'height': 540, 'quality': 3},
'xl': {'width': 1280, 'height': 720, 'quality': 4},
'xxl': {'quality': 5},
}
def _extract_formats(self, download_text, media_kind): def _extract_formats(self, download_text, media_kind):
links = re.finditer( links = re.finditer(
@ -140,64 +258,39 @@ class TagesschauIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
display_id = video_id.lstrip('-') display_id = video_id.lstrip('-')
webpage = self._download_webpage(url, display_id) webpage = self._download_webpage(url, display_id)
player_url = self._html_search_meta( title = self._html_search_regex(
'twitter:player', webpage, 'player URL', default=None) r'<span[^>]*class="headline"[^>]*>(.+?)</span>',
if player_url: webpage, 'title', default=None) or self._og_search_title(webpage)
playerpage = self._download_webpage(
player_url, display_id, 'Downloading player page')
formats = [] DOWNLOAD_REGEX = r'(?s)<p>Wir bieten dieses (?P<kind>Video|Audio) in folgenden Formaten zum Download an:</p>\s*<div class="controls">(?P<links>.*?)</div>\s*<p>'
for media in re.finditer(
r'''(?x)
(?P<q_url>["\'])(?P<url>http://media.+?)(?P=q_url)
,\s*type:(?P<q_type>["\'])(?P<type>video|audio)/(?P<ext>.+?)(?P=q_type)
(?:,\s*quality:(?P<q_quality>["\'])(?P<quality>.+?)(?P=q_quality))?
''', playerpage):
url = media.group('url')
webpage_type = media.group('type')
ext = media.group('ext')
res = media.group('quality')
f = {
'format_id': '%s_%s' % (res, ext) if res else ext,
'url': url,
'ext': ext,
'vcodec': 'none' if webpage_type == 'audio' else None,
}
f.update(self._FORMATS.get(res, {}))
formats.append(f)
thumbnail = self._og_search_thumbnail(playerpage)
title = self._og_search_title(webpage).strip()
description = self._og_search_description(webpage).strip()
else:
title = self._html_search_regex(
r'<span class="headline".*?>(.*?)</span>', webpage, 'title')
DOWNLOAD_REGEX = r'(?s)<p>Wir bieten dieses (?P<kind>Video|Audio) in folgenden Formaten zum Download an:</p>\s*<div class="controls">(?P<links>.*?)</div>\s*<p>' webpage_type = self._og_search_property('type', webpage, default=None)
if webpage_type == 'website': # Article
webpage_type = self._og_search_property('type', webpage, default=None) entries = []
if webpage_type == 'website': # Article for num, (entry_title, media_kind, download_text) in enumerate(re.findall(
entries = [] r'(?s)<p[^>]+class="infotext"[^>]*>.*?<strong>(.+?)</strong>.*?</p>.*?%s' % DOWNLOAD_REGEX,
for num, (entry_title, media_kind, download_text) in enumerate(re.findall( webpage), 1):
r'(?s)<p[^>]+class="infotext"[^>]*>.*?<strong>(.+?)</strong>.*?</p>.*?%s' % DOWNLOAD_REGEX, entries.append({
webpage), 1): 'id': '%s-%d' % (display_id, num),
entries.append({ 'title': '%s' % entry_title,
'id': '%s-%d' % (display_id, num), 'formats': self._extract_formats(download_text, media_kind),
'title': '%s' % entry_title, })
'formats': self._extract_formats(download_text, media_kind), if len(entries) > 1:
})
return self.playlist_result(entries, display_id, title) return self.playlist_result(entries, display_id, title)
else: # Assume single video formats = entries[0]['formats']
download_text = self._search_regex( else: # Assume single video
DOWNLOAD_REGEX, webpage, 'download links', group='links') download_text = self._search_regex(
media_kind = self._search_regex( DOWNLOAD_REGEX, webpage, 'download links', group='links')
DOWNLOAD_REGEX, webpage, 'media kind', default='Video', group='links') media_kind = self._search_regex(
formats = self._extract_formats(download_text, media_kind) DOWNLOAD_REGEX, webpage, 'media kind', default='Video', group='kind')
thumbnail = self._og_search_thumbnail(webpage) formats = self._extract_formats(download_text, media_kind)
description = self._html_search_regex( thumbnail = self._og_search_thumbnail(webpage)
r'(?s)<p class="teasertext">(.*?)</p>', description = self._html_search_regex(
webpage, 'description', default=None) r'(?s)<p class="teasertext">(.*?)</p>',
webpage, 'description', default=None)
self._sort_formats(formats) self._sort_formats(formats)