From a572ae6114deca4e8f2f0365ca7091749f01deaf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 12 Jun 2018 01:35:23 +0700 Subject: [PATCH] [tvnet] Improve and fix issues (closes #15462) --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/tvnet.py | 133 +++++++++++++++++++++++++++++ youtube_dl/extractor/vtv.py | 91 -------------------- 3 files changed, 134 insertions(+), 92 deletions(-) create mode 100644 youtube_dl/extractor/tvnet.py delete mode 100644 youtube_dl/extractor/vtv.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index e6d1fe70e..d4583b8e4 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1139,6 +1139,7 @@ from .tvc import ( from .tvigle import TvigleIE from .tvland import TVLandIE from .tvn24 import TVN24IE +from .tvnet import TVNetIE from .tvnoe import TVNoeIE from .tvnow import ( TVNowIE, @@ -1306,7 +1307,6 @@ from .vrv import ( VRVSeriesIE, ) from .vshare import VShareIE -from .vtv import VTVIE from .medialaan import MedialaanIE from .vube import VubeIE from .vuclip import VuClipIE diff --git a/youtube_dl/extractor/tvnet.py b/youtube_dl/extractor/tvnet.py new file mode 100644 index 000000000..0ec2da4da --- /dev/null +++ b/youtube_dl/extractor/tvnet.py @@ -0,0 +1,133 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + unescapeHTML, +) + + +class TVNetIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^/]+)\.tvnet\.gov\.vn/[^/]+/(?P[0-9]+)' + _TESTS = [{ + # video + 'url': 'http://de.tvnet.gov.vn/video/109788/vtv1---bac-tuyet-tai-lao-cai-va-ha-giang/tin-nong-24h', + 'md5': 'b4d7abe0252c9b47774760b7519c7558', + 'info_dict': { + 'id': '109788', + 'ext': 'mp4', + 'title': 'VTV1 - Bắc tuyết tại Lào Cai và Hà Giang', + 'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)', + 'is_live': False, + 'view_count': int, + }, + }, { + # audio + 'url': 'http://vn.tvnet.gov.vn/radio/27017/vov1---ban-tin-chieu-10062018/doi-song-va-xa-hoi', + 'md5': 'b5875ce9b0a2eecde029216d0e6db2ae', + 'info_dict': { + 'id': '27017', + 'ext': 'm4a', + 'title': 'VOV1 - Bản tin chiều (10/06/2018)', + 'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)', + 'is_live': False, + }, + }, { + # live stream + 'url': 'http://us.tvnet.gov.vn/kenh-truyen-hinh/1011/vtv1', + 'info_dict': { + 'id': '1011', + 'ext': 'mp4', + 'title': r're:^VTV1 \| LiveTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + }, + }, { + # radio live stream + 'url': 'http://vn.tvnet.gov.vn/kenh-truyen-hinh/1014', + 'info_dict': { + 'id': '1014', + 'ext': 'm4a', + 'title': r're:VOV1 \| LiveTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = self._og_search_title( + webpage, default=None) or self._html_search_meta( + 'title', webpage, default=None) or self._search_regex( + r'([^<]+)<', webpage, 'title') + title = re.sub(r'\s*-\s*TV Net\s*$', '', title) + + if '/video/' in url or '/radio/' in url: + is_live = False + elif '/kenh-truyen-hinh/' in url: + is_live = True + else: + is_live = None + + data_file = unescapeHTML(self._search_regex( + r'data-file=(["\'])(?P<url>(?:https?:)?//.+?)\1', webpage, + 'data file', group='url')) + + stream_urls = set() + formats = [] + for stream in self._download_json(data_file, video_id): + if not isinstance(stream, dict): + continue + stream_url = stream.get('url') + if (stream_url in stream_urls or not stream_url or + not isinstance(stream_url, compat_str)): + continue + stream_urls.add(stream_url) + formats.extend(self._extract_m3u8_formats( + stream_url, video_id, 'mp4', + entry_protocol='m3u8' if is_live else 'm3u8_native', + m3u8_id='hls', fatal=False)) + self._sort_formats(formats) + + # better support for radio streams + if title.startswith('VOV'): + for f in formats: + f.update({ + 'ext': 'm4a', + 'vcodec': 'none', + }) + + thumbnail = self._og_search_thumbnail( + webpage, default=None) or unescapeHTML( + self._search_regex( + r'data-image=(["\'])(?P<url>(?:https?:)?//.+?)\1', webpage, + 'thumbnail', default=None, group='url')) + + if is_live: + title = self._live_title(title) + + view_count = int_or_none(self._search_regex( + r'(?s)<div[^>]+\bclass=["\'].*?view-count[^>]+>.*?(\d+).*?</div>', + webpage, 'view count', default=None)) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'is_live': is_live, + 'view_count': view_count, + 'formats': formats, + } diff --git a/youtube_dl/extractor/vtv.py b/youtube_dl/extractor/vtv.py deleted file mode 100644 index a9683dd85..000000000 --- a/youtube_dl/extractor/vtv.py +++ /dev/null @@ -1,91 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor - -import re - -from ..utils import extract_attributes - -class VTVIE(InfoExtractor): - _VALID_URL = r'https?://(au|ca|cz|de|jp|kr|tw|us|vn)\.tvnet\.gov\.vn/[^/]*/(?P<id>[0-9]+)/?' - _TESTS = [{ - # Livestream. Channel: VTV 1 - 'url': 'http://us.tvnet.gov.vn/kenh-truyen-hinh/1011/vtv1', - 'info_dict': { - 'id': '1011', - 'ext': 'mp4', - 'title': r're:^VTV1 | LiveTV - TV Net [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'thumbnail': r're:https?://.*\.png$', - } - }, { - # Downloading a video. - 'url': 'http://de.tvnet.gov.vn/video/109788/vtv1---bac-tuyet-tai-lao-cai-va-ha-giang/tin-nong-24h', - 'md5': '5263c63d738569ed507980f1e49ebc03', - 'info_dict': { - 'id': '109788', - 'ext': 'mp4', - 'title': 'VTV1 - Bắc tuyết tại Lào Cai và Hà Giang - TV Net', - 'thumbnail': r're:https?://.*\.JPG$', - } - }, { - # Radio live stream. Channel: VOV 1 - 'url': 'http://vn.tvnet.gov.vn/kenh-truyen-hinh/1014', - 'info_dict': { - 'id': '1014', - 'ext': 'm4a', - 'vcodec': 'none', - 'title': r're:VOV1 | LiveTV - TV Net [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'thumbnail': r're:https?://.*\.png$', - } - - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - title = self._html_search_regex(r'<title>(.+?)', webpage, 'title', default=None, fatal=False) - if title is None: - title = self._og_search_title(webpage) - title.strip() - - mediaplayer_div = self._search_regex(r'(]*id="mediaplayer"[^>]*>)', webpage, 'mediaplayer element') - mediaplayer_div_attributes = extract_attributes(mediaplayer_div) - - thumbnail = mediaplayer_div_attributes.get("data-image") - - json_url = mediaplayer_div_attributes["data-file"] - video_streams = self._download_json(json_url, video_id) - - - # get any working playlist from streams. Currently there's 2 and the first always works, - # but you never know in the future - for stream in video_streams: - formats = self._extract_m3u8_formats(stream.get("url"), video_id, ext="mp4", fatal=False) - if formats: - break - - # better support radio streams - if title.startswith("VOV"): - for f in formats: - f["ext"] = "m4a" - f["vcodec"] = "none" - - if "/video/" in url or "/radio/" in url: - is_live = False - elif "/kenh-truyen-hinh/" in url: - is_live = True - else: - is_live = None - - if is_live: - title = self._live_title(title) - - return { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'formats': formats, - 'is_live': is_live, - }