[joj] Rewrite and add support for generic embeds (closes #13268)
This commit is contained in:
		| @@ -91,6 +91,7 @@ from .anvato import AnvatoIE | ||||
| from .washingtonpost import WashingtonPostIE | ||||
| from .wistia import WistiaIE | ||||
| from .mediaset import MediasetIE | ||||
| from .joj import JojIE | ||||
|  | ||||
|  | ||||
| class GenericIE(InfoExtractor): | ||||
| @@ -1770,6 +1771,16 @@ class GenericIE(InfoExtractor): | ||||
|             }, | ||||
|             'add_ie': [MediasetIE.ie_key()], | ||||
|         }, | ||||
|         { | ||||
|             # JOJ.sk embeds | ||||
|             'url': 'https://www.noviny.sk/slovensko/238543-slovenskom-sa-prehnala-vlna-silnych-burok', | ||||
|             'info_dict': { | ||||
|                 'id': '238543-slovenskom-sa-prehnala-vlna-silnych-burok', | ||||
|                 'title': 'Slovenskom sa prehnala vlna silných búrok', | ||||
|             }, | ||||
|             'playlist_mincount': 5, | ||||
|             'add_ie': [JojIE.ie_key()], | ||||
|         }, | ||||
|         { | ||||
|             # AMP embed (see https://www.ampproject.org/docs/reference/components/amp-video) | ||||
|             'url': 'https://tvrain.ru/amp/418921/', | ||||
| @@ -2722,6 +2733,12 @@ class GenericIE(InfoExtractor): | ||||
|             return self.playlist_from_matches( | ||||
|                 mediaset_urls, video_id, video_title, ie=MediasetIE.ie_key()) | ||||
|  | ||||
|         # Look for JOJ.sk embeds | ||||
|         joj_urls = JojIE._extract_urls(webpage) | ||||
|         if joj_urls: | ||||
|             return self.playlist_from_matches( | ||||
|                 joj_urls, video_id, video_title, ie=JojIE.ie_key()) | ||||
|  | ||||
|         def merge_dicts(dict1, dict2): | ||||
|             merged = {} | ||||
|             for k, v in dict1.items(): | ||||
|   | ||||
| @@ -1,56 +1,100 @@ | ||||
| # coding: utf-8 | ||||
| from __future__ import unicode_literals | ||||
|  | ||||
| from .common import InfoExtractor | ||||
| import re | ||||
|  | ||||
| from .common import InfoExtractor | ||||
| from ..compat import compat_str | ||||
| from ..utils import ( | ||||
|     int_or_none, | ||||
|     js_to_json, | ||||
|     try_get, | ||||
| ) | ||||
|  | ||||
|  | ||||
class JojIE(InfoExtractor):
    """Extractor for clips served by the media.joj.sk embed player.

    Matches either a full ``https://media.joj.sk/embed/<uuid>`` URL or the
    ``joj:<uuid>`` shorthand; in both cases the UUID is the video id.
    """

    _VALID_URL = r'''(?x)
                    (?:
                        joj:|
                        https?://media\.joj\.sk/embed/
                    )
                    (?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})
                '''
    _TESTS = [{
        'url': 'https://media.joj.sk/embed/a388ec4c-6019-4a4a-9312-b1bee194e932',
        'info_dict': {
            'id': 'a388ec4c-6019-4a4a-9312-b1bee194e932',
            'ext': 'mp4',
            'title': 'NOVÉ BÝVANIE',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 3118,
        }
    }, {
        'url': 'joj:a388ec4c-6019-4a4a-9312-b1bee194e932',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_urls(webpage):
        """Return the ``src`` URLs of all media.joj.sk embed iframes.

        The pattern has a single capturing group, so ``re.findall`` yields
        the matched URL strings directly (possibly protocol-relative, i.e.
        starting with ``//``). An empty list means no embed was found.
        """
        return re.findall(
            r'<iframe\b[^>]+\bsrc=["\'](?P<url>(?:https?:)?//media\.joj\.sk/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})',
            webpage)

    def _real_extract(self, url):
        """Build the info dict for a single JOJ clip.

        Formats are taken from the ``bitrates`` JavaScript object on the
        embed page; when that yields nothing, the ``Video.php`` XML playlist
        service is queried instead.
        """
        video_id = self._match_id(url)

        # Always fetch the canonical embed page so that the ``joj:<uuid>``
        # shorthand and full embed URLs are handled identically.
        webpage = self._download_webpage(
            'https://media.joj.sk/embed/%s' % video_id, video_id)

        # Prefer the player's own title, then <title>, then OpenGraph.
        title = self._search_regex(
            (r'videoTitle\s*:\s*(["\'])(?P<title>(?:(?!\1).)+)\1',
             r'<title>(?P<title>[^<]+)'), webpage, 'title',
            default=None, group='title') or self._og_search_title(webpage)

        # A missing or unparsable ``bitrates`` object is tolerated: the
        # regex defaults to '{}' and JSON parsing is non-fatal.
        bitrates = self._parse_json(
            self._search_regex(
                r'(?s)bitrates\s*=\s*({.+?});', webpage, 'bitrates',
                default='{}'),
            video_id, transform_source=js_to_json, fatal=False)

        formats = []
        for format_url in try_get(bitrates, lambda x: x['mp4'], list) or []:
            if isinstance(format_url, compat_str):
                height = self._search_regex(
                    r'(\d+)[pP]\.', format_url, 'height', default=None)
                formats.append({
                    'url': format_url,
                    'format_id': ('%sp' % height) if height else None,
                    # height is None when the URL carries no "<digits>p."
                    # marker; int_or_none keeps such formats instead of
                    # raising TypeError as a bare int() would.
                    'height': int_or_none(height),
                })
        if not formats:
            # Fallback: XML playlist listing the available files.
            playlist = self._download_xml(
                'https://media.joj.sk/services/Video.php?clip=%s' % video_id,
                video_id)
            for file_el in playlist.findall('./files/file'):
                path = file_el.get('path')
                if not path:
                    continue
                format_id = file_el.get('id') or file_el.get('label')
                formats.append({
                    # Only the first 'dat/' occurrence is removed from the
                    # path before it is appended to the storage host.
                    'url': 'http://n16.joj.sk/storage/%s' % path.replace(
                        'dat/', '', 1),
                    'format_id': format_id,
                    'height': int_or_none(self._search_regex(
                        r'(\d+)[pP]', format_id or path, 'height',
                        default=None)),
                })
        self._sort_formats(formats)

        thumbnail = self._og_search_thumbnail(webpage)

        duration = int_or_none(self._search_regex(
            r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False))

        return {
            'id': video_id,
            'title': title,
            'thumbnail': thumbnail,
            'duration': duration,
            'formats': formats,
        }
|   | ||||
		Reference in New Issue
	
	Block a user