| @@ -4,17 +4,10 @@ from __future__ import unicode_literals | ||||
| import re | ||||
|  | ||||
| from .common import InfoExtractor | ||||
| from ..compat import ( | ||||
|     compat_parse_qs, | ||||
|     compat_str, | ||||
|     compat_urllib_parse_urlparse, | ||||
| ) | ||||
| from ..compat import compat_str | ||||
| from ..utils import ( | ||||
|     ExtractorError, | ||||
|     find_xpath_attr, | ||||
|     get_element_by_attribute, | ||||
|     int_or_none, | ||||
|     NO_DEFAULT, | ||||
|     qualities, | ||||
|     try_get, | ||||
|     unified_strdate, | ||||
| @@ -25,59 +18,7 @@ from ..utils import ( | ||||
| # add tests. | ||||
|  | ||||
|  | ||||
| class ArteTvIE(InfoExtractor): | ||||
|     _VALID_URL = r'https?://videos\.arte\.tv/(?P<lang>fr|de|en|es)/.*-(?P<id>.*?)\.html' | ||||
|     IE_NAME = 'arte.tv' | ||||
|  | ||||
|     def _real_extract(self, url): | ||||
|         mobj = re.match(self._VALID_URL, url) | ||||
|         lang = mobj.group('lang') | ||||
|         video_id = mobj.group('id') | ||||
|  | ||||
|         ref_xml_url = url.replace('/videos/', '/do_delegate/videos/') | ||||
|         ref_xml_url = ref_xml_url.replace('.html', ',view,asPlayerXml.xml') | ||||
|         ref_xml_doc = self._download_xml( | ||||
|             ref_xml_url, video_id, note='Downloading metadata') | ||||
|         config_node = find_xpath_attr(ref_xml_doc, './/video', 'lang', lang) | ||||
|         config_xml_url = config_node.attrib['ref'] | ||||
|         config = self._download_xml( | ||||
|             config_xml_url, video_id, note='Downloading configuration') | ||||
|  | ||||
|         formats = [{ | ||||
|             'format_id': q.attrib['quality'], | ||||
|             # The playpath starts at 'mp4:', if we don't manually | ||||
|             # split the url, rtmpdump will incorrectly parse them | ||||
|             'url': q.text.split('mp4:', 1)[0], | ||||
|             'play_path': 'mp4:' + q.text.split('mp4:', 1)[1], | ||||
|             'ext': 'flv', | ||||
|             'quality': 2 if q.attrib['quality'] == 'hd' else 1, | ||||
|         } for q in config.findall('./urls/url')] | ||||
|         self._sort_formats(formats) | ||||
|  | ||||
|         title = config.find('.//name').text | ||||
|         thumbnail = config.find('.//firstThumbnailUrl').text | ||||
|         return { | ||||
|             'id': video_id, | ||||
|             'title': title, | ||||
|             'thumbnail': thumbnail, | ||||
|             'formats': formats, | ||||
|         } | ||||
|  | ||||
|  | ||||
| class ArteTVBaseIE(InfoExtractor): | ||||
|     @classmethod | ||||
|     def _extract_url_info(cls, url): | ||||
|         mobj = re.match(cls._VALID_URL, url) | ||||
|         lang = mobj.group('lang') | ||||
|         query = compat_parse_qs(compat_urllib_parse_urlparse(url).query) | ||||
|         if 'vid' in query: | ||||
|             video_id = query['vid'][0] | ||||
|         else: | ||||
|             # This is not a real id, it can be for example AJT for the news | ||||
|             # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal | ||||
|             video_id = mobj.group('id') | ||||
|         return video_id, lang | ||||
|  | ||||
|     def _extract_from_json_url(self, json_url, video_id, lang, title=None): | ||||
|         info = self._download_json(json_url, video_id) | ||||
|         player_info = info['videoJsonPlayer'] | ||||
| @@ -108,13 +49,15 @@ class ArteTVBaseIE(InfoExtractor): | ||||
|             'upload_date': unified_strdate(upload_date_str), | ||||
|             'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'), | ||||
|         } | ||||
|         qfunc = qualities(['HQ', 'MQ', 'EQ', 'SQ']) | ||||
|         qfunc = qualities(['MQ', 'HQ', 'EQ', 'SQ']) | ||||
|  | ||||
|         LANGS = { | ||||
|             'fr': 'F', | ||||
|             'de': 'A', | ||||
|             'en': 'E[ANG]', | ||||
|             'es': 'E[ESP]', | ||||
|             'it': 'E[ITA]', | ||||
|             'pl': 'E[POL]', | ||||
|         } | ||||
|  | ||||
|         langcode = LANGS.get(lang, lang) | ||||
| @@ -126,8 +69,8 @@ class ArteTVBaseIE(InfoExtractor): | ||||
|             l = re.escape(langcode) | ||||
|  | ||||
|             # Language preference from most to least priority | ||||
|             # Reference: section 5.6.3 of | ||||
|             # http://www.arte.tv/sites/en/corporate/files/complete-technical-guidelines-arte-geie-v1-05.pdf | ||||
|             # Reference: section 6.8 of | ||||
|             # https://www.arte.tv/sites/en/corporate/files/complete-technical-guidelines-arte-geie-v1-07-1.pdf | ||||
|             PREFERENCES = ( | ||||
|                 # original version in requested language, without subtitles | ||||
|                 r'VO{0}$'.format(l), | ||||
| @@ -193,274 +136,59 @@ class ArteTVBaseIE(InfoExtractor): | ||||
|  | ||||
| class ArteTVPlus7IE(ArteTVBaseIE): | ||||
|     IE_NAME = 'arte.tv:+7' | ||||
|     _VALID_URL = r'https?://(?:(?:www|sites)\.)?arte\.tv/(?:[^/]+/)?(?P<lang>fr|de|en|es)/(?:videos/)?(?:[^/]+/)*(?P<id>[^/?#&]+)' | ||||
|     _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>fr|de|en|es|it|pl)/videos/(?P<id>\d{6}-\d{3}-[AF])' | ||||
|  | ||||
|     _TESTS = [{ | ||||
|         'url': 'http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D', | ||||
|         'only_matching': True, | ||||
|     }, { | ||||
|         'url': 'http://sites.arte.tv/karambolage/de/video/karambolage-22', | ||||
|         'only_matching': True, | ||||
|     }, { | ||||
|         'url': 'http://www.arte.tv/de/videos/048696-000-A/der-kluge-bauch-unser-zweites-gehirn', | ||||
|         'only_matching': True, | ||||
|         'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/', | ||||
|         'info_dict': { | ||||
|             'id': '088501-000-A', | ||||
|             'ext': 'mp4', | ||||
|             'title': 'Mexico: Stealing Petrol to Survive', | ||||
|             'upload_date': '20190628', | ||||
|         }, | ||||
|     }] | ||||
|  | ||||
|     @classmethod | ||||
|     def suitable(cls, url): | ||||
|         return False if ArteTVPlaylistIE.suitable(url) else super(ArteTVPlus7IE, cls).suitable(url) | ||||
|  | ||||
|     def _real_extract(self, url): | ||||
|         video_id, lang = self._extract_url_info(url) | ||||
|         webpage = self._download_webpage(url, video_id) | ||||
|         return self._extract_from_webpage(webpage, video_id, lang) | ||||
|  | ||||
|     def _extract_from_webpage(self, webpage, video_id, lang): | ||||
|         patterns_templates = (r'arte_vp_url=["\'](.*?%s.*?)["\']', r'data-url=["\']([^"]+%s[^"]+)["\']') | ||||
|         ids = (video_id, '') | ||||
|         # some pages contain multiple videos (like | ||||
|         # http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D), | ||||
|         # so we first try to look for json URLs that contain the video id from | ||||
|         # the 'vid' parameter. | ||||
|         patterns = [t % re.escape(_id) for _id in ids for t in patterns_templates] | ||||
|         json_url = self._html_search_regex( | ||||
|             patterns, webpage, 'json vp url', default=None) | ||||
|         if not json_url: | ||||
|             def find_iframe_url(webpage, default=NO_DEFAULT): | ||||
|                 return self._html_search_regex( | ||||
|                     r'<iframe[^>]+src=(["\'])(?P<url>.+\bjson_url=.+?)\1', | ||||
|                     webpage, 'iframe url', group='url', default=default) | ||||
|  | ||||
|             iframe_url = find_iframe_url(webpage, None) | ||||
|             if not iframe_url: | ||||
|                 embed_url = self._html_search_regex( | ||||
|                     r'arte_vp_url_oembed=\'([^\']+?)\'', webpage, 'embed url', default=None) | ||||
|                 if embed_url: | ||||
|                     player = self._download_json( | ||||
|                         embed_url, video_id, 'Downloading player page') | ||||
|                     iframe_url = find_iframe_url(player['html']) | ||||
|             # en and es URLs produce react-based pages with different layout (e.g. | ||||
|             # http://www.arte.tv/guide/en/053330-002-A/carnival-italy?zone=world) | ||||
|             if not iframe_url: | ||||
|                 program = self._search_regex( | ||||
|                     r'program\s*:\s*({.+?["\']embed_html["\'].+?}),?\s*\n', | ||||
|                     webpage, 'program', default=None) | ||||
|                 if program: | ||||
|                     embed_html = self._parse_json(program, video_id) | ||||
|                     if embed_html: | ||||
|                         iframe_url = find_iframe_url(embed_html['embed_html']) | ||||
|             if iframe_url: | ||||
|                 json_url = compat_parse_qs( | ||||
|                     compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0] | ||||
|         if json_url: | ||||
|             title = self._search_regex( | ||||
|                 r'<h3[^>]+title=(["\'])(?P<title>.+?)\1', | ||||
|                 webpage, 'title', default=None, group='title') | ||||
|             return self._extract_from_json_url(json_url, video_id, lang, title=title) | ||||
|         # Different kind of embed URL (e.g. | ||||
|         # http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium) | ||||
|         entries = [ | ||||
|             self.url_result(url) | ||||
|             for _, url in re.findall(r'<iframe[^>]+src=(["\'])(?P<url>.+?)\1', webpage)] | ||||
|         return self.playlist_result(entries) | ||||
|  | ||||
|  | ||||
| # It also uses the arte_vp_url url from the webpage to extract the information | ||||
| class ArteTVCreativeIE(ArteTVPlus7IE): | ||||
|     IE_NAME = 'arte.tv:creative' | ||||
|     _VALID_URL = r'https?://creative\.arte\.tv/(?P<lang>fr|de|en|es)/(?:[^/]+/)*(?P<id>[^/?#&]+)' | ||||
|  | ||||
|     _TESTS = [{ | ||||
|         'url': 'http://creative.arte.tv/fr/episode/osmosis-episode-1', | ||||
|         'info_dict': { | ||||
|             'id': '057405-001-A', | ||||
|             'ext': 'mp4', | ||||
|             'title': 'OSMOSIS - N\'AYEZ PLUS PEUR D\'AIMER (1)', | ||||
|             'upload_date': '20150716', | ||||
|         }, | ||||
|     }, { | ||||
|         'url': 'http://creative.arte.tv/fr/Monty-Python-Reunion', | ||||
|         'playlist_count': 11, | ||||
|         'add_ie': ['Youtube'], | ||||
|     }, { | ||||
|         'url': 'http://creative.arte.tv/de/episode/agentur-amateur-4-der-erste-kunde', | ||||
|         'only_matching': True, | ||||
|     }] | ||||
|  | ||||
|  | ||||
| class ArteTVInfoIE(ArteTVPlus7IE): | ||||
|     IE_NAME = 'arte.tv:info' | ||||
|     _VALID_URL = r'https?://info\.arte\.tv/(?P<lang>fr|de|en|es)/(?:[^/]+/)*(?P<id>[^/?#&]+)' | ||||
|  | ||||
|     _TESTS = [{ | ||||
|         'url': 'http://info.arte.tv/fr/service-civique-un-cache-misere', | ||||
|         'info_dict': { | ||||
|             'id': '067528-000-A', | ||||
|             'ext': 'mp4', | ||||
|             'title': 'Service civique, un cache misère ?', | ||||
|             'upload_date': '20160403', | ||||
|         }, | ||||
|     }] | ||||
|  | ||||
|  | ||||
| class ArteTVFutureIE(ArteTVPlus7IE): | ||||
|     IE_NAME = 'arte.tv:future' | ||||
|     _VALID_URL = r'https?://future\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>[^/?#&]+)' | ||||
|  | ||||
|     _TESTS = [{ | ||||
|         'url': 'http://future.arte.tv/fr/info-sciences/les-ecrevisses-aussi-sont-anxieuses', | ||||
|         'info_dict': { | ||||
|             'id': '050940-028-A', | ||||
|             'ext': 'mp4', | ||||
|             'title': 'Les écrevisses aussi peuvent être anxieuses', | ||||
|             'upload_date': '20140902', | ||||
|         }, | ||||
|     }, { | ||||
|         'url': 'http://future.arte.tv/fr/la-science-est-elle-responsable', | ||||
|         'only_matching': True, | ||||
|     }] | ||||
|  | ||||
|  | ||||
| class ArteTVDDCIE(ArteTVPlus7IE): | ||||
|     IE_NAME = 'arte.tv:ddc' | ||||
|     _VALID_URL = r'https?://ddc\.arte\.tv/(?P<lang>emission|folge)/(?P<id>[^/?#&]+)' | ||||
|  | ||||
|     _TESTS = [] | ||||
|  | ||||
|     def _real_extract(self, url): | ||||
|         video_id, lang = self._extract_url_info(url) | ||||
|         if lang == 'folge': | ||||
|             lang = 'de' | ||||
|         elif lang == 'emission': | ||||
|             lang = 'fr' | ||||
|         webpage = self._download_webpage(url, video_id) | ||||
|         scriptElement = get_element_by_attribute('class', 'visu_video_block', webpage) | ||||
|         script_url = self._html_search_regex(r'src="(.*?)"', scriptElement, 'script url') | ||||
|         javascriptPlayerGenerator = self._download_webpage(script_url, video_id, 'Download javascript player generator') | ||||
|         json_url = self._search_regex(r"json_url=(.*)&rendering_place.*", javascriptPlayerGenerator, 'json url') | ||||
|         return self._extract_from_json_url(json_url, video_id, lang) | ||||
|  | ||||
|  | ||||
| class ArteTVConcertIE(ArteTVPlus7IE): | ||||
|     IE_NAME = 'arte.tv:concert' | ||||
|     _VALID_URL = r'https?://concert\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>[^/?#&]+)' | ||||
|  | ||||
|     _TESTS = [{ | ||||
|         'url': 'http://concert.arte.tv/de/notwist-im-pariser-konzertclub-divan-du-monde', | ||||
|         'md5': '9ea035b7bd69696b67aa2ccaaa218161', | ||||
|         'info_dict': { | ||||
|             'id': '186', | ||||
|             'ext': 'mp4', | ||||
|             'title': 'The Notwist im Pariser Konzertclub "Divan du Monde"', | ||||
|             'upload_date': '20140128', | ||||
|             'description': 'md5:486eb08f991552ade77439fe6d82c305', | ||||
|         }, | ||||
|     }] | ||||
|  | ||||
|  | ||||
| class ArteTVCinemaIE(ArteTVPlus7IE): | ||||
|     IE_NAME = 'arte.tv:cinema' | ||||
|     _VALID_URL = r'https?://cinema\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>.+)' | ||||
|  | ||||
|     _TESTS = [{ | ||||
|         'url': 'http://cinema.arte.tv/fr/article/les-ailes-du-desir-de-julia-reck', | ||||
|         'md5': 'a5b9dd5575a11d93daf0e3f404f45438', | ||||
|         'info_dict': { | ||||
|             'id': '062494-000-A', | ||||
|             'ext': 'mp4', | ||||
|             'title': 'Film lauréat du concours web - "Les ailes du désir" de Julia Reck', | ||||
|             'upload_date': '20150807', | ||||
|         }, | ||||
|     }] | ||||
|  | ||||
|  | ||||
| class ArteTVMagazineIE(ArteTVPlus7IE): | ||||
|     IE_NAME = 'arte.tv:magazine' | ||||
|     _VALID_URL = r'https?://(?:www\.)?arte\.tv/magazine/[^/]+/(?P<lang>fr|de|en|es)/(?P<id>[^/?#&]+)' | ||||
|  | ||||
|     _TESTS = [{ | ||||
|         # Embedded via <iframe src="http://www.arte.tv/arte_vp/index.php?json_url=..." | ||||
|         'url': 'http://www.arte.tv/magazine/trepalium/fr/entretien-avec-le-realisateur-vincent-lannoo-trepalium', | ||||
|         'md5': '2a9369bcccf847d1c741e51416299f25', | ||||
|         'info_dict': { | ||||
|             'id': '065965-000-A', | ||||
|             'ext': 'mp4', | ||||
|             'title': 'Trepalium - Extrait Ep.01', | ||||
|             'upload_date': '20160121', | ||||
|         }, | ||||
|     }, { | ||||
|         # Embedded via <iframe src="http://www.arte.tv/guide/fr/embed/054813-004-A/medium" | ||||
|         'url': 'http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium', | ||||
|         'md5': 'fedc64fc7a946110fe311634e79782ca', | ||||
|         'info_dict': { | ||||
|             'id': '054813-004_PLUS7-F', | ||||
|             'ext': 'mp4', | ||||
|             'title': 'Trepalium (4/6)', | ||||
|             'description': 'md5:10057003c34d54e95350be4f9b05cb40', | ||||
|             'upload_date': '20160218', | ||||
|         }, | ||||
|     }, { | ||||
|         'url': 'http://www.arte.tv/magazine/metropolis/de/frank-woeste-german-paris-metropolis', | ||||
|         'only_matching': True, | ||||
|     }] | ||||
|         lang, video_id = re.match(self._VALID_URL, url).groups() | ||||
|         return self._extract_from_json_url( | ||||
|             'https://api.arte.tv/api/player/v1/config/%s/%s' % (lang, video_id), | ||||
|             video_id, lang) | ||||
|  | ||||
|  | ||||
| class ArteTVEmbedIE(ArteTVPlus7IE): | ||||
|     IE_NAME = 'arte.tv:embed' | ||||
|     _VALID_URL = r'''(?x) | ||||
|         http://www\.arte\.tv | ||||
|         /(?:playerv2/embed|arte_vp/index)\.php\?json_url= | ||||
|         https://www\.arte\.tv | ||||
|         /player/v3/index\.php\?json_url= | ||||
|         (?P<json_url> | ||||
|             http://arte\.tv/papi/tvguide/videos/stream/player/ | ||||
|             (?P<lang>[^/]+)/(?P<id>[^/]+)[^&]* | ||||
|             https?://api\.arte\.tv/api/player/v1/config/ | ||||
|             (?P<lang>[^/]+)/(?P<id>\d{6}-\d{3}-[AF]) | ||||
|         ) | ||||
|     ''' | ||||
|  | ||||
|     _TESTS = [] | ||||
|  | ||||
|     def _real_extract(self, url): | ||||
|         mobj = re.match(self._VALID_URL, url) | ||||
|         video_id = mobj.group('id') | ||||
|         lang = mobj.group('lang') | ||||
|         json_url = mobj.group('json_url') | ||||
|         json_url, lang, video_id = re.match(self._VALID_URL, url).groups() | ||||
|         return self._extract_from_json_url(json_url, video_id, lang) | ||||
|  | ||||
|  | ||||
| class TheOperaPlatformIE(ArteTVPlus7IE): | ||||
|     IE_NAME = 'theoperaplatform' | ||||
|     _VALID_URL = r'https?://(?:www\.)?theoperaplatform\.eu/(?P<lang>fr|de|en|es)/(?P<id>[^/?#&]+)' | ||||
|  | ||||
|     _TESTS = [{ | ||||
|         'url': 'http://www.theoperaplatform.eu/de/opera/verdi-otello', | ||||
|         'md5': '970655901fa2e82e04c00b955e9afe7b', | ||||
|         'info_dict': { | ||||
|             'id': '060338-009-A', | ||||
|             'ext': 'mp4', | ||||
|             'title': 'Verdi - OTELLO', | ||||
|             'upload_date': '20160927', | ||||
|         }, | ||||
|     }] | ||||
|  | ||||
|  | ||||
| class ArteTVPlaylistIE(ArteTVBaseIE): | ||||
|     IE_NAME = 'arte.tv:playlist' | ||||
|     _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/[^#]*#collection/(?P<id>PL-\d+)' | ||||
|     _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>fr|de|en|es|it|pl)/videos/(?P<id>RC-\d{6})' | ||||
|  | ||||
|     _TESTS = [{ | ||||
|         'url': 'http://www.arte.tv/guide/de/plus7/?country=DE#collection/PL-013263/ARTETV', | ||||
|         'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/', | ||||
|         'info_dict': { | ||||
|             'id': 'PL-013263', | ||||
|             'title': 'Areva & Uramin', | ||||
|             'description': 'md5:a1dc0312ce357c262259139cfd48c9bf', | ||||
|             'id': 'RC-016954', | ||||
|             'title': 'Earn a Living', | ||||
|             'description': 'md5:d322c55011514b3a7241f7fb80d494c2', | ||||
|         }, | ||||
|         'playlist_mincount': 6, | ||||
|     }, { | ||||
|         'url': 'http://www.arte.tv/guide/de/playlists?country=DE#collection/PL-013190/ARTETV', | ||||
|         'only_matching': True, | ||||
|     }] | ||||
|  | ||||
|     def _real_extract(self, url): | ||||
|         playlist_id, lang = self._extract_url_info(url) | ||||
|         lang, playlist_id = re.match(self._VALID_URL, url).groups() | ||||
|         collection = self._download_json( | ||||
|             'https://api.arte.tv/api/player/v1/collectionData/%s/%s?source=videos' | ||||
|             % (lang, playlist_id), playlist_id) | ||||
|   | ||||
| @@ -58,17 +58,8 @@ from .ard import ( | ||||
|     ARDMediathekIE, | ||||
| ) | ||||
| from .arte import ( | ||||
|     ArteTvIE, | ||||
|     ArteTVPlus7IE, | ||||
|     ArteTVCreativeIE, | ||||
|     ArteTVConcertIE, | ||||
|     ArteTVInfoIE, | ||||
|     ArteTVFutureIE, | ||||
|     ArteTVCinemaIE, | ||||
|     ArteTVDDCIE, | ||||
|     ArteTVMagazineIE, | ||||
|     ArteTVEmbedIE, | ||||
|     TheOperaPlatformIE, | ||||
|     ArteTVPlaylistIE, | ||||
| ) | ||||
| from .asiancrush import ( | ||||
|   | ||||
		Reference in New Issue
	
	Block a user