[appletrailers] Rework extraction (fixes #1387)
The extraction was broken:
* The includes page contains img elements that need to be fixed.
* Use the 'itunes.inc' page; it contains a JSON dictionary with information for each trailer.
* Get the formats from 'includes/settings/{trailer_name}.json'
* Use urljoin so that URLs with a fragment identifier work
Removed the thumbnail URLs from the tests; they are different now.
			
			
This commit is contained in:
		| @@ -1,8 +1,10 @@ | ||||
| import re | ||||
| import xml.etree.ElementTree | ||||
| import json | ||||
|  | ||||
| from .common import InfoExtractor | ||||
| from ..utils import ( | ||||
|     compat_urlparse, | ||||
|     determine_ext, | ||||
| ) | ||||
|  | ||||
| @@ -14,10 +16,9 @@ class AppleTrailersIE(InfoExtractor): | ||||
|         u"playlist": [ | ||||
|             { | ||||
|                 u"file": u"manofsteel-trailer4.mov", | ||||
|                 u"md5": u"11874af099d480cc09e103b189805d5f", | ||||
|                 u"md5": u"d97a8e575432dbcb81b7c3acb741f8a8", | ||||
|                 u"info_dict": { | ||||
|                     u"duration": 111, | ||||
|                     u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_11624.jpg", | ||||
|                     u"title": u"Trailer 4", | ||||
|                     u"upload_date": u"20130523", | ||||
|                     u"uploader_id": u"wb", | ||||
| @@ -25,10 +26,9 @@ class AppleTrailersIE(InfoExtractor): | ||||
|             }, | ||||
|             { | ||||
|                 u"file": u"manofsteel-trailer3.mov", | ||||
|                 u"md5": u"07a0a262aae5afe68120eed61137ab34", | ||||
|                 u"md5": u"b8017b7131b721fb4e8d6f49e1df908c", | ||||
|                 u"info_dict": { | ||||
|                     u"duration": 182, | ||||
|                     u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_10793.jpg", | ||||
|                     u"title": u"Trailer 3", | ||||
|                     u"upload_date": u"20130417", | ||||
|                     u"uploader_id": u"wb", | ||||
| @@ -36,10 +36,9 @@ class AppleTrailersIE(InfoExtractor): | ||||
|             }, | ||||
|             { | ||||
|                 u"file": u"manofsteel-trailer.mov", | ||||
|                 u"md5": u"e401fde0813008e3307e54b6f384cff1", | ||||
|                 u"md5": u"d0f1e1150989b9924679b441f3404d48", | ||||
|                 u"info_dict": { | ||||
|                     u"duration": 148, | ||||
|                     u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_8703.jpg", | ||||
|                     u"title": u"Trailer", | ||||
|                     u"upload_date": u"20121212", | ||||
|                     u"uploader_id": u"wb", | ||||
| @@ -47,10 +46,9 @@ class AppleTrailersIE(InfoExtractor): | ||||
|             }, | ||||
|             { | ||||
|                 u"file": u"manofsteel-teaser.mov", | ||||
|                 u"md5": u"76b392f2ae9e7c98b22913c10a639c97", | ||||
|                 u"md5": u"5fe08795b943eb2e757fa95cb6def1cb", | ||||
|                 u"info_dict": { | ||||
|                     u"duration": 93, | ||||
|                     u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_6899.jpg", | ||||
|                     u"title": u"Teaser", | ||||
|                     u"upload_date": u"20120721", | ||||
|                     u"uploader_id": u"wb", | ||||
| @@ -59,87 +57,61 @@ class AppleTrailersIE(InfoExtractor): | ||||
|         ] | ||||
|     } | ||||
|  | ||||
|     _JSON_RE = r'iTunes.playURL\((.*?)\);' | ||||
|  | ||||
|     def _real_extract(self, url): | ||||
|         mobj = re.match(self._VALID_URL, url) | ||||
|         movie = mobj.group('movie') | ||||
|         uploader_id = mobj.group('company') | ||||
|  | ||||
|         playlist_url = url.partition(u'?')[0] + u'/includes/playlists/web.inc' | ||||
|         playlist_url = compat_urlparse.urljoin(url, u'includes/playlists/itunes.inc') | ||||
|         playlist_snippet = self._download_webpage(playlist_url, movie) | ||||
|         playlist_cleaned = re.sub(r'(?s)<script>.*?</script>', u'', playlist_snippet) | ||||
|         playlist_cleaned = re.sub(r'(?s)<script[^<]*?>.*?</script>', u'', playlist_snippet) | ||||
|         playlist_cleaned = re.sub(r'<img ([^<]*?)>', r'<img \1/>', playlist_cleaned) | ||||
|         # The ' in the onClick attributes are not escaped, it couldn't be parsed | ||||
|         # with xml.etree.ElementTree.fromstring | ||||
|         # like: http://trailers.apple.com/trailers/wb/gravity/ | ||||
|         def _clean_json(m): | ||||
|             return u'iTunes.playURL(%s);' % m.group(1).replace('\'', ''') | ||||
|         playlist_cleaned = re.sub(self._JSON_RE, _clean_json, playlist_cleaned) | ||||
|         playlist_html = u'<html>' + playlist_cleaned + u'</html>' | ||||
|  | ||||
|         size_cache = {} | ||||
|  | ||||
|         doc = xml.etree.ElementTree.fromstring(playlist_html) | ||||
|         playlist = [] | ||||
|         for li in doc.findall('./div/ul/li'): | ||||
|             title = li.find('.//h3').text | ||||
|             on_click = li.find('.//a').attrib['onClick'] | ||||
|             trailer_info_json = self._search_regex(self._JSON_RE, | ||||
|                 on_click, u'trailer info') | ||||
|             trailer_info = json.loads(trailer_info_json) | ||||
|             title = trailer_info['title'] | ||||
|             video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower() | ||||
|             thumbnail = li.find('.//img').attrib['src'] | ||||
|             upload_date = trailer_info['posted'].replace('-', '') | ||||
|  | ||||
|             date_el = li.find('.//p') | ||||
|             upload_date = None | ||||
|             m = re.search(r':\s?(?P<month>[0-9]{2})/(?P<day>[0-9]{2})/(?P<year>[0-9]{2})', date_el.text) | ||||
|             if m: | ||||
|                 upload_date = u'20' + m.group('year') + m.group('month') + m.group('day') | ||||
|             runtime_el = date_el.find('./br') | ||||
|             m = re.search(r':\s?(?P<minutes>[0-9]+):(?P<seconds>[0-9]{1,2})', runtime_el.tail) | ||||
|             runtime = trailer_info['runtime'] | ||||
|             m = re.search(r'(?P<minutes>[0-9]+):(?P<seconds>[0-9]{1,2})', runtime) | ||||
|             duration = None | ||||
|             if m: | ||||
|                 duration = 60 * int(m.group('minutes')) + int(m.group('seconds')) | ||||
|  | ||||
|             first_url = trailer_info['url'] | ||||
|             trailer_id = first_url.split('/')[-1].rpartition('_')[0] | ||||
|             settings_json_url = compat_urlparse.urljoin(url, 'includes/settings/%s.json' % trailer_id) | ||||
|             settings_json = self._download_webpage(settings_json_url, trailer_id, u'Downloading settings json') | ||||
|             settings = json.loads(settings_json) | ||||
|  | ||||
|             formats = [] | ||||
|             for formats_el in li.findall('.//a'): | ||||
|                 if formats_el.attrib['class'] != 'OverlayPanel': | ||||
|                     continue | ||||
|                 target = formats_el.attrib['target'] | ||||
|  | ||||
|                 format_code = formats_el.text | ||||
|                 if 'Automatic' in format_code: | ||||
|                     continue | ||||
|  | ||||
|                 size_q = formats_el.attrib['href'] | ||||
|                 size_id = size_q.rpartition('#videos-')[2] | ||||
|                 if size_id not in size_cache: | ||||
|                     size_url = url + size_q | ||||
|                     sizepage_html = self._download_webpage( | ||||
|                         size_url, movie, | ||||
|                         note=u'Downloading size info %s' % size_id, | ||||
|                         errnote=u'Error while downloading size info %s' % size_id, | ||||
|                     ) | ||||
|                     _doc = xml.etree.ElementTree.fromstring(sizepage_html) | ||||
|                     size_cache[size_id] = _doc | ||||
|  | ||||
|                 sizepage_doc = size_cache[size_id] | ||||
|                 links = sizepage_doc.findall('.//{http://www.w3.org/1999/xhtml}ul/{http://www.w3.org/1999/xhtml}li/{http://www.w3.org/1999/xhtml}a') | ||||
|                 for vid_a in links: | ||||
|                     href = vid_a.get('href') | ||||
|                     if not href.endswith(target): | ||||
|                         continue | ||||
|                     detail_q = href.partition('#')[0] | ||||
|                     detail_url = url + '/' + detail_q | ||||
|  | ||||
|                     m = re.match(r'includes/(?P<detail_id>[^/]+)/', detail_q) | ||||
|                     detail_id = m.group('detail_id') | ||||
|  | ||||
|                     detail_html = self._download_webpage( | ||||
|                         detail_url, movie, | ||||
|                         note=u'Downloading detail %s %s' % (detail_id, size_id), | ||||
|                         errnote=u'Error while downloading detail %s %s' % (detail_id, size_id) | ||||
|                     ) | ||||
|                     detail_doc = xml.etree.ElementTree.fromstring(detail_html) | ||||
|                     movie_link_el = detail_doc.find('.//{http://www.w3.org/1999/xhtml}a') | ||||
|                     assert movie_link_el.get('class') == 'movieLink' | ||||
|                     movie_link = movie_link_el.get('href').partition('?')[0].replace('_', '_h') | ||||
|                     ext = determine_ext(movie_link) | ||||
|                     assert ext == 'mov' | ||||
|  | ||||
|                     formats.append({ | ||||
|                         'format': format_code, | ||||
|                         'ext': ext, | ||||
|                         'url': movie_link, | ||||
|                     }) | ||||
|             for format in settings['metadata']['sizes']: | ||||
|                 # The src is a file pointing to the real video file | ||||
|                 format_url = re.sub(r'_(\d*p.mov)', r'_h\1', format['src']) | ||||
|                 formats.append({ | ||||
|                     'url': format_url, | ||||
|                     'ext': determine_ext(format_url), | ||||
|                     'format': format['type'], | ||||
|                     'width': format['width'], | ||||
|                     'height': int(format['height']), | ||||
|                 }) | ||||
|             formats = sorted(formats, key=lambda f: (f['height'], f['width'])) | ||||
|  | ||||
|             info = { | ||||
|                 '_type': 'video', | ||||
|   | ||||
		Reference in New Issue
	
	Block a user