[wistia] Add extractor

2013-12-06 09:15:04 +01:00
parent 72135030d1
commit ef4fd84857
4 changed files with 80 additions and 10 deletions
@@ -488,7 +488,8 @@ class YoutubeDL(object):
                new_result = ie_result.copy()
                for f in ('_type', 'url', 'ext', 'player_url', 'formats',
                          'entries', 'urlhandle', 'ie_key', 'duration',
-                          'subtitles', 'annotations', 'format'):
+                          'subtitles', 'annotations', 'format',
                          'thumbnail', 'thumbnails'):
                    if f in new_result:
                        del new_result[f]
                    if f in embedded_info:
@@ -178,6 +178,7 @@ from .wat import WatIE
 from .websurg import WeBSurgIE
 from .weibo import WeiboIE
 from .wimp import WimpIE
 from .wistia import WistiaIE
 from .worldstarhiphop import WorldStarHipHopIE
 from .xhamster import XHamsterIE
 from .xnxx import XNXXIE
@@ -169,8 +169,13 @@ class GenericIE(InfoExtractor):
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
-        video_title = self._html_search_regex(r'<title>(.*)</title>',
+        video_title = self._html_search_regex(
-            webpage, u'video title', default=u'video', flags=re.DOTALL)
+            r'(?s)<title>(.*?)</title>', webpage, u'video title',
            default=u'video')
        # video uploader is domain name
        video_uploader = self._search_regex(
            r'^(?:https?://)?([^/]*)/.*', url, u'video uploader')
        # Look for BrightCove:
        bc_url = BrightcoveIE._extract_brightcove_url(webpage)
@@ -188,7 +193,7 @@ class GenericIE(InfoExtractor):
        # Look for embedded YouTube player
        matches = re.findall(
-            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?youtube.com/embed/.+?)\1', webpage)
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?youtube\.com/embed/.+?)\1', webpage)
        if matches:
            urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Youtube')
                     for tuppl in matches]
@@ -197,13 +202,26 @@ class GenericIE(InfoExtractor):
        # Look for embedded Dailymotion player
        matches = re.findall(
-            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion.com/embed/video/.+?)\1', webpage)
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
        if matches:
            urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Dailymotion')
                     for tuppl in matches]
            return self.playlist_result(
                urlrs, playlist_id=video_id, playlist_title=video_title)
        # Look for embedded Wistia player
        match = re.search(
            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
        if match:
            return {
                '_type': 'url_transparent',
                'url': unescapeHTML(match.group('url')),
                'ie_key': 'Wistia',
                'uploader': video_uploader,
                'title': video_title,
                'id': video_id,
            }
        # Look for Bandcamp pages with custom domain
        mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
        if mobj is not None:
@@ -247,14 +265,9 @@ class GenericIE(InfoExtractor):
        # here's a fun little line of code for you:
        video_id = os.path.splitext(video_id)[0]
        # video uploader is domain name
        video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
            url, u'video uploader')
        return {
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
        }
@@ -0,0 +1,55 @@
 import json
 import re
 from .common import InfoExtractor
 class WistiaIE(InfoExtractor):
    """Information extractor for Wistia iframe embed pages.

    Handles URLs of the form ``http(s)://[fast.]wistia.net/embed/iframe/<id>``
    and reads the media description passed to ``Wistia.iframeInit(...)`` in
    the embed page.
    """
    _VALID_URL = r'^https?://(?:fast\.)?wistia\.net/embed/iframe/(?P<id>[a-z0-9]+)'
    _TEST = {
        u"url": u"http://fast.wistia.net/embed/iframe/sh7fpupwlt",
        u"file": u"sh7fpupwlt.mov",
        u"md5": u"cafeb56ec0c53c18c97405eecb3133df",
        u"info_dict": {
            u"title": u"cfh_resourceful_zdkh_final_1"
        },
    }
    def _real_extract(self, url):
        """Build the info dict for the Wistia embed at *url*.

        Returns a dict with the keys ``id``, ``title``, ``formats`` (sorted by
        ascending file size, unknown sizes first) and ``thumbnails``.

        Raises KeyError if the embedded JSON lacks ``assets`` or ``name``, or
        if an asset lacks ``url`` (or, for a format, ``ext``).
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        # The dot is escaped so that only the literal call
        # "Wistia.iframeInit(" is matched, not any character in its place.
        # NOTE(review): _html_search_regex presumably post-processes the match
        # as HTML; confirm that this cannot alter the JSON payload.
        data_json = self._html_search_regex(
            r'Wistia\.iframeInit\((.*?), {}\);', webpage, u'video data')
        data = json.loads(data_json)
        formats = []
        thumbnails = []
        for atype, asset in data['assets'].items():
            if atype == 'still':
                thumbnail = {'url': asset['url']}
                width = asset.get('width')
                height = asset.get('height')
                # Only advertise a resolution when both dimensions are known.
                if width is not None and height is not None:
                    thumbnail['resolution'] = '%dx%d' % (width, height)
                thumbnails.append(thumbnail)
                continue
            if atype == 'preview':
                # Preview assets are neither offered as a format nor as a
                # thumbnail.
                continue
            # Dimensions and size are treated as optional so that one asset
            # without them does not abort the whole extraction.
            formats.append({
                'format_id': atype,
                'url': asset['url'],
                'width': asset.get('width'),
                'height': asset.get('height'),
                'filesize': asset.get('size'),
                'ext': asset['ext'],
            })
        # Smallest file first; a missing size sorts as 0 because None cannot
        # be ordered against integers on Python 3.
        formats.sort(key=lambda f: f['filesize'] or 0)
        return {
            'id': video_id,
            'title': data['name'],
            'formats': formats,
            'thumbnails': thumbnails,
        }