Compare commits

..

30 Commits

Author SHA1 Message Date
Philipp Hagemeister
898c23c03f release 2015.01.22 2015-01-22 12:04:26 +01:00
Philipp Hagemeister
b55ee18ff3 [hearthisat] Add support for more high-quality download links 2015-01-22 12:04:13 +01:00
Naglis Jonaitis
e5763a7a7e [hearthisat] Add new extractor (Closes #4743) 2015-01-21 21:47:55 +02:00
Sergey M․
8bb1bdfae9 [twitch:past_broadcasts] Fix IE_NAME 2015-01-21 23:06:16 +06:00
Sergey M․
c62b449765 Credit @yan12125 for streetvoice (#4758) 2015-01-21 22:56:28 +06:00
Sergey M․
bb0aa4cb3c [streetvoice] Improve 2015-01-21 22:53:51 +06:00
Sergey M.
d63528c8c7 Merge pull request #4758 from yan12125/IE_streetvoice
[StreetVoice] Add new extractor
2015-01-21 22:36:50 +06:00
Sergey M․
c5db6bb32b [twitch] Refactor and add support for past broadcasts 2015-01-21 22:27:21 +06:00
Yen Chi Hsuan
c8dc41a6e7 [StreetVoice] Add new extractor 2015-01-21 23:05:47 +08:00
Jaime Marquínez Ferrándiz
47e0e1e0e2 [nbc] Fix pep8 issue 2015-01-21 10:36:15 +01:00
Jaime Marquínez Ferrándiz
efcddaebe9 [cnn] Use edition.cnn.com for getting the information (fixes #4757)
Some videos (like http://edition.cnn.com/videos/us/2015/01/20/orig-yellowstone-oil-spill.cnn) will fail if we use cnn.com.
2015-01-21 10:31:57 +01:00
Jaime Marquínez Ferrándiz
5fe5112589 [CNNArticle] Update test 2015-01-21 10:27:18 +01:00
Sergey M․
564bb5e964 [tinypic] Tweak VALID_URL regex (Closes #4754) 2015-01-21 02:15:28 +06:00
Sergey M․
2df54b4ba8 [nbcnews] Ignore HTTP errors while coping with playlists (Closes #4749) 2015-01-20 21:23:51 +06:00
Sergey M․
030aa5d9e7 [tvp] Fix extraction 2015-01-19 23:00:22 +06:00
Philipp Hagemeister
c511f13f22 [ndtv] Modernize 2015-01-19 10:10:05 +01:00
Sergey M․
fdb2ed7455 [abc7news] Add extractor (Closes #4734) 2015-01-18 08:09:18 +06:00
Philipp Hagemeister
ba319696a9 [options] Clarify that --password can be left out (#4723) 2015-01-17 23:56:34 +01:00
Philipp Hagemeister
910c552052 release 2015.01.16 2015-01-16 14:20:38 +01:00
Philipp Hagemeister
cce81f192c [bandcamp:album] Fix title extraction (Fixes #4721) 2015-01-16 14:20:25 +01:00
Philipp Hagemeister
9d22a7dfb0 [fourtube] Fix extraction 2015-01-16 13:44:44 +01:00
Philipp Hagemeister
4f4f642822 [npo] Remove unused import 2015-01-16 13:44:36 +01:00
Jaime Marquínez Ferrándiz
2875cf01bb FFmpegEmbedSubtitlePP: simplify command 2015-01-16 13:37:37 +01:00
Jaime Marquínez Ferrándiz
e205db3bcd FFmpegEmbedSubtitlePP: don't fail if the video doesn't have an audio stream (fixes #4718)
Instead of specifying which streams ffmpeg must copy, we tell it to copy all.
2015-01-16 13:29:01 +01:00
Philipp Hagemeister
31d4a6e212 release 2015.01.15.1 2015-01-15 22:38:11 +01:00
Sergey M․
aaeb86f682 [youtube] Add test for #4706 2015-01-16 01:25:03 +06:00
Sergey M.
9fa6ea2680 Merge pull request #4706 from pkulak/master
Fix Youtube encrypted sigs.
2015-01-16 01:12:50 +06:00
Phil Kulak
a9b6b5cd15 Looks like Google switched to a new JS compiler that includes dollar signs in function names. 2015-01-15 10:23:05 -08:00
Naglis Jonaitis
a45c0a5d67 [videomega] Fix extraction (Closes #4703) 2015-01-15 19:57:36 +02:00
Sergey M․
c8dfe360eb [atresplayer] Add authentication support (Closes #4700) 2015-01-15 21:43:35 +06:00
22 changed files with 616 additions and 262 deletions

View File

@@ -103,3 +103,4 @@ Christopher Krooss
Ondřej Caletka
Dinesh S
Johan K. Jensen
Yen Chi Hsuan

View File

@@ -304,7 +304,8 @@ which means you can modify it, redistribute it or use it however you like.
## Authentication Options:
-u, --username USERNAME login with this account ID
-p, --password PASSWORD account password
-p, --password PASSWORD account password. If this option is left
out, youtube-dl will ask interactively.
-2, --twofactor TWOFACTOR two-factor auth code
-n, --netrc use .netrc authentication data
--video-password PASSWORD video password (vimeo, smotri)

View File

@@ -1,6 +1,7 @@
from __future__ import unicode_literals
from .abc import ABCIE
from .abc7news import Abc7NewsIE
from .academicearth import AcademicEarthCourseIE
from .addanime import AddAnimeIE
from .adobetv import AdobeTVIE
@@ -175,6 +176,7 @@ from .goshgay import GoshgayIE
from .grooveshark import GroovesharkIE
from .groupon import GrouponIE
from .hark import HarkIE
from .hearthisat import HearThisAtIE
from .heise import HeiseIE
from .hellporno import HellPornoIE
from .helsinki import HelsinkiIE
@@ -408,6 +410,7 @@ from .stanfordoc import StanfordOpenClassroomIE
from .steam import SteamIE
from .streamcloud import StreamcloudIE
from .streamcz import StreamCZIE
from .streetvoice import StreetVoiceIE
from .sunporno import SunPornoIE
from .swrmediathek import SWRMediathekIE
from .syfy import SyfyIE
@@ -457,7 +460,13 @@ from .tvigle import TvigleIE
from .tvp import TvpIE, TvpSeriesIE
from .tvplay import TVPlayIE
from .twentyfourvideo import TwentyFourVideoIE
from .twitch import TwitchIE
from .twitch import (
TwitchVideoIE,
TwitchChapterIE,
TwitchVodIE,
TwitchProfileIE,
TwitchPastBroadcastsIE,
)
from .ubu import UbuIE
from .udemy import (
UdemyIE,

View File

@@ -0,0 +1,68 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import parse_iso8601
class Abc7NewsIE(InfoExtractor):
    """Information extractor for video pages on abc7news.com.

    Accepts both the long article form
    (``/<section>/<display_id>/<id>``) and the short form (``/<id>``).
    """

    _VALID_URL = r'https?://abc7news\.com(?:/[^/]+/(?P<display_id>[^/]+))?/(?P<id>\d+)'
    _TESTS = [
        {
            'url': 'http://abc7news.com/entertainment/east-bay-museum-celebrates-vintage-synthesizers/472581/',
            'info_dict': {
                'id': '472581',
                'display_id': 'east-bay-museum-celebrates-vintage-synthesizers',
                'ext': 'mp4',
                'title': 'East Bay museum celebrates history of synthesized music',
                'description': 'md5:a4f10fb2f2a02565c1749d4adbab4b10',
                # Raw string: '\.' is not a valid escape in a normal literal.
                'thumbnail': r're:^https?://.*\.jpg$',
                'timestamp': 1421123075,
                'upload_date': '20150113',
                'uploader': 'Jonathan Bloom',
            },
            'params': {
                # m3u8 download
                'skip_download': True,
            },
        },
        {
            'url': 'http://abc7news.com/472581',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Return the info dict for the video at ``url``.

        The m3u8 manifest URL is read from the page's ``contentURL`` meta
        tag and is mandatory; title comes from OpenGraph data; description,
        thumbnail, timestamp and uploader are best-effort.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        # Short URLs carry no slug, so fall back to the numeric id.
        display_id = mobj.group('display_id') or video_id

        webpage = self._download_webpage(url, display_id)

        m3u8 = self._html_search_meta(
            'contentURL', webpage, 'm3u8 url', fatal=True)

        formats = self._extract_m3u8_formats(m3u8, display_id, 'mp4')
        self._sort_formats(formats)

        title = self._og_search_title(webpage).strip()
        # The description is optional metadata: tolerate a page without
        # og:description instead of calling .strip() on a missing value.
        description = self._og_search_description(webpage)
        if description:
            description = description.strip()
        thumbnail = self._og_search_thumbnail(webpage)
        timestamp = parse_iso8601(self._search_regex(
            r'<div class="meta">\s*<time class="timeago" datetime="([^"]+)">',
            webpage, 'upload date', fatal=False))
        uploader = self._search_regex(
            r'rel="author">([^<]+)</a>',
            webpage, 'uploader', default=None)

        return {
            'id': video_id,
            'display_id': display_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'timestamp': timestamp,
            'uploader': uploader,
            'formats': formats,
        }

View File

@@ -4,9 +4,12 @@ import time
import hmac
from .common import InfoExtractor
from ..utils import (
from ..compat import (
compat_str,
compat_urllib_parse,
compat_urllib_request,
)
from ..utils import (
int_or_none,
float_or_none,
xpath_text,
@@ -44,6 +47,33 @@ class AtresPlayerIE(InfoExtractor):
_PLAYER_URL_TEMPLATE = 'https://servicios.atresplayer.com/episode/getplayer.json?episodePk=%s'
_EPISODE_URL_TEMPLATE = 'http://www.atresplayer.com/episodexml/%s'
_LOGIN_URL = 'https://servicios.atresplayer.com/j_spring_security_check'
def _real_initialize(self):
    """Run the login step once, before any extraction takes place."""
    self._login()
def _login(self):
    """Submit the configured credentials to the login endpoint.

    Does nothing when no username is configured.

    Raises:
        ExtractorError: (expected) when the response page contains a
            ``list_error`` block; its text is included in the message.
    """
    username, password = self._get_login_info()
    if username is None:
        # No credentials supplied: skip the login request entirely.
        return

    post_data = compat_urllib_parse.urlencode({
        'j_username': username,
        'j_password': password,
    }).encode('utf-8')
    login_request = compat_urllib_request.Request(self._LOGIN_URL, post_data)
    login_request.add_header(
        'Content-Type', 'application/x-www-form-urlencoded')

    login_page = self._download_webpage(
        login_request, None, 'Logging in as %s' % username)

    # The site reports failures inside a <ul class="list_error"> element.
    login_error = self._html_search_regex(
        r'(?s)<ul class="list_error">(.+?)</ul>',
        login_page, 'error', default=None)
    if not login_error:
        return
    raise ExtractorError(
        'Unable to login: %s' % login_error, expected=True)
def _real_extract(self, url):
video_id = self._match_id(url)

View File

@@ -161,7 +161,8 @@ class BandcampAlbumIE(InfoExtractor):
entries = [
self.url_result(compat_urlparse.urljoin(url, t_path), ie=BandcampIE.ie_key())
for t_path in tracks_paths]
title = self._search_regex(r'album_title : "(.*?)"', webpage, 'title')
title = self._search_regex(
r'album_title\s*:\s*"(.*?)"', webpage, 'title', fatal=False)
return {
'_type': 'playlist',
'id': playlist_id,

View File

@@ -51,7 +51,7 @@ class CNNIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
path = mobj.group('path')
page_title = mobj.group('title')
info_url = 'http://cnn.com/video/data/3.0/%s/index.xml' % path
info_url = 'http://edition.cnn.com/video/data/3.0/%s/index.xml' % path
info = self._download_xml(info_url, page_title)
formats = []
@@ -143,13 +143,13 @@ class CNNArticleIE(InfoExtractor):
_VALID_URL = r'https?://(?:(?:edition|www)\.)?cnn\.com/(?!video/)'
_TEST = {
'url': 'http://www.cnn.com/2014/12/21/politics/obama-north-koreas-hack-not-war-but-cyber-vandalism/',
'md5': '275b326f85d80dff7592a9820f5dc887',
'md5': '689034c2a3d9c6dc4aa72d65a81efd01',
'info_dict': {
'id': 'bestoftv/2014/12/21/sotu-crowley-president-obama-north-korea-not-going-to-be-intimidated.cnn',
'id': 'bestoftv/2014/12/21/ip-north-korea-obama.cnn',
'ext': 'mp4',
'title': 'Obama: We\'re not going to be intimidated',
'description': 'md5:e735586f3dc936075fa654a4d91b21f9',
'upload_date': '20141220',
'title': 'Obama: Cyberattack not an act of war',
'description': 'md5:51ce6750450603795cad0cdfbd7d05c5',
'upload_date': '20141221',
},
'add_ie': ['CNN'],
}

View File

@@ -7,10 +7,9 @@ from ..compat import (
compat_urllib_request,
)
from ..utils import (
clean_html,
parse_duration,
parse_iso8601,
str_to_int,
unified_strdate,
)
@@ -28,68 +27,81 @@ class FourTubeIE(InfoExtractor):
'uploader': 'WCP Club',
'uploader_id': 'wcp-club',
'upload_date': '20131031',
'timestamp': 1383263892,
'duration': 583,
'view_count': int,
'like_count': int,
'categories': list,
}
}
def _real_extract(self, url):
video_id = self._match_id(url)
webpage_url = 'http://www.4tube.com/videos/' + video_id
webpage = self._download_webpage(webpage_url, video_id)
webpage = self._download_webpage(url, video_id)
self.report_extraction(video_id)
title = self._html_search_meta('name', webpage)
timestamp = parse_iso8601(self._html_search_meta(
'uploadDate', webpage))
thumbnail = self._html_search_meta('thumbnailUrl', webpage)
uploader_id = self._html_search_regex(
r'<a class="img-avatar" href="[^"]+/channels/([^/"]+)" title="Go to [^"]+ page">',
webpage, 'uploader id')
uploader = self._html_search_regex(
r'<a class="img-avatar" href="[^"]+/channels/[^/"]+" title="Go to ([^"]+) page">',
webpage, 'uploader')
playlist_json = self._html_search_regex(r'var playerConfigPlaylist\s+=\s+([^;]+)', webpage, 'Playlist')
media_id = self._search_regex(r'idMedia:\s*(\d+)', playlist_json, 'Media Id')
sources = self._search_regex(r'sources:\s*\[([^\]]*)\]', playlist_json, 'Sources').split(',')
title = self._search_regex(r'title:\s*"([^"]*)', playlist_json, 'Title')
thumbnail_url = self._search_regex(r'image:\s*"([^"]*)', playlist_json, 'Thumbnail', fatal=False)
categories_html = self._search_regex(
r'(?s)><i class="icon icon-tag"></i>\s*Categories / Tags\s*.*?<ul class="list">(.*?)</ul>',
webpage, 'categories', fatal=False)
categories = None
if categories_html:
categories = [
c.strip() for c in re.findall(
r'(?s)<li><a.*?>(.*?)</a>', categories_html)]
uploader_str = self._search_regex(r'<span>Uploaded by</span>(.*?)<span>', webpage, 'uploader', fatal=False)
mobj = re.search(r'<a href="/sites/(?P<id>[^"]+)"><strong>(?P<name>[^<]+)</strong></a>', uploader_str)
(uploader, uploader_id) = (mobj.group('name'), mobj.group('id')) if mobj else (clean_html(uploader_str), None)
view_count = str_to_int(self._search_regex(
r'<meta itemprop="interactionCount" content="UserPlays:([0-9,]+)">',
webpage, 'view count', fatal=False))
like_count = str_to_int(self._search_regex(
r'<meta itemprop="interactionCount" content="UserLikes:([0-9,]+)">',
webpage, 'like count', fatal=False))
duration = parse_duration(self._html_search_meta('duration', webpage))
upload_date = None
view_count = None
duration = None
description = self._html_search_meta('description', webpage, 'description')
if description:
upload_date = self._search_regex(r'Published Date: (\d{2} [a-zA-Z]{3} \d{4})', description, 'upload date',
fatal=False)
if upload_date:
upload_date = unified_strdate(upload_date)
view_count = self._search_regex(r'Views: ([\d,\.]+)', description, 'view count', fatal=False)
if view_count:
view_count = str_to_int(view_count)
duration = parse_duration(self._search_regex(r'Length: (\d+m\d+s)', description, 'duration', fatal=False))
params_js = self._search_regex(
r'\$\.ajax\(url,\ opts\);\s*\}\s*\}\)\(([0-9,\[\] ]+)\)',
webpage, 'initialization parameters'
)
params = self._parse_json('[%s]' % params_js, video_id)
media_id = params[0]
sources = ['%s' % p for p in params[2]]
token_url = "http://tkn.4tube.com/{0}/desktop/{1}".format(media_id, "+".join(sources))
token_url = 'http://tkn.4tube.com/{0}/desktop/{1}'.format(
media_id, '+'.join(sources))
headers = {
b'Content-Type': b'application/x-www-form-urlencoded',
b'Origin': b'http://www.4tube.com',
}
token_req = compat_urllib_request.Request(token_url, b'{}', headers)
tokens = self._download_json(token_req, video_id)
formats = [{
'url': tokens[format]['token'],
'format_id': format + 'p',
'resolution': format + 'p',
'quality': int(format),
} for format in sources]
self._sort_formats(formats)
return {
'id': video_id,
'title': title,
'formats': formats,
'thumbnail': thumbnail_url,
'categories': categories,
'thumbnail': thumbnail,
'uploader': uploader,
'uploader_id': uploader_id,
'upload_date': upload_date,
'timestamp': timestamp,
'like_count': like_count,
'view_count': view_count,
'duration': duration,
'age_limit': 18,
'webpage_url': webpage_url,
}

View File

@@ -0,0 +1,117 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import (
compat_urllib_request,
compat_urlparse,
)
from ..utils import (
HEADRequest,
str_to_int,
urlencode_postdata,
urlhandle_detect_ext,
)
class HearThisAtIE(InfoExtractor):
    """Information extractor for single tracks on hearthis.at.

    Matches ``hearthis.at/<artist>/<title>`` URLs. Track metadata comes
    from the site's playlist endpoint; counters, duration and timestamp
    are scraped from the track page.
    """

    _VALID_URL = r'https?://(?:www\.)?hearthis\.at/(?P<artist>[^/]+)/(?P<title>[A-Za-z0-9\-]+)/?$'
    _PLAYLIST_URL = 'https://hearthis.at/playlist.php'
    _TEST = {
        'url': 'https://hearthis.at/moofi/dr-kreep',
        'md5': 'ab6ec33c8fed6556029337c7885eb4e0',
        'info_dict': {
            'id': '150939',
            'ext': 'wav',
            'title': 'Moofi - Dr. Kreep',
            # Raw string: '\.' is not a valid escape in a normal literal.
            'thumbnail': r're:^https?://.*\.jpg$',
            'timestamp': 1421564134,
            'description': 'Creepy Patch. Mutable Instruments Braids Vowel + Formant Mode.',
            'upload_date': '20150118',
            'comment_count': int,
            'view_count': int,
            'like_count': int,
            'duration': 71,
            'categories': ['Experimental'],
        }
    }

    def _real_extract(self, url):
        """Return the info dict for the track at ``url``.

        Up to two formats are collected: the streaming mp3 advertised in
        the player link, and (when the page offers one) the direct
        download, whose extension is determined with a HEAD request.
        """
        m = re.match(self._VALID_URL, url)
        display_id = '{artist:s} - {title:s}'.format(**m.groupdict())

        webpage = self._download_webpage(url, display_id)
        track_id = self._search_regex(
            r'intTrackId\s*=\s*(\d+)', webpage, 'track ID')

        # Passing a body makes this a POST to the playlist endpoint; the
        # first element of the JSON reply is used as the track record.
        payload = urlencode_postdata({'tracks[]': track_id})
        req = compat_urllib_request.Request(self._PLAYLIST_URL, payload)
        req.add_header('Content-type', 'application/x-www-form-urlencoded')

        track = self._download_json(req, track_id, 'Downloading playlist')[0]
        title = '{artist:s} - {title:s}'.format(**track)

        categories = None
        if track.get('category'):
            categories = [track['category']]

        description = self._og_search_description(webpage)
        thumbnail = self._og_search_thumbnail(webpage)

        # Template for the counter <span>s; %s is the CSS class name.
        meta_span = r'<span[^>]+class="%s".*?</i>([^<]+)</span>'
        view_count = str_to_int(self._search_regex(
            meta_span % 'plays_count', webpage, 'view count', fatal=False))
        like_count = str_to_int(self._search_regex(
            meta_span % 'likes_count', webpage, 'like count', fatal=False))
        comment_count = str_to_int(self._search_regex(
            meta_span % 'comment_count', webpage, 'comment count', fatal=False))
        duration = str_to_int(self._search_regex(
            r'data-length="(\d+)', webpage, 'duration', fatal=False))
        timestamp = str_to_int(self._search_regex(
            r'<span[^>]+class="calctime"[^>]+data-time="(\d+)', webpage, 'timestamp', fatal=False))

        formats = []
        # Label the lookup as what it is, so a failed match is reported as
        # a missing mp3 URL rather than a missing title.
        mp3_url = self._search_regex(
            r'(?s)<a class="player-link"\s+(?:[a-zA-Z0-9_:-]+="[^"]+"\s+)*?data-mp3="([^"]+)"',
            webpage, 'mp3 URL', fatal=False)
        if mp3_url:
            formats.append({
                'format_id': 'mp3',
                'vcodec': 'none',
                'acodec': 'mp3',
                'url': mp3_url,
            })
        download_path = self._search_regex(
            r'<a class="[^"]*download_fct[^"]*"\s+href="([^"]+)"',
            webpage, 'download URL', default=None)
        if download_path:
            download_url = compat_urlparse.urljoin(url, download_path)
            # The download link does not reveal the file type, so ask the
            # server with a HEAD request and derive the extension from it.
            ext_req = HEADRequest(download_url)
            ext_handle = self._request_webpage(
                ext_req, display_id, note='Determining extension')
            ext = urlhandle_detect_ext(ext_handle)
            formats.append({
                'format_id': 'download',
                'vcodec': 'none',
                'ext': ext,
                'url': download_url,
                'preference': 2,  # Usually better quality
            })
        self._sort_formats(formats)

        return {
            'id': track_id,
            'display_id': display_id,
            'title': title,
            'formats': formats,
            'thumbnail': thumbnail,
            'description': description,
            'duration': duration,
            'timestamp': timestamp,
            'view_count': view_count,
            'comment_count': comment_count,
            'like_count': like_count,
            'categories': categories,
        }

View File

@@ -6,6 +6,7 @@ import json
from .common import InfoExtractor
from ..compat import (
compat_str,
compat_HTTPError,
)
from ..utils import (
ExtractorError,
@@ -78,6 +79,16 @@ class NBCNewsIE(InfoExtractor):
},
'add_ie': ['ThePlatform'],
},
{
'url': 'http://www.nbcnews.com/feature/dateline-full-episodes/full-episode-family-business-n285156',
'md5': 'fdbf39ab73a72df5896b6234ff98518a',
'info_dict': {
'id': 'Wjf9EDR3A_60',
'ext': 'mp4',
'title': 'FULL EPISODE: Family Business',
'description': 'md5:757988edbaae9d7be1d585eb5d55cc04',
},
},
]
def _real_extract(self, url):
@@ -115,10 +126,19 @@ class NBCNewsIE(InfoExtractor):
if not base_url:
continue
playlist_url = base_url + '?form=MPXNBCNewsAPI'
all_videos = self._download_json(playlist_url, title)['videos']
try:
info = next(v for v in all_videos if v['mpxId'] == mpxid)
all_videos = self._download_json(playlist_url, title)
except ExtractorError as ee:
if isinstance(ee.cause, compat_HTTPError):
continue
raise
if not all_videos or 'videos' not in all_videos:
continue
try:
info = next(v for v in all_videos['videos'] if v['mpxId'] == mpxid)
break
except StopIteration:
continue

View File

@@ -27,9 +27,7 @@ class NDTVIE(InfoExtractor):
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
filename = self._search_regex(

View File

@@ -1,7 +1,5 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
fix_xml_ampersands,

View File

@@ -0,0 +1,51 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import unified_strdate
class StreetVoiceIE(InfoExtractor):
    """Information extractor for songs hosted on streetvoice.com.

    Matches ``<user>/songs/<id>`` URLs on streetvoice.com and its
    subdomains, and reads all metadata from the site's song JSON API.
    """

    _VALID_URL = r'https?://(?:.+?\.)?streetvoice\.com/[^/]+/songs/(?P<id>[0-9]+)'
    _TESTS = [{
        'url': 'http://streetvoice.com/skippylu/songs/94440/',
        'md5': '15974627fc01a29e492c98593c2fd472',
        'info_dict': {
            'id': '94440',
            'ext': 'mp3',
            'filesize': 4167053,
            # Must agree with 'description', which the extractor builds as
            # '<uploader> - <title>'.
            'title': '輸',
            'description': 'Crispy脆樂團 - 輸',
            # Raw string: '\.' is not a valid escape in a normal literal.
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 260,
            'upload_date': '20091018',
            'uploader': 'Crispy脆樂團',
            'uploader_id': '627810',
        }
    }, {
        'url': 'http://tw.streetvoice.com/skippylu/songs/94440/',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the info dict for the song at ``url``.

        ``name``, ``file`` and ``musician`` are required keys of the API
        response; size, image, length and creation date are optional.
        """
        song_id = self._match_id(url)

        song = self._download_json(
            'http://streetvoice.com/music/api/song/%s' % song_id, song_id)

        title = song['name']
        author = song['musician']['name']

        return {
            'id': song_id,
            'url': song['file'],
            'filesize': song.get('size'),
            'title': title,
            'description': '%s - %s' % (author, title),
            'thumbnail': self._proto_relative_url(song.get('image'), 'http:'),
            'duration': song.get('length'),
            'upload_date': unified_strdate(song.get('created_at')),
            'uploader': author,
            'uploader_id': compat_str(song['musician']['id']),
        }

View File

@@ -9,17 +9,23 @@ from ..utils import ExtractorError
class TinyPicIE(InfoExtractor):
IE_NAME = 'tinypic'
IE_DESC = 'tinypic.com videos'
_VALID_URL = r'http://tinypic\.com/player\.php\?v=(?P<id>[^&]+)&s=\d+'
_VALID_URL = r'http://(?:.+?\.)?tinypic\.com/player\.php\?v=(?P<id>[^&]+)&s=\d+'
_TEST = {
'url': 'http://tinypic.com/player.php?v=6xw7tc%3E&s=5#.UtqZmbRFCM8',
'md5': '609b74432465364e72727ebc6203f044',
'info_dict': {
'id': '6xw7tc',
'ext': 'flv',
'title': 'shadow phenomenon weird',
_TESTS = [
{
'url': 'http://tinypic.com/player.php?v=6xw7tc%3E&s=5#.UtqZmbRFCM8',
'md5': '609b74432465364e72727ebc6203f044',
'info_dict': {
'id': '6xw7tc',
'ext': 'flv',
'title': 'shadow phenomenon weird',
},
},
{
'url': 'http://de.tinypic.com/player.php?v=dy90yh&s=8',
'only_matching': True,
}
}
]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)

View File

@@ -12,61 +12,59 @@ class TvpIE(InfoExtractor):
_TESTS = [{
'url': 'http://vod.tvp.pl/filmy-fabularne/filmy-za-darmo/ogniem-i-mieczem/wideo/odc-2/4278035',
'md5': 'cdd98303338b8a7f7abab5cd14092bf2',
'info_dict': {
'id': '4278035',
'ext': 'wmv',
'title': 'Ogniem i mieczem, odc. 2',
'description': 'Bohun dowiaduje się o złamaniu przez kniahinię danego mu słowa i wyrusza do Rozłogów. Helenie w ostatniej chwili udaje się uciec dzięki pomocy Zagłoby.',
},
}, {
'url': 'http://vod.tvp.pl/seriale/obyczajowe/czas-honoru/sezon-1-1-13/i-seria-odc-13/194536',
'md5': '8aa518c15e5cc32dfe8db400dc921fbb',
'info_dict': {
'id': '194536',
'ext': 'mp4',
'title': 'Czas honoru, I seria odc. 13',
# 'description': 'WŁADEK\nCzesław prosi Marię o dostarczenie Władkowi zarazki tyfusu. Jeśli zachoruje zostanie przewieziony do szpitala skąd łatwiej będzie go odbić. Czy matka zdecyduje się zarazić syna? Karol odwiedza Wandę przyznaje się, że ją oszukiwał, ale ostrzega też, że grozi jej aresztowanie i nalega, żeby wyjechała z Warszawy. Czy dziewczyna zdecyduje się znów oddalić od ukochanego? Rozpoczyna się akcja odbicia Władka.',
},
}, {
'url': 'http://www.tvp.pl/there-can-be-anything-so-i-shortened-it/17916176',
'md5': 'c3b15ed1af288131115ff17a17c19dda',
'info_dict': {
'id': '17916176',
'ext': 'mp4',
'title': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata',
},
'params': {
# m3u8 download
'skip_download': 'true',
},
}, {
'url': 'http://vod.tvp.pl/seriale/obyczajowe/na-sygnale/sezon-2-27-/odc-39/17834272',
'md5': 'c3b15ed1af288131115ff17a17c19dda',
'info_dict': {
'id': '17834272',
'ext': 'mp4',
'title': 'Na sygnale, odc. 39',
'description': 'Ekipa Wiktora ratuje młodą matkę, która spadła ze schodów trzymając na rękach noworodka. Okazuje się, że dziewczyna jest surogatką, a biologiczni rodzice dziecka próbują zmusić ją do oddania synka…',
},
'params': {
# m3u8 download
'skip_download': 'true',
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(
'http://www.tvp.pl/sess/tvplayer.php?object_id=%s' % video_id, video_id)
title = self._og_search_title(webpage)
series = self._search_regex(
r'{name:\s*([\'"])SeriesTitle\1,\s*value:\s*\1(?P<series>.*?)\1},',
title = self._search_regex(
r'name\s*:\s*([\'"])Title\1\s*,\s*value\s*:\s*\1(?P<title>.+?)\1',
webpage, 'title', group='title')
series_title = self._search_regex(
r'name\s*:\s*([\'"])SeriesTitle\1\s*,\s*value\s*:\s*\1(?P<series>.+?)\1',
webpage, 'series', group='series', default=None)
if series is not None and series not in title:
title = '%s, %s' % (series, title)
description = self._og_search_description(webpage, default=None)
if series_title:
title = '%s, %s' % (series_title, title)
thumbnail = self._search_regex(
r"poster\s*:\s*'([^']+)'", webpage, 'thumbnail', default=None)
video_url = self._search_regex(
r'0:{src:([\'"])(?P<url>.*?)\1', webpage, 'formats', group='url', default=None)
if video_url is None:
if not video_url:
video_url = self._download_json(
'http://www.tvp.pl/pub/stat/videofileinfo?video_id=%s' % video_id,
video_id)['video_url']
@@ -89,8 +87,7 @@ class TvpIE(InfoExtractor):
return {
'id': video_id,
'title': title,
'thumbnail': self._og_search_thumbnail(webpage),
'description': description,
'thumbnail': thumbnail,
'formats': formats,
}

View File

@@ -15,44 +15,11 @@ from ..utils import (
)
class TwitchIE(InfoExtractor):
# TODO: One broadcast may be split into multiple videos. The key
# 'broadcast_id' is the same for all parts, and 'broadcast_part'
# starts at 1 and increases. Can we treat all parts as one video?
_VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?twitch\.tv/
(?:
(?P<channelid>[^/]+)|
(?:(?:[^/]+)/v/(?P<vodid>[^/]+))|
(?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
(?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
)
/?(?:\#.*)?$
"""
_PAGE_LIMIT = 100
class TwitchBaseIE(InfoExtractor):
_VALID_URL_BASE = r'http://(?:www\.)?twitch\.tv'
_API_BASE = 'https://api.twitch.tv'
_LOGIN_URL = 'https://secure.twitch.tv/user/login'
_TESTS = [{
'url': 'http://www.twitch.tv/riotgames/b/577357806',
'info_dict': {
'id': 'a577357806',
'title': 'Worlds Semifinals - Star Horn Royal Club vs. OMG',
},
'playlist_mincount': 12,
}, {
'url': 'http://www.twitch.tv/acracingleague/c/5285812',
'info_dict': {
'id': 'c5285812',
'title': 'ACRL Off Season - Sports Cars @ Nordschleife',
},
'playlist_mincount': 3,
}, {
'url': 'http://www.twitch.tv/vanillatv',
'info_dict': {
'id': 'vanillatv',
'title': 'VanillaTV',
},
'playlist_mincount': 412,
}]
def _handle_error(self, response):
if not isinstance(response, dict):
@@ -64,71 +31,10 @@ class TwitchIE(InfoExtractor):
expected=True)
def _download_json(self, url, video_id, note='Downloading JSON metadata'):
response = super(TwitchIE, self)._download_json(url, video_id, note)
response = super(TwitchBaseIE, self)._download_json(url, video_id, note)
self._handle_error(response)
return response
def _extract_media(self, item, item_id):
ITEMS = {
'a': 'video',
'v': 'vod',
'c': 'chapter',
}
info = self._extract_info(self._download_json(
'%s/kraken/videos/%s%s' % (self._API_BASE, item, item_id), item_id,
'Downloading %s info JSON' % ITEMS[item]))
if item == 'v':
access_token = self._download_json(
'%s/api/vods/%s/access_token' % (self._API_BASE, item_id), item_id,
'Downloading %s access token' % ITEMS[item])
formats = self._extract_m3u8_formats(
'http://usher.twitch.tv/vod/%s?nauth=%s&nauthsig=%s'
% (item_id, access_token['token'], access_token['sig']),
item_id, 'mp4')
info['formats'] = formats
return info
response = self._download_json(
'%s/api/videos/%s%s' % (self._API_BASE, item, item_id), item_id,
'Downloading %s playlist JSON' % ITEMS[item])
entries = []
chunks = response['chunks']
qualities = list(chunks.keys())
for num, fragment in enumerate(zip(*chunks.values()), start=1):
formats = []
for fmt_num, fragment_fmt in enumerate(fragment):
format_id = qualities[fmt_num]
fmt = {
'url': fragment_fmt['url'],
'format_id': format_id,
'quality': 1 if format_id == 'live' else 0,
}
m = re.search(r'^(?P<height>\d+)[Pp]', format_id)
if m:
fmt['height'] = int(m.group('height'))
formats.append(fmt)
self._sort_formats(formats)
entry = dict(info)
entry['id'] = '%s_%d' % (entry['id'], num)
entry['title'] = '%s part %d' % (entry['title'], num)
entry['formats'] = formats
entries.append(entry)
return self.playlist_result(entries, info['id'], info['title'])
def _extract_info(self, info):
return {
'id': info['_id'],
'title': info['title'],
'description': info['description'],
'duration': info['length'],
'thumbnail': info['preview'],
'uploader': info['channel']['display_name'],
'uploader_id': info['channel']['name'],
'timestamp': parse_iso8601(info['recorded_at']),
'view_count': info['views'],
}
def _real_initialize(self):
self._login()
@@ -167,81 +73,184 @@ class TwitchIE(InfoExtractor):
raise ExtractorError(
'Unable to login: %s' % m.group('msg').strip(), expected=True)
class TwitchItemBaseIE(TwitchBaseIE):
def _download_info(self, item, item_id):
    """Fetch the kraken video record for ``item`` + ``item_id``.

    The raw JSON is passed through ``_extract_info`` and the resulting
    info dict is returned.
    """
    info_url = '%s/kraken/videos/%s%s' % (self._API_BASE, item, item_id)
    raw_info = self._download_json(
        info_url, item_id, 'Downloading %s info JSON' % self._ITEM_TYPE)
    return self._extract_info(raw_info)
def _extract_media(self, item_id):
    """Build a playlist result with one entry per part of the item.

    The playlist JSON maps each format id to a list of chunks; the lists
    are walked in lockstep so that every part carries one format per
    format id. Entries copy the item's info dict, with the part number
    appended to both id and title.
    """
    shortcut = self._ITEM_SHORTCUT
    info = self._download_info(shortcut, item_id)
    playlist = self._download_json(
        '%s/api/videos/%s%s' % (self._API_BASE, shortcut, item_id), item_id,
        'Downloading %s playlist JSON' % self._ITEM_TYPE)

    chunks = playlist['chunks']
    format_ids = list(chunks.keys())
    entries = []
    # zip(*values) groups the n-th chunk of every format id together.
    for part, fragment in enumerate(zip(*chunks.values()), start=1):
        formats = []
        for format_id, fragment_fmt in zip(format_ids, fragment):
            fmt = {
                'url': fragment_fmt['url'],
                'format_id': format_id,
                # The 'live' format is ranked above all others.
                'quality': 1 if format_id == 'live' else 0,
            }
            # Format ids such as "720p" encode the frame height.
            height_match = re.search(r'^(?P<height>\d+)[Pp]', format_id)
            if height_match:
                fmt['height'] = int(height_match.group('height'))
            formats.append(fmt)
        self._sort_formats(formats)

        entry = dict(info)
        entry.update({
            'id': '%s_%d' % (info['id'], part),
            'title': '%s part %d' % (info['title'], part),
            'formats': formats,
        })
        entries.append(entry)
    return self.playlist_result(entries, info['id'], info['title'])
def _extract_info(self, info):
    """Translate a video record from the API into an info dict.

    Every key read from ``info`` is required; a missing one raises
    ``KeyError``.
    """
    result = {
        'id': info['_id'],
        'title': info['title'],
        'description': info['description'],
        'duration': info['length'],
        'thumbnail': info['preview'],
    }
    channel = info['channel']
    result['uploader'] = channel['display_name']
    result['uploader_id'] = channel['name']
    result['timestamp'] = parse_iso8601(info['recorded_at'])
    result['view_count'] = info['views']
    return result
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
if mobj.group('chapterid'):
return self._extract_media('c', mobj.group('chapterid'))
return self._extract_media(self._match_id(url))
"""
webpage = self._download_webpage(url, chapter_id)
m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
if not m:
raise ExtractorError('Cannot find archive of a chapter')
archive_id = m.group(1)
api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
doc = self._download_xml(
api, chapter_id,
note='Downloading chapter information',
errnote='Chapter information download failed')
for a in doc.findall('.//archive'):
if archive_id == a.find('./id').text:
break
else:
raise ExtractorError('Could not find chapter in chapter information')
class TwitchVideoIE(TwitchItemBaseIE):
IE_NAME = 'twitch:video'
_VALID_URL = r'%s/[^/]+/b/(?P<id>[^/]+)' % TwitchBaseIE._VALID_URL_BASE
_ITEM_TYPE = 'video'
_ITEM_SHORTCUT = 'a'
video_url = a.find('./video_file_url').text
video_ext = video_url.rpartition('.')[2] or 'flv'
_TEST = {
'url': 'http://www.twitch.tv/riotgames/b/577357806',
'info_dict': {
'id': 'a577357806',
'title': 'Worlds Semifinals - Star Horn Royal Club vs. OMG',
},
'playlist_mincount': 12,
}
chapter_api_url = 'https://api.twitch.tv/kraken/videos/c' + chapter_id
chapter_info = self._download_json(
chapter_api_url, 'c' + chapter_id,
note='Downloading chapter metadata',
errnote='Download of chapter metadata failed')
bracket_start = int(doc.find('.//bracket_start').text)
bracket_end = int(doc.find('.//bracket_end').text)
class TwitchChapterIE(TwitchItemBaseIE):
IE_NAME = 'twitch:chapter'
_VALID_URL = r'%s/[^/]+/c/(?P<id>[^/]+)' % TwitchBaseIE._VALID_URL_BASE
_ITEM_TYPE = 'chapter'
_ITEM_SHORTCUT = 'c'
# TODO determine start (and probably fix up file)
# youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
#video_url += '?start=' + TODO:start_timestamp
# bracket_start is 13290, but we want 51670615
self._downloader.report_warning('Chapter detected, but we can just download the whole file. '
'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
_TEST = {
'url': 'http://www.twitch.tv/acracingleague/c/5285812',
'info_dict': {
'id': 'c5285812',
'title': 'ACRL Off Season - Sports Cars @ Nordschleife',
},
'playlist_mincount': 3,
}
info = {
'id': 'c' + chapter_id,
'url': video_url,
'ext': video_ext,
'title': chapter_info['title'],
'thumbnail': chapter_info['preview'],
'description': chapter_info['description'],
'uploader': chapter_info['channel']['display_name'],
'uploader_id': chapter_info['channel']['name'],
}
return info
"""
elif mobj.group('videoid'):
return self._extract_media('a', mobj.group('videoid'))
elif mobj.group('vodid'):
return self._extract_media('v', mobj.group('vodid'))
elif mobj.group('channelid'):
channel_id = mobj.group('channelid')
info = self._download_json(
'%s/kraken/channels/%s' % (self._API_BASE, channel_id),
channel_id, 'Downloading channel info JSON')
channel_name = info.get('display_name') or info.get('name')
entries = []
offset = 0
limit = self._PAGE_LIMIT
for counter in itertools.count(1):
response = self._download_json(
'%s/kraken/channels/%s/videos/?offset=%d&limit=%d'
% (self._API_BASE, channel_id, offset, limit),
channel_id, 'Downloading channel videos JSON page %d' % counter)
videos = response['videos']
if not videos:
break
entries.extend([self.url_result(video['url'], 'Twitch') for video in videos])
offset += limit
return self.playlist_result(entries, channel_id, channel_name)
class TwitchVodIE(TwitchItemBaseIE):
    """Extractor for a single Twitch VOD addressed as ``<channel>/v/<id>``.

    Unlike the other item extractors, a VOD is delivered as an m3u8 playlist
    that has to be requested with a per-item access token.
    """
    IE_NAME = 'twitch:vod'
    _VALID_URL = r'%s/[^/]+/v/(?P<id>[^/]+)' % TwitchBaseIE._VALID_URL_BASE
    _ITEM_TYPE = 'vod'
    _ITEM_SHORTCUT = 'v'

    _TEST = {
        'url': 'http://www.twitch.tv/ksptv/v/3622000',
        'info_dict': {
            'id': 'v3622000',
            'ext': 'mp4',
            'title': '''KSPTV: Squadcast: "Everyone's on vacation so here's Dahud" Edition!''',
            # Raw string: '\.' is not a valid escape sequence in a plain
            # string literal and triggers a warning on modern Python.
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 6951,
            'timestamp': 1419028564,
            'upload_date': '20141219',
            'uploader': 'KSPTV',
            'uploader_id': 'ksptv',
            'view_count': int,
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }

    def _real_extract(self, url):
        """Return the info dict for the VOD at *url*, including its formats.

        The item metadata comes from ``_download_info``; the formats are
        read from the usher m3u8 playlist, which is requested with the token
        and signature obtained from the access token endpoint.
        """
        item_id = self._match_id(url)
        info = self._download_info(self._ITEM_SHORTCUT, item_id)

        access_token = self._download_json(
            '%s/api/vods/%s/access_token' % (self._API_BASE, item_id), item_id,
            'Downloading %s access token' % self._ITEM_TYPE)

        # NOTE(review): token and signature are interpolated into the query
        # string without URL-encoding - confirm this is what usher expects.
        info['formats'] = self._extract_m3u8_formats(
            'http://usher.twitch.tv/vod/%s?nauth=%s&nauthsig=%s'
            % (item_id, access_token['token'], access_token['sig']),
            item_id, 'mp4')
        return info
class TwitchPlaylistBaseIE(TwitchBaseIE):
    """Shared paging logic for extractors that list a channel's videos.

    Subclasses provide ``_VALID_URL`` and ``_PLAYLIST_TYPE`` (only used in
    the progress note) and may override ``_PLAYLIST_URL``.
    """
    # Template filled with (channel id, offset, limit). The API base is
    # substituted once here, hence the doubled percent signs.
    _PLAYLIST_URL = '%s/kraken/channels/%%s/videos/?offset=%%d&limit=%%d' % TwitchBaseIE._API_BASE
    # Number of videos requested per page.
    _PAGE_LIMIT = 100

    def _extract_playlist(self, channel_id):
        """Return a playlist result holding every video listed for *channel_id*.

        The channel info is downloaded first to obtain the playlist title
        (the display name, falling back to the plain name). Video pages are
        then requested one after another until a page with no videos is
        returned.
        """
        channel_info = self._download_json(
            '%s/kraken/channels/%s' % (self._API_BASE, channel_id),
            channel_id, 'Downloading channel info JSON')
        playlist_title = channel_info.get('display_name')
        if not playlist_title:
            playlist_title = channel_info.get('name')

        page_size = self._PAGE_LIMIT
        results = []
        for page_num in itertools.count(1):
            page = self._download_json(
                self._PLAYLIST_URL % (channel_id, (page_num - 1) * page_size, page_size),
                channel_id,
                'Downloading %s videos JSON page %d' % (self._PLAYLIST_TYPE, page_num))
            page_videos = page['videos']
            if not page_videos:
                # An empty page marks the end of the listing.
                break
            for video in page_videos:
                results.append(self.url_result(video['url']))
        return self.playlist_result(results, channel_id, playlist_title)

    def _real_extract(self, url):
        """Extract the playlist for the channel id matched in *url*."""
        channel_id = self._match_id(url)
        return self._extract_playlist(channel_id)
class TwitchProfileIE(TwitchPlaylistBaseIE):
    """Playlist extractor for a channel's ``/profile`` page."""
    IE_NAME = 'twitch:profile'
    # Only used in the progress note printed while paging.
    _PLAYLIST_TYPE = 'profile'
    # Matches <base>/<channel>/profile with an optional trailing slash and
    # fragment; nothing else may follow.
    _VALID_URL = r'%s/(?P<id>[^/]+)/profile/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE

    _TEST = {
        'url': 'http://www.twitch.tv/vanillatv/profile',
        'playlist_mincount': 412,
        'info_dict': {
            'id': 'vanillatv',
            'title': 'VanillaTV',
        },
    }
class TwitchPastBroadcastsIE(TwitchPlaylistBaseIE):
    """Playlist extractor for a channel's ``/profile/past_broadcasts`` page."""
    IE_NAME = 'twitch:past_broadcasts'
    # Only used in the progress note printed while paging.
    _PLAYLIST_TYPE = 'past broadcasts'
    # Same listing endpoint as the base class, with the broadcasts flag set.
    _PLAYLIST_URL = TwitchPlaylistBaseIE._PLAYLIST_URL + '&broadcasts=true'
    # Matches <base>/<channel>/profile/past_broadcasts with an optional
    # trailing slash and fragment; nothing else may follow.
    _VALID_URL = r'%s/(?P<id>[^/]+)/profile/past_broadcasts/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE

    _TEST = {
        'url': 'http://www.twitch.tv/spamfish/profile/past_broadcasts',
        'playlist_mincount': 54,
        'info_dict': {
            'id': 'spamfish',
            'title': 'Spamfish',
        },
    }

View File

@@ -4,6 +4,7 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import (
compat_urllib_parse,
compat_urllib_request,
)
from ..utils import (
remove_start,
@@ -16,20 +17,23 @@ class VideoMegaIE(InfoExtractor):
(?:iframe\.php)?\?ref=(?P<id>[A-Za-z0-9]+)
'''
_TEST = {
'url': 'http://videomega.tv/?ref=GKeGPVedBe',
'md5': '240fb5bcf9199961f48eb17839b084d6',
'url': 'http://videomega.tv/?ref=QR0HCUHI1661IHUCH0RQ',
'md5': 'bf5c2f95c4c917536e80936af7bc51e1',
'info_dict': {
'id': 'GKeGPVedBe',
'id': 'QR0HCUHI1661IHUCH0RQ',
'ext': 'mp4',
'title': 'XXL - All Sports United',
'title': 'Big Buck Bunny',
'thumbnail': 're:^https?://.*\.jpg$',
}
}
def _real_extract(self, url):
video_id = self._match_id(url)
url = 'http://videomega.tv/iframe.php?ref={0:}'.format(video_id)
webpage = self._download_webpage(url, video_id)
iframe_url = 'http://videomega.tv/iframe.php?ref={0:}'.format(video_id)
req = compat_urllib_request.Request(iframe_url)
req.add_header('Referer', url)
webpage = self._download_webpage(req, video_id)
escaped_data = self._search_regex(
r'unescape\("([^"]+)"\)', webpage, 'escaped data')
@@ -37,13 +41,13 @@ class VideoMegaIE(InfoExtractor):
thumbnail = self._search_regex(
r'image:\s*"([^"]+)"', playlist, 'thumbnail', fatal=False)
url = self._search_regex(r'file:\s*"([^"]+)"', playlist, 'URL')
video_url = self._search_regex(r'file:\s*"([^"]+)"', playlist, 'URL')
title = remove_start(self._html_search_regex(
r'<title>(.*?)</title>', webpage, 'title'), 'VideoMega.tv - ')
formats = [{
'format_id': 'sd',
'url': url,
'url': video_url,
}]
self._sort_formats(formats)
@@ -52,4 +56,5 @@ class VideoMegaIE(InfoExtractor):
'title': title,
'formats': formats,
'thumbnail': thumbnail,
'http_referer': iframe_url,
}

View File

@@ -394,6 +394,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'format': '141',
},
},
# JS player signature function name containing $
{
'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
'info_dict': {
'id': 'nfWlot6h_JM',
'ext': 'm4a',
'title': 'Taylor Swift - Shake It Off',
'description': 'md5:2acfda1b285bdd478ccec22f9918199d',
'uploader': 'TaylorSwiftVEVO',
'uploader_id': 'TaylorSwiftVEVO',
'upload_date': '20140818',
},
'params': {
'youtube_include_dash_manifest': True,
'format': '141',
},
},
# Controversy video
{
'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
@@ -588,7 +605,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
def _parse_sig_js(self, jscode):
funcname = self._search_regex(
r'\.sig\|\|([a-zA-Z0-9]+)\(', jscode,
r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode,
'Initial JS player signature function name')
jsi = JSInterpreter(jscode)

View File

@@ -264,7 +264,7 @@ def parseOpts(overrideArguments=None):
authentication.add_option(
'-p', '--password',
dest='password', metavar='PASSWORD',
help='account password')
help='account password. If this option is left out, youtube-dl will ask interactively.')
authentication.add_option(
'-2', '--twofactor',
dest='twofactor', metavar='TWOFACTOR',

View File

@@ -475,15 +475,21 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
filename = information['filepath']
input_files = [filename] + [subtitles_filename(filename, lang, self._subformat) for lang in sub_langs]
opts = ['-map', '0:0', '-map', '0:1', '-c:v', 'copy', '-c:a', 'copy']
opts = [
'-map', '0',
'-c', 'copy',
# Don't copy the existing subtitles, we may be running the
# postprocessor a second time
'-map', '-0:s',
'-c:s', 'mov_text',
]
for (i, lang) in enumerate(sub_langs):
opts.extend(['-map', '%d:0' % (i + 1), '-c:s:%d' % i, 'mov_text'])
opts.extend(['-map', '%d:0' % (i + 1)])
lang_code = self._conver_lang_code(lang)
if lang_code is not None:
opts.extend(['-metadata:s:s:%d' % i, 'language=%s' % lang_code])
opts.extend(['-f', 'mp4'])
temp_filename = filename + '.temp'
temp_filename = prepend_extension(filename, 'temp')
self._downloader.to_screen('[ffmpeg] Embedding subtitles in \'%s\'' % filename)
self.run_ffmpeg_multiple_files(input_files, temp_filename, opts)
os.remove(encodeFilename(filename))

View File

@@ -1277,7 +1277,7 @@ def parse_duration(s):
s = s.strip()
m = re.match(
r'''(?ix)T?
r'''(?ix)(?:P?T)?
(?:
(?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
(?P<only_hours>[0-9.]+)\s*(?:hours?)|
@@ -1612,6 +1612,14 @@ def urlhandle_detect_ext(url_handle):
except AttributeError: # Python < 3
getheader = url_handle.info().getheader
cd = getheader('Content-Disposition')
if cd:
m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
if m:
e = determine_ext(m.group('filename'), default_ext=None)
if e:
return e
return getheader('Content-Type').split("/")[1]

View File

@@ -1,3 +1,3 @@
from __future__ import unicode_literals
__version__ = '2015.01.15'
__version__ = '2015.01.22'