Compare commits

...

121 Commits

Author SHA1 Message Date
Philipp Hagemeister
a0dfcdce5e release 2014.02.17 2014-02-17 11:33:13 +01:00
Philipp Hagemeister
96d1637082 Credit @Nikerabbit for helsinki 2014-02-17 11:33:01 +01:00
Philipp Hagemeister
960f317171 [helsinki] Simplify 2014-02-17 11:32:30 +01:00
Philipp Hagemeister
4412ca751d Merge remote-tracking branch 'Nikerabbit/hki' 2014-02-17 11:26:09 +01:00
Sergey M.
cbffec0c95 Credit @patheticpat for 4tube.com (#2398) 2014-02-17 09:08:38 +07:00
Sergey M.
0cea52cc18 Credit @pulpe for play.iprima.cz and stream.cz 2014-02-17 09:07:36 +07:00
Sergey M.
6d784e87f4 Credit @prutz1311 for normalboots.com (#2279) 2014-02-17 09:03:28 +07:00
Sergey M.
ae6cae78f1 [4tube] Minor changes and extract more metadata 2014-02-17 03:51:03 +07:00
Sergey M.
0f99566c01 Add one more format in unified_strdate 2014-02-17 03:47:03 +07:00
Sergey M.
2db806b4aa Improve parse_duration 2014-02-17 03:46:26 +07:00
Sergey M
3f32c0ba4c Merge branch '4tube' of https://github.com/patheticpat/youtube-dl into patheticpat-4tube 2014-02-17 02:21:45 +07:00
Sergey M.
541cb26c0d [smotri] Add entry for netrc authentication 2014-02-17 02:19:55 +07:00
Sergey M.
5544e038ab [vk] Add entry for netrc authentication 2014-02-17 02:17:10 +07:00
Sergey M.
9032dc28a6 [vk] Add login feature (Closes #2206) 2014-02-17 02:05:15 +07:00
Michael Kaiser
03635e2a71 Add support for 4tube.com. 2014-02-16 18:10:39 +01:00
Sergey M.
00cf938aa5 [nfb] Add rtmp app field to format 2014-02-16 06:11:38 +07:00
Philipp Hagemeister
a5f707c495 Merge branch 'master' of github.com:rg3/youtube-dl 2014-02-15 20:45:12 +01:00
Jaime Marquínez Ferrándiz
1824b48169 [f4m] Download only the first fragment with the --test option 2014-02-15 17:53:23 +01:00
Philipp Hagemeister
07ad22b8af [youtube:search] Mark "no results found" error as expected 2014-02-15 16:30:11 +01:00
Philipp Hagemeister
b53466e168 Fix f4m downloading on Python 2.6 2014-02-15 16:24:43 +01:00
Philipp Hagemeister
6a7a389679 Merge branch 'master' of github.com:rg3/youtube-dl 2014-02-15 15:34:17 +01:00
Philipp Hagemeister
4edff78531 Merge remote-tracking branch 'jaimeMF/f4m'
Conflicts:
	youtube_dl/extractor/__init__.py
2014-02-15 15:32:13 +01:00
Jaime Marquínez Ferrándiz
99043c2ea5 Replace test for dailymotion users 2014-02-15 13:17:31 +01:00
Jaime Marquínez Ferrándiz
e68abba910 [sohu] Skip test
Only available from China
2014-02-15 13:12:41 +01:00
Jaime Marquínez Ferrándiz
3165dc4d9f [france2.fr:generation-quoi] Skip test
The videos seem to not be available outside France
2014-02-15 13:04:31 +01:00
Niklas Laxström
66c43a53e4 Add support for video.helsinki.fi archives 2014-02-14 18:14:28 +02:00
Sergey M.
463b334616 [ndr] Replace 404 test 2014-02-14 23:12:15 +07:00
Sergey M.
b71dbc57c4 [vesti] Fix player regex (Closes #2382) 2014-02-14 22:26:13 +07:00
Philipp Hagemeister
72ca1d7f45 [vesti] Skip test 2 due to geo restrictions
At least that's how I interpret the error message "Просмотр вид��о ограничен в вашем регионе."
2014-02-13 22:19:59 +01:00
Philipp Hagemeister
76e461f395 release 2014.02.13 2014-02-13 19:13:05 +01:00
Sergey M.
1074982e6e [vesti] Add support for vesti.ru videos and live streams (Closes #2376) 2014-02-13 23:23:48 +07:00
Philipp Hagemeister
29b2aaf035 [jadorecettepub] Remove unused import 2014-02-13 16:33:12 +01:00
Philipp Hagemeister
6f90d098c5 [ecapist] modernize and fix id property 2014-02-13 16:32:42 +01:00
Sergey M.
0715161450 Merge pull request #2373 from pulpe/_description_fixes
[collegehumor, chilloutzone] changed description in tests
2014-02-12 06:22:03 -08:00
pulpe
896583517f [collegehumor, chilloutzone] changed description in tests 2014-02-12 15:11:57 +01:00
Sergey M.
713d31fac8 [gametrailers] Fix gametrailers test 2014-02-12 01:50:53 +07:00
Sergey M.
96cb10a5f5 [mtv] Improve title extraction 2014-02-12 01:07:30 +07:00
Sergey M.
c207c1044e Merge pull request #2372 from pulpe/dropbox_fix
[dropbox] replace not working test
2014-02-11 09:34:49 -08:00
pulpe
79629ec717 [dropbox] replace not working test 2014-02-11 17:27:36 +01:00
Sergey M.
008fda0f08 [ndr] Replace 404 video test 2014-02-11 21:21:05 +07:00
Jaime Marquínez Ferrándiz
0ae6b01937 [cnn] Add an extractor for blogs (closes #2361) 2014-02-11 14:38:17 +01:00
Jaime Marquínez Ferrándiz
def630e523 [xtube] Fix uploader extraction 2014-02-11 14:20:41 +01:00
Arjun Sreedharan
c5ba203e23 [xtube] use unicode_literals 2014-02-11 13:51:37 +01:00
Arjun Sreedharan
2317e6b2b3 [yahoo] use unicode_literals 2014-02-11 13:51:23 +01:00
Sergey M.
cb38928974 [firsttv] Skip test 2014-02-11 10:26:52 +07:00
Sergey M.
fa78f13302 [streamcz] Minor changes 2014-02-11 10:19:02 +07:00
Sergey M
18395217c4 Merge branch '_stream' of https://github.com/pulpe/youtube-dl into pulpe-_stream 2014-02-11 09:18:46 +07:00
Jaime Marquínez Ferrándiz
34bd987811 [freesound] Modernize 2014-02-10 21:03:14 +01:00
Jaime Marquínez Ferrándiz
af6ba6a1c4 [exfm] Modernize 2014-02-10 21:00:37 +01:00
Jaime Marquínez Ferrándiz
85409a0c69 [dotsub] Modernize 2014-02-10 20:52:53 +01:00
Jaime Marquínez Ferrándiz
ebfe352b62 [breakcom] Modernize 2014-02-10 20:48:46 +01:00
Jaime Marquínez Ferrándiz
fde56d2f17 [howcast] Modernize 2014-02-10 20:45:17 +01:00
Jaime Marquínez Ferrándiz
3501423dfe [googleplus] Modernize and simplify 2014-02-10 20:36:11 +01:00
Jaime Marquínez Ferrándiz
0de668af51 [instagram] Modernize 2014-02-10 20:24:12 +01:00
Sergey M.
2a584ea90a [firsttv] Fix video URL regex 2014-02-11 00:49:37 +07:00
Sergey M.
0f6ed94a15 [firsttv] Add support for 1tv.ru videoarchive 2014-02-11 00:20:41 +07:00
Sergey M.
bcb891e82b [lifenews] Minor improvements 2014-02-10 21:07:41 +07:00
Jaime Marquínez Ferrándiz
ac6e4ca1ed [brightcove] Unescape html entities from the 'og:video' url property (fixes #2360) 2014-02-10 07:50:10 +01:00
Philipp Hagemeister
2e20bba708 release 2014.02.10 2014-02-10 02:01:11 +01:00
Filippo Valsorda
e70dc1d14b [youtube] Correct a minor regex typo 2014-02-10 01:30:47 +01:00
pulpe
0793a7b3c7 [StreamCZ] Add support for stream.cz 2014-02-09 18:37:12 +01:00
Philipp Hagemeister
026fcc0495 Fix #2355 (date parsing with dashes) 2014-02-09 18:09:57 +01:00
Philipp Hagemeister
81c2f20b53 [youtube] Correct invalid JSON (Fixes #2353) 2014-02-09 17:56:10 +01:00
Jaime Marquínez Ferrándiz
1afe753462 [slideshare] Fix description extraction and modernize
The ‘og:description’  property doesn’t contain the full description
2014-02-09 14:23:19 +01:00
Jaime Marquínez Ferrándiz
524c2c716a [bloomberg] Fix extraction of ooyala embed code 2014-02-09 14:11:45 +01:00
Sergey M.
b542d4bbd7 [kontrtube] Add support for kontrtube.ru (Closes #2354) 2014-02-09 19:53:11 +07:00
Jaime Marquínez Ferrándiz
cf1eb45153 Add a downloader for f4m manifests 2014-02-09 12:24:54 +01:00
Jaime Marquínez Ferrándiz
a97bcd80ba Add an extractor for syfy.com
It uses theplatfrom.com, which has been updated to work with f4m manifests
2014-02-08 22:30:00 +01:00
Sergey M.
17968e444c [bbc.co.uk] Fix TV episode test 2014-02-09 04:04:21 +07:00
Sergey M
2e3fd9ec2f [bbc.co.uk] Improve overall extractor structure, add subtitles support
(#2184)

Everything from http://www.bbc.co.uk/iplayer/ should be downloadable
now.
2014-02-09 04:00:49 +07:00
Philipp Hagemeister
d6a283b025 release 2014.02.08.2 2014-02-08 19:20:35 +01:00
Philipp Hagemeister
9766538124 [jadorecettepub] Add extractor (Fixes #2148) 2014-02-08 19:20:23 +01:00
Philipp Hagemeister
98dbee8681 [jeuxvideo] Modernize 2014-02-08 18:43:12 +01:00
Philipp Hagemeister
e421491b3b release 2014.02.08.1 2014-02-08 18:38:05 +01:00
Philipp Hagemeister
6828d37c41 Merge branch 'master' of github.com:rg3/youtube-dl 2014-02-08 18:37:53 +01:00
Philipp Hagemeister
bf5f610099 [pbs] Add support for viralplayer links (Fixes #2350) 2014-02-08 18:37:33 +01:00
Sergey M.
8b7f73404a [bbc.co.uk] Fix regex 2014-02-08 22:55:43 +07:00
Sergey M
85cacb2f51 [bbc.co.uk] Add one more link format 2014-02-08 22:54:05 +07:00
Philipp Hagemeister
b3fa3917e2 release 2014.02.08 2014-02-08 16:25:03 +01:00
Sergey M.
082c6c867a [bbc.co.uk] Add support for bbc.co.uk radio programmes (Closes #2184) 2014-02-08 21:55:28 +07:00
Filippo Valsorda
03fcf1ab57 Merge pull request #2342 from MikeCol/tube8
[Tube8] Extended valid urls schema
2014-02-08 04:00:50 +01:00
MikeCol
3b00dea5eb Extended valid urls schema 2014-02-08 00:09:26 +01:00
Philipp Hagemeister
8bc6c8e3c0 [chilloutzone] Add additional tests (#2340) 2014-02-07 15:42:31 +01:00
Sergey M.
79bc27b53a [channel9] Simplify 2014-02-07 19:41:18 +07:00
Sergey M.
84dd703199 [ivi] Simplify 2014-02-07 19:36:50 +07:00
Sergey M.
c6fdba23a6 [nfb] Add workaround for python2.6 2014-02-07 19:23:53 +07:00
Philipp Hagemeister
b19fe521a9 Merge pull request #2340 from Fnordlab/master
[chilloutzone] Fixes refactoring bug
2014-02-07 12:46:56 +01:00
Andreas Schmitz
c1e672d121 [chilloutzone] fixes bug with youtube extraction
the id used for extracting the video from youtube is stored in
native_video_id not video_id. This id is only used on chilloutzone.net
2014-02-07 12:29:58 +01:00
Andreas Schmitz
f4371f4784 Merge remote-tracking branch 'upstream/master' 2014-02-07 12:20:58 +01:00
Philipp Hagemeister
d914d9d187 [chilloutzone] Add import 2014-02-07 12:03:19 +01:00
Philipp Hagemeister
845d14d377 credit @Fnordlab for chilloutzone 2014-02-07 12:00:58 +01:00
Philipp Hagemeister
4a9540b6d2 [chilloutzone] Simplify (#2338) 2014-02-07 12:00:25 +01:00
Philipp Hagemeister
9f31be7000 Merge remote-tracking branch 'Fnordlab/chilloutzone' 2014-02-07 11:50:26 +01:00
Philipp Hagemeister
41fa1b627d release 2014.02.06.3 2014-02-07 01:41:01 +01:00
Andreas Schmitz
c0c4e66b29 Merge branch 'chilloutzone' 2014-02-06 21:33:16 +01:00
Andreas Schmitz
cd8662de22 [chilloutzone] Bug fix, runs against tests
Fixes a bug with python3.3 and made the extractor run successfully
against tox
2014-02-06 21:31:04 +01:00
Sergey M.
3587159614 [nfb] Add encode POST data 2014-02-07 02:13:04 +07:00
Jaime Marquínez Ferrándiz
d67cc9fa7c [youtube:playlist] Recognize ‘top tracks’ urls (closes #2332)
The list parameter starts with ‘MC’ and can have more characters after it, including dots
2014-02-06 19:46:26 +01:00
Sergey M.
bf3a2fe923 [elpais] Fix typo 2014-02-07 00:38:29 +07:00
Sergey M.
e9ea0bf123 [ndr] Add support for ndr.de (Closes #2325) 2014-02-07 00:35:26 +07:00
Philipp Hagemeister
63424b6233 release 2014.02.06.2 2014-02-06 15:45:47 +01:00
Sergey M.
0bf35c5cf5 [nfb] Add support for onf.ca URLs 2014-02-06 21:41:31 +07:00
Sergey M.
95c29381eb [mooshare] Fix bogus video page URL 2014-02-06 21:26:12 +07:00
Sergey M.
94c4abce7f [nfb] Add support for nfb.ca (Closes #2069) 2014-02-06 21:19:13 +07:00
Andreas Schmitz
f2dffe55f8 Merge branch 'chilloutzone' 2014-02-06 11:49:38 +01:00
Andreas Schmitz
46a073bfac [chilloutzone] Added support for chilloutzone.net
Added support for chilloutzone.net videos including embedded youtube
and vimeo movies. In case you find a not working movie, drop me an
email.
2014-02-06 11:44:44 +01:00
Philipp Hagemeister
df872ec4e7 release 2014.02.06.1 2014-02-06 11:30:00 +01:00
Philipp Hagemeister
5de90176d9 [elpais] Add extractor 2014-02-06 11:29:46 +01:00
Philipp Hagemeister
dcf3eec47a [test_download] Skip over BadStatusLine errors
An error like https://travis-ci.org/rg3/youtube-dl/jobs/18317799#L449 is almost certainly the server's fault.
2014-02-06 04:19:57 +01:00
Philipp Hagemeister
e9e4f30d26 [pbs] Remove unused import 2014-02-06 04:19:43 +01:00
Philipp Hagemeister
83cebd73d4 [collegehumor] We only get shortened descriptions now 2014-02-06 04:16:22 +01:00
Philipp Hagemeister
1df4229bd7 [mtv/gametrailers] Change order of title preference
It looks like the plain title is better again
2014-02-06 04:15:12 +01:00
Philipp Hagemeister
3c995527e9 release 2014.02.06 2014-02-06 03:30:30 +01:00
Philipp Hagemeister
7c62b568a2 Merge branch 'master' of github.com:rg3/youtube-dl 2014-02-06 03:30:18 +01:00
Philipp Hagemeister
ccf9114e84 [googlesearch] Fix start, and skip playlists (Fixes #2329) 2014-02-06 03:29:10 +01:00
Jaime Marquínez Ferrándiz
d8061908bb [ina] Improve _VALID_URL regex (fixes #2328)
Accept all letters in upper case and don’t require anything after the id
2014-02-05 23:01:24 +01:00
Philipp Hagemeister
211e17dd43 release 2014.02.05 2014-02-05 21:23:28 +01:00
Philipp Hagemeister
6cb38a9994 [firstpost] Add extractor (Fixes #2324) 2014-02-05 21:23:21 +01:00
Sergey M.
fa7df757a7 [thisav] Simplify and use unicode literals 2014-02-05 19:13:06 +07:00
Sergey M.
8c82077619 [toutv] Use unicode literals 2014-02-05 19:02:03 +07:00
Sergey M.
e5d1f9e50a [m6] Add support for m6.fr (Closes #2313) 2014-02-05 17:38:17 +07:00
63 changed files with 2242 additions and 430 deletions

View File

@@ -1,5 +1,7 @@
#!/usr/bin/env python
from __future__ import unicode_literals
# Allow direct execution
import os
import sys
@@ -13,6 +15,7 @@ from youtube_dl.extractor import (
FacebookIE,
gen_extractors,
JustinTVIE,
PBSIE,
YoutubeIE,
)
@@ -29,18 +32,20 @@ class TestAllURLsMatching(unittest.TestCase):
def test_youtube_playlist_matching(self):
assertPlaylist = lambda url: self.assertMatch(url, ['youtube:playlist'])
assertPlaylist(u'ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
assertPlaylist(u'UUBABnxM4Ar9ten8Mdjj1j0Q') #585
assertPlaylist(u'PL63F0C78739B09958')
assertPlaylist(u'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q')
assertPlaylist(u'https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
assertPlaylist(u'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')
assertPlaylist(u'https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') #668
self.assertFalse('youtube:playlist' in self.matching_ies(u'PLtS2H6bU1M'))
assertPlaylist('ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
assertPlaylist('UUBABnxM4Ar9ten8Mdjj1j0Q') #585
assertPlaylist('PL63F0C78739B09958')
assertPlaylist('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q')
assertPlaylist('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
assertPlaylist('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')
assertPlaylist('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') #668
self.assertFalse('youtube:playlist' in self.matching_ies('PLtS2H6bU1M'))
# Top tracks
assertPlaylist('https://www.youtube.com/playlist?list=MCUS.20142101')
def test_youtube_matching(self):
self.assertTrue(YoutubeIE.suitable(u'PLtS2H6bU1M'))
self.assertFalse(YoutubeIE.suitable(u'https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012')) #668
self.assertTrue(YoutubeIE.suitable('PLtS2H6bU1M'))
self.assertFalse(YoutubeIE.suitable('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012')) #668
self.assertMatch('http://youtu.be/BaW_jenozKc', ['youtube'])
self.assertMatch('http://www.youtube.com/v/BaW_jenozKc', ['youtube'])
self.assertMatch('https://youtube.googleapis.com/v/BaW_jenozKc', ['youtube'])
@@ -80,7 +85,7 @@ class TestAllURLsMatching(unittest.TestCase):
self.assertTrue(JustinTVIE.suitable(u"http://www.twitch.tv/tsm_theoddone/c/2349361"))
def test_youtube_extract(self):
assertExtractId = lambda url, id: self.assertEqual(YoutubeIE()._extract_id(url), id)
assertExtractId = lambda url, id: self.assertEqual(YoutubeIE.extract_id(url), id)
assertExtractId('http://www.youtube.com/watch?&v=BaW_jenozKc', 'BaW_jenozKc')
assertExtractId('https://www.youtube.com/watch?&v=BaW_jenozKc', 'BaW_jenozKc')
assertExtractId('https://www.youtube.com/watch?feature=player_embedded&v=BaW_jenozKc', 'BaW_jenozKc')
@@ -89,7 +94,7 @@ class TestAllURLsMatching(unittest.TestCase):
assertExtractId('BaW_jenozKc', 'BaW_jenozKc')
def test_facebook_matching(self):
self.assertTrue(FacebookIE.suitable(u'https://www.facebook.com/Shiniknoh#!/photo.php?v=10153317450565268'))
self.assertTrue(FacebookIE.suitable('https://www.facebook.com/Shiniknoh#!/photo.php?v=10153317450565268'))
def test_no_duplicates(self):
ies = gen_extractors()
@@ -124,5 +129,9 @@ class TestAllURLsMatching(unittest.TestCase):
self.assertMatch('http://tatianamaslanydaily.tumblr.com/post/54196191430/orphan-black-dvd-extra-behind-the-scenes', ['Tumblr'])
self.assertMatch('http://tatianamaslanydaily.tumblr.com/post/54196191430', ['Tumblr'])
def test_pbs(self):
# https://github.com/rg3/youtube-dl/issues/2350
self.assertMatch('http://video.pbs.org/viralplayer/2365173446/', ['PBS'])
if __name__ == '__main__':
unittest.main()

View File

@@ -22,6 +22,7 @@ import socket
import youtube_dl.YoutubeDL
from youtube_dl.utils import (
compat_http_client,
compat_str,
compat_urllib_error,
compat_HTTPError,
@@ -110,7 +111,7 @@ def generator(test_case):
ydl.download([test_case['url']])
except (DownloadError, ExtractorError) as err:
# Check if the exception is not a network related one
if not err.exc_info[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError) or (err.exc_info[0] == compat_HTTPError and err.exc_info[1].code == 503):
if not err.exc_info[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError, compat_http_client.BadStatusLine) or (err.exc_info[0] == compat_HTTPError and err.exc_info[1].code == 503):
raise
if try_num == RETRIES:

View File

@@ -34,6 +34,7 @@ from youtube_dl.extractor import (
KhanAcademyIE,
EveryonesMixtapeIE,
RutubeChannelIE,
GoogleSearchIE,
GenericIE,
)
@@ -54,10 +55,10 @@ class TestPlaylists(unittest.TestCase):
def test_dailymotion_user(self):
dl = FakeYDL()
ie = DailymotionUserIE(dl)
result = ie.extract('http://www.dailymotion.com/user/generation-quoi/')
result = ie.extract('https://www.dailymotion.com/user/nqtv')
self.assertIsPlaylist(result)
self.assertEqual(result['title'], 'Génération Quoi')
self.assertTrue(len(result['entries']) >= 26)
self.assertEqual(result['title'], 'Rémi Gaillard')
self.assertTrue(len(result['entries']) >= 100)
def test_vimeo_channel(self):
dl = FakeYDL()
@@ -240,6 +241,14 @@ class TestPlaylists(unittest.TestCase):
self.assertEqual(result['title'], 'Always/Never: A Little-Seen Movie About Nuclear Command and Control : The New Yorker')
self.assertEqual(len(result['entries']), 3)
def test_GoogleSearch(self):
dl = FakeYDL()
ie = GoogleSearchIE(dl)
result = ie.extract('gvsearch15:python language')
self.assertIsPlaylist(result)
self.assertEqual(result['id'], 'python language')
self.assertEqual(result['title'], 'python language')
self.assertTrue(len(result['entries']) == 15)
if __name__ == '__main__':
unittest.main()

View File

@@ -25,6 +25,7 @@ from youtube_dl.utils import (
shell_quote,
smuggle_url,
str_to_int,
struct_unpack,
timeconvert,
unescapeHTML,
unified_strdate,
@@ -127,6 +128,7 @@ class TestUtil(unittest.TestCase):
self.assertEqual(unified_strdate('8/7/2009'), '20090708')
self.assertEqual(unified_strdate('Dec 14, 2012'), '20121214')
self.assertEqual(unified_strdate('2012/10/11 01:56:38 +0000'), '20121011')
self.assertEqual(unified_strdate('1968-12-10'), '19681210')
def test_find_xpath_attr(self):
testxml = u'''<root>
@@ -200,7 +202,16 @@ class TestUtil(unittest.TestCase):
self.assertEqual(parse_duration('1'), 1)
self.assertEqual(parse_duration('1337:12'), 80232)
self.assertEqual(parse_duration('9:12:43'), 33163)
self.assertEqual(parse_duration('12:00'), 720)
self.assertEqual(parse_duration('00:01:01'), 61)
self.assertEqual(parse_duration('x:y'), None)
self.assertEqual(parse_duration('3h11m53s'), 11513)
self.assertEqual(parse_duration('62m45s'), 3765)
self.assertEqual(parse_duration('6m59s'), 419)
self.assertEqual(parse_duration('49s'), 49)
self.assertEqual(parse_duration('0h0m0s'), 0)
self.assertEqual(parse_duration('0m0s'), 0)
self.assertEqual(parse_duration('0s'), 0)
def test_fix_xml_ampersands(self):
self.assertEqual(
@@ -236,5 +247,8 @@ class TestUtil(unittest.TestCase):
testPL(5, 2, (2, 99), [2, 3, 4])
testPL(5, 2, (20, 99), [])
def test_struct_unpack(self):
self.assertEqual(struct_unpack(u'!B', b'\x00'), (0,))
if __name__ == '__main__':
unittest.main()

View File

@@ -30,7 +30,7 @@ class TestYoutubeLists(unittest.TestCase):
result = ie.extract('https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re')
self.assertIsPlaylist(result)
self.assertEqual(result['title'], 'ytdl test PL')
ytie_results = [YoutubeIE()._extract_id(url['url']) for url in result['entries']]
ytie_results = [YoutubeIE().extract_id(url['url']) for url in result['entries']]
self.assertEqual(ytie_results, [ 'bV9L5Ht9LgY', 'FXxLjLQi3Fg', 'tU3Bgo5qJZE'])
def test_youtube_playlist_noplaylist(self):
@@ -39,7 +39,7 @@ class TestYoutubeLists(unittest.TestCase):
ie = YoutubePlaylistIE(dl)
result = ie.extract('https://www.youtube.com/watch?v=FXxLjLQi3Fg&list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re')
self.assertEqual(result['_type'], 'url')
self.assertEqual(YoutubeIE()._extract_id(result['url']), 'FXxLjLQi3Fg')
self.assertEqual(YoutubeIE().extract_id(result['url']), 'FXxLjLQi3Fg')
def test_issue_673(self):
dl = FakeYDL()
@@ -59,7 +59,7 @@ class TestYoutubeLists(unittest.TestCase):
dl = FakeYDL()
ie = YoutubePlaylistIE(dl)
result = ie.extract('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')
ytie_results = [YoutubeIE()._extract_id(url['url']) for url in result['entries']]
ytie_results = [YoutubeIE().extract_id(url['url']) for url in result['entries']]
self.assertFalse('pElCt5oNDuI' in ytie_results)
self.assertFalse('KdPEApIVdWM' in ytie_results)
@@ -76,9 +76,9 @@ class TestYoutubeLists(unittest.TestCase):
# TODO find a > 100 (paginating?) videos course
result = ie.extract('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
entries = result['entries']
self.assertEqual(YoutubeIE()._extract_id(entries[0]['url']), 'j9WZyLZCBzs')
self.assertEqual(YoutubeIE().extract_id(entries[0]['url']), 'j9WZyLZCBzs')
self.assertEqual(len(entries), 25)
self.assertEqual(YoutubeIE()._extract_id(entries[-1]['url']), 'rYefUsYuEp0')
self.assertEqual(YoutubeIE().extract_id(entries[-1]['url']), 'rYefUsYuEp0')
def test_youtube_channel(self):
dl = FakeYDL()
@@ -117,6 +117,13 @@ class TestYoutubeLists(unittest.TestCase):
original_video = entries[0]
self.assertEqual(original_video['id'], 'rjFaenf1T-Y')
def test_youtube_toptracks(self):
dl = FakeYDL()
ie = YoutubePlaylistIE(dl)
result = ie.extract('https://www.youtube.com/playlist?list=MCUS')
entries = result['entries']
self.assertEqual(len(entries), 100)
def test_youtube_toplist(self):
dl = FakeYDL()
ie = YoutubeTopListIE(dl)

View File

@@ -41,6 +41,11 @@ __authors__ = (
'Chris Gahan',
'Saimadhav Heblikar',
'Mike Col',
'Oleg Prutz',
'pulpe',
'Andreas Schmitz',
'Michael Kaiser',
'Niklas Laxström',
)
__license__ = 'Public Domain'

View File

@@ -5,6 +5,7 @@ from .hls import HlsFD
from .http import HttpFD
from .mplayer import MplayerFD
from .rtmp import RtmpFD
from .f4m import F4mFD
from ..utils import (
determine_ext,
@@ -22,5 +23,7 @@ def get_suitable_downloader(info_dict):
return HlsFD
if url.startswith('mms') or url.startswith('rtsp'):
return MplayerFD
if determine_ext(url) == 'f4m':
return F4mFD
else:
return HttpFD

View File

@@ -0,0 +1,315 @@
from __future__ import unicode_literals
import base64
import io
import itertools
import os
import time
import xml.etree.ElementTree as etree
from .common import FileDownloader
from .http import HttpFD
from ..utils import (
struct_pack,
struct_unpack,
compat_urllib_request,
compat_urlparse,
format_bytes,
encodeFilename,
sanitize_open,
)
class FlvReader(io.BytesIO):
"""
Reader for Flv files
The file format is documented in https://www.adobe.com/devnet/f4v.html
"""
# Utility functions for reading numbers and strings
def read_unsigned_long_long(self):
return struct_unpack('!Q', self.read(8))[0]
def read_unsigned_int(self):
return struct_unpack('!I', self.read(4))[0]
def read_unsigned_char(self):
return struct_unpack('!B', self.read(1))[0]
def read_string(self):
res = b''
while True:
char = self.read(1)
if char == b'\x00':
break
res += char
return res
def read_box_info(self):
"""
Read a box and return the info as a tuple: (box_size, box_type, box_data)
"""
real_size = size = self.read_unsigned_int()
box_type = self.read(4)
header_end = 8
if size == 1:
real_size = self.read_unsigned_long_long()
header_end = 16
return real_size, box_type, self.read(real_size-header_end)
def read_asrt(self):
# version
self.read_unsigned_char()
# flags
self.read(3)
quality_entry_count = self.read_unsigned_char()
# QualityEntryCount
for i in range(quality_entry_count):
self.read_string()
segment_run_count = self.read_unsigned_int()
segments = []
for i in range(segment_run_count):
first_segment = self.read_unsigned_int()
fragments_per_segment = self.read_unsigned_int()
segments.append((first_segment, fragments_per_segment))
return {
'segment_run': segments,
}
def read_afrt(self):
# version
self.read_unsigned_char()
# flags
self.read(3)
# time scale
self.read_unsigned_int()
quality_entry_count = self.read_unsigned_char()
# QualitySegmentUrlModifiers
for i in range(quality_entry_count):
self.read_string()
fragments_count = self.read_unsigned_int()
fragments = []
for i in range(fragments_count):
first = self.read_unsigned_int()
first_ts = self.read_unsigned_long_long()
duration = self.read_unsigned_int()
if duration == 0:
discontinuity_indicator = self.read_unsigned_char()
else:
discontinuity_indicator = None
fragments.append({
'first': first,
'ts': first_ts,
'duration': duration,
'discontinuity_indicator': discontinuity_indicator,
})
return {
'fragments': fragments,
}
def read_abst(self):
# version
self.read_unsigned_char()
# flags
self.read(3)
# BootstrapinfoVersion
bootstrap_info_version = self.read_unsigned_int()
# Profile,Live,Update,Reserved
self.read(1)
# time scale
self.read_unsigned_int()
# CurrentMediaTime
self.read_unsigned_long_long()
# SmpteTimeCodeOffset
self.read_unsigned_long_long()
# MovieIdentifier
movie_identifier = self.read_string()
server_count = self.read_unsigned_char()
# ServerEntryTable
for i in range(server_count):
self.read_string()
quality_count = self.read_unsigned_char()
# QualityEntryTable
for i in range(server_count):
self.read_string()
# DrmData
self.read_string()
# MetaData
self.read_string()
segments_count = self.read_unsigned_char()
segments = []
for i in range(segments_count):
box_size, box_type, box_data = self.read_box_info()
assert box_type == b'asrt'
segment = FlvReader(box_data).read_asrt()
segments.append(segment)
fragments_run_count = self.read_unsigned_char()
fragments = []
for i in range(fragments_run_count):
box_size, box_type, box_data = self.read_box_info()
assert box_type == b'afrt'
fragments.append(FlvReader(box_data).read_afrt())
return {
'segments': segments,
'fragments': fragments,
}
def read_bootstrap_info(self):
total_size, box_type, box_data = self.read_box_info()
assert box_type == b'abst'
return FlvReader(box_data).read_abst()
def read_bootstrap_info(bootstrap_bytes):
return FlvReader(bootstrap_bytes).read_bootstrap_info()
def build_fragments_list(boot_info):
""" Return a list of (segment, fragment) for each fragment in the video """
res = []
segment_run_table = boot_info['segments'][0]
# I've only found videos with one segment
segment_run_entry = segment_run_table['segment_run'][0]
n_frags = segment_run_entry[1]
fragment_run_entry_table = boot_info['fragments'][0]['fragments']
first_frag_number = fragment_run_entry_table[0]['first']
for (i, frag_number) in zip(range(1, n_frags+1), itertools.count(first_frag_number)):
res.append((1, frag_number))
return res
def write_flv_header(stream, metadata):
"""Writes the FLV header and the metadata to stream"""
# FLV header
stream.write(b'FLV\x01')
stream.write(b'\x05')
stream.write(b'\x00\x00\x00\x09')
# FLV File body
stream.write(b'\x00\x00\x00\x00')
# FLVTAG
# Script data
stream.write(b'\x12')
# Size of the metadata with 3 bytes
stream.write(struct_pack('!L', len(metadata))[1:])
stream.write(b'\x00\x00\x00\x00\x00\x00\x00')
stream.write(metadata)
# Magic numbers extracted from the output files produced by AdobeHDS.php
#(https://github.com/K-S-V/Scripts)
stream.write(b'\x00\x00\x01\x73')
def _add_ns(prop):
return '{http://ns.adobe.com/f4m/1.0}%s' % prop
class HttpQuietDownloader(HttpFD):
def to_screen(self, *args, **kargs):
pass
class F4mFD(FileDownloader):
"""
A downloader for f4m manifests or AdobeHDS.
"""
def real_download(self, filename, info_dict):
man_url = info_dict['url']
self.to_screen('[download] Downloading f4m manifest')
manifest = self.ydl.urlopen(man_url).read()
self.report_destination(filename)
http_dl = HttpQuietDownloader(self.ydl,
{
'continuedl': True,
'quiet': True,
'noprogress': True,
'test': self.params.get('test', False),
})
doc = etree.fromstring(manifest)
formats = [(int(f.attrib.get('bitrate', -1)), f) for f in doc.findall(_add_ns('media'))]
formats = sorted(formats, key=lambda f: f[0])
rate, media = formats[-1]
base_url = compat_urlparse.urljoin(man_url, media.attrib['url'])
bootstrap = base64.b64decode(doc.find(_add_ns('bootstrapInfo')).text)
metadata = base64.b64decode(media.find(_add_ns('metadata')).text)
boot_info = read_bootstrap_info(bootstrap)
fragments_list = build_fragments_list(boot_info)
if self.params.get('test', False):
# We only download the first fragment
fragments_list = fragments_list[:1]
total_frags = len(fragments_list)
tmpfilename = self.temp_name(filename)
(dest_stream, tmpfilename) = sanitize_open(tmpfilename, 'wb')
write_flv_header(dest_stream, metadata)
# This dict stores the download progress, it's updated by the progress
# hook
state = {
'downloaded_bytes': 0,
'frag_counter': 0,
}
start = time.time()
def frag_progress_hook(status):
frag_total_bytes = status.get('total_bytes', 0)
estimated_size = (state['downloaded_bytes'] +
(total_frags - state['frag_counter']) * frag_total_bytes)
if status['status'] == 'finished':
state['downloaded_bytes'] += frag_total_bytes
state['frag_counter'] += 1
progress = self.calc_percent(state['frag_counter'], total_frags)
byte_counter = state['downloaded_bytes']
else:
frag_downloaded_bytes = status['downloaded_bytes']
byte_counter = state['downloaded_bytes'] + frag_downloaded_bytes
frag_progress = self.calc_percent(frag_downloaded_bytes,
frag_total_bytes)
progress = self.calc_percent(state['frag_counter'], total_frags)
progress += frag_progress / float(total_frags)
eta = self.calc_eta(start, time.time(), estimated_size, byte_counter)
self.report_progress(progress, format_bytes(estimated_size),
status.get('speed'), eta)
http_dl.add_progress_hook(frag_progress_hook)
frags_filenames = []
for (seg_i, frag_i) in fragments_list:
name = 'Seg%d-Frag%d' % (seg_i, frag_i)
url = base_url + name
frag_filename = '%s-%s' % (tmpfilename, name)
success = http_dl.download(frag_filename, {'url': url})
if not success:
return False
with open(frag_filename, 'rb') as down:
down_data = down.read()
reader = FlvReader(down_data)
while True:
_, box_type, box_data = reader.read_box_info()
if box_type == b'mdat':
dest_stream.write(box_data)
break
frags_filenames.append(frag_filename)
self.report_finish(format_bytes(state['downloaded_bytes']), time.time() - start)
self.try_rename(tmpfilename, filename)
for frag_file in frags_filenames:
os.remove(frag_file)
fsize = os.path.getsize(encodeFilename(filename))
self._hook_progress({
'downloaded_bytes': fsize,
'total_bytes': fsize,
'filename': filename,
'status': 'finished',
})
return True

View File

@@ -87,8 +87,10 @@ class RtmpFD(FileDownloader):
url = info_dict['url']
player_url = info_dict.get('player_url', None)
page_url = info_dict.get('page_url', None)
app = info_dict.get('app', None)
play_path = info_dict.get('play_path', None)
tc_url = info_dict.get('tc_url', None)
flash_version = info_dict.get('flash_version', None)
live = info_dict.get('rtmp_live', False)
conn = info_dict.get('rtmp_conn', None)
@@ -111,12 +113,16 @@ class RtmpFD(FileDownloader):
basic_args += ['--swfVfy', player_url]
if page_url is not None:
basic_args += ['--pageUrl', page_url]
if app is not None:
basic_args += ['--app', app]
if play_path is not None:
basic_args += ['--playpath', play_path]
if tc_url is not None:
basic_args += ['--tcUrl', url]
if test:
basic_args += ['--stop', '1']
if flash_version is not None:
basic_args += ['--flashVer', flash_version]
if live:
basic_args += ['--live']
if conn:

View File

@@ -15,6 +15,7 @@ from .arte import (
from .auengine import AUEngineIE
from .bambuser import BambuserIE, BambuserChannelIE
from .bandcamp import BandcampIE, BandcampAlbumIE
from .bbccouk import BBCCoUkIE
from .blinkx import BlinkxIE
from .bliptv import BlipTVIE, BlipTVUserIE
from .bloomberg import BloombergIE
@@ -25,12 +26,16 @@ from .canalplus import CanalplusIE
from .canalc2 import Canalc2IE
from .cbs import CBSIE
from .channel9 import Channel9IE
from .chilloutzone import ChilloutzoneIE
from .cinemassacre import CinemassacreIE
from .clipfish import ClipfishIE
from .cliphunter import CliphunterIE
from .clipsyndicate import ClipsyndicateIE
from .cmt import CMTIE
from .cnn import CNNIE
from .cnn import (
CNNIE,
CNNBlogsIE,
)
from .collegehumor import CollegeHumorIE
from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE
from .condenast import CondeNastIE
@@ -54,17 +59,21 @@ from .ebaumsworld import EbaumsWorldIE
from .ehow import EHowIE
from .eighttracks import EightTracksIE
from .eitb import EitbIE
from .elpais import ElPaisIE
from .escapist import EscapistIE
from .everyonesmixtape import EveryonesMixtapeIE
from .exfm import ExfmIE
from .extremetube import ExtremeTubeIE
from .facebook import FacebookIE
from .faz import FazIE
from .firstpost import FirstpostIE
from .firsttv import FirstTVIE
from .fktv import (
FKTVIE,
FKTVPosteckeIE,
)
from .flickr import FlickrIE
from .fourtube import FourTubeIE
from .franceinter import FranceInterIE
from .francetv import (
PluzzIE,
@@ -83,6 +92,7 @@ from .generic import GenericIE
from .googleplus import GooglePlusIE
from .googlesearch import GoogleSearchIE
from .hark import HarkIE
from .helsinki import HelsinkiIE
from .hotnewhiphop import HotNewHipHopIE
from .howcast import HowcastIE
from .huffpost import HuffPostIE
@@ -101,6 +111,7 @@ from .ivi import (
IviIE,
IviCompilationIE
)
from .jadorecettepub import JadoreCettePubIE
from .jeuxvideo import JeuxVideoIE
from .jukebox import JukeboxIE
from .justintv import JustinTVIE
@@ -110,6 +121,7 @@ from .keezmovies import KeezMoviesIE
from .khanacademy import KhanAcademyIE
from .kickstarter import KickStarterIE
from .keek import KeekIE
from .kontrtube import KontrTubeIE
from .la7 import LA7IE
from .lifenews import LifeNewsIE
from .liveleak import LiveLeakIE
@@ -118,6 +130,7 @@ from .lynda import (
LyndaIE,
LyndaCourseIE
)
from .m6 import M6IE
from .macgamestore import MacGameStoreIE
from .malemotion import MalemotionIE
from .mdr import MDRIE
@@ -139,8 +152,10 @@ from .myvideo import MyVideoIE
from .naver import NaverIE
from .nba import NBAIE
from .nbc import NBCNewsIE
from .ndr import NDRIE
from .ndtv import NDTVIE
from .newgrounds import NewgroundsIE
from .nfb import NFBIE
from .nhl import NHLIE, NHLVideocenterIE
from .niconico import NiconicoIE
from .ninegag import NineGagIE
@@ -195,6 +210,8 @@ from .stanfordoc import StanfordOpenClassroomIE
from .statigram import StatigramIE
from .steam import SteamIE
from .streamcloud import StreamcloudIE
from .streamcz import StreamCZIE
from .syfy import SyfyIE
from .sztvhu import SztvHuIE
from .teamcoco import TeamcocoIE
from .techtalks import TechTalksIE
@@ -216,6 +233,7 @@ from .ustream import UstreamIE, UstreamChannelIE
from .vbox7 import Vbox7IE
from .veehd import VeeHDIE
from .veoh import VeohIE
from .vesti import VestiIE
from .vevo import VevoIE
from .vice import ViceIE
from .viddler import ViddlerIE

View File

@@ -0,0 +1,217 @@
from __future__ import unicode_literals
import re
from .subtitles import SubtitlesInfoExtractor
from ..utils import ExtractorError
class BBCCoUkIE(SubtitlesInfoExtractor):
IE_NAME = 'bbc.co.uk'
IE_DESC = 'BBC iPlayer'
_VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:programmes|iplayer/episode)/(?P<id>[\da-z]{8})'
_TESTS = [
{
'url': 'http://www.bbc.co.uk/programmes/p01q7wz1',
'info_dict': {
'id': 'p01q7wz4',
'ext': 'flv',
'title': 'Friction: Blu Mar Ten guest mix: Blu Mar Ten - Guest Mix',
'description': 'Blu Mar Ten deliver a Guest Mix for Friction.',
'duration': 1936,
},
'params': {
# rtmp download
'skip_download': True,
}
},
{
'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
'info_dict': {
'id': 'b00yng1d',
'ext': 'flv',
'title': 'The Man in Black: Series 3: The Printed Name',
'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
'duration': 1800,
},
'params': {
# rtmp download
'skip_download': True,
}
},
{
'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
'info_dict': {
'id': 'b00yng1d',
'ext': 'flv',
'title': 'The Voice UK: Series 3: Blind Auditions 5',
'description': "Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.",
'duration': 5100,
},
'params': {
# rtmp download
'skip_download': True,
},
'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
}
]
def _extract_asx_playlist(self, connection, programme_id):
asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
return [ref.get('href') for ref in asx.findall('./Entry/ref')]
def _extract_connection(self, connection, programme_id):
formats = []
protocol = connection.get('protocol')
supplier = connection.get('supplier')
if protocol == 'http':
href = connection.get('href')
# ASX playlist
if supplier == 'asx':
for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
formats.append({
'url': ref,
'format_id': 'ref%s_%s' % (i, supplier),
})
# Direct link
else:
formats.append({
'url': href,
'format_id': supplier,
})
elif protocol == 'rtmp':
application = connection.get('application', 'ondemand')
auth_string = connection.get('authString')
identifier = connection.get('identifier')
server = connection.get('server')
formats.append({
'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
'play_path': identifier,
'app': '%s?%s' % (application, auth_string),
'page_url': 'http://www.bbc.co.uk',
'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
'rtmp_live': False,
'ext': 'flv',
'format_id': supplier,
})
return formats
def _extract_items(self, playlist):
return playlist.findall('./{http://bbc.co.uk/2008/emp/playlist}item')
def _extract_medias(self, media_selection):
return media_selection.findall('./{http://bbc.co.uk/2008/mp/mediaselection}media')
def _extract_connections(self, media):
return media.findall('./{http://bbc.co.uk/2008/mp/mediaselection}connection')
def _extract_video(self, media, programme_id):
formats = []
vbr = int(media.get('bitrate'))
vcodec = media.get('encoding')
service = media.get('service')
width = int(media.get('width'))
height = int(media.get('height'))
file_size = int(media.get('media_file_size'))
for connection in self._extract_connections(media):
conn_formats = self._extract_connection(connection, programme_id)
for format in conn_formats:
format.update({
'format_id': '%s_%s' % (service, format['format_id']),
'width': width,
'height': height,
'vbr': vbr,
'vcodec': vcodec,
'filesize': file_size,
})
formats.extend(conn_formats)
return formats
def _extract_audio(self, media, programme_id):
formats = []
abr = int(media.get('bitrate'))
acodec = media.get('encoding')
service = media.get('service')
for connection in self._extract_connections(media):
conn_formats = self._extract_connection(connection, programme_id)
for format in conn_formats:
format.update({
'format_id': '%s_%s' % (service, format['format_id']),
'abr': abr,
'acodec': acodec,
})
formats.extend(conn_formats)
return formats
def _extract_captions(self, media, programme_id):
subtitles = {}
for connection in self._extract_connections(media):
captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/2006/10/ttaf1}'))
srt = ''
for pos, p in enumerate(ps):
srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), p.get('begin'), p.get('end'),
p.text.strip() if p.text is not None else '')
subtitles[lang] = srt
return subtitles
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
group_id = mobj.group('id')
playlist = self._download_xml('http://www.bbc.co.uk/iplayer/playlist/%s' % group_id, group_id,
'Downloading playlist XML')
no_items = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}noItems')
if no_items is not None:
reason = no_items.get('reason')
if reason == 'preAvailability':
msg = 'Episode %s is not yet available' % group_id
elif reason == 'postAvailability':
msg = 'Episode %s is no longer available' % group_id
else:
msg = 'Episode %s is not available: %s' % (group_id, reason)
raise ExtractorError(msg, expected=True)
formats = []
subtitles = None
for item in self._extract_items(playlist):
kind = item.get('kind')
if kind != 'programme' and kind != 'radioProgramme':
continue
title = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}title').text
description = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}summary').text
programme_id = item.get('identifier')
duration = int(item.get('duration'))
media_selection = self._download_xml(
'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s' % programme_id,
programme_id, 'Downloading media selection XML')
for media in self._extract_medias(media_selection):
kind = media.get('kind')
if kind == 'audio':
formats.extend(self._extract_audio(media, programme_id))
elif kind == 'video':
formats.extend(self._extract_video(media, programme_id))
elif kind == 'captions':
subtitles = self._extract_captions(media, programme_id)
if self._downloader.params.get('listsubtitles', False):
self._list_available_subtitles(programme_id, subtitles)
return
self._sort_formats(formats)
return {
'id': programme_id,
'title': title,
'description': description,
'duration': duration,
'formats': formats,
'subtitles': subtitles,
}

View File

@@ -24,5 +24,7 @@ class BloombergIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
name = mobj.group('name')
webpage = self._download_webpage(url, name)
ooyala_url = self._twitter_search_player(webpage)
return self.url_result(ooyala_url, OoyalaIE.ie_key())
embed_code = self._search_regex(
r'<source src="https?://[^/]+/[^/]+/[^/]+/([^/]+)', webpage,
'embed code')
return OoyalaIE._build_url_result(embed_code)

View File

@@ -1,18 +1,20 @@
from __future__ import unicode_literals
import re
import json
from .common import InfoExtractor
from ..utils import determine_ext
class BreakIE(InfoExtractor):
_VALID_URL = r'(?:http://)?(?:www\.)?break\.com/video/([^/]+)'
_VALID_URL = r'http://(?:www\.)?break\.com/video/([^/]+)'
_TEST = {
u'url': u'http://www.break.com/video/when-girls-act-like-guys-2468056',
u'file': u'2468056.mp4',
u'md5': u'a3513fb1547fba4fb6cfac1bffc6c46b',
u'info_dict': {
u"title": u"When Girls Act Like D-Bags"
'url': 'http://www.break.com/video/when-girls-act-like-guys-2468056',
'md5': 'a3513fb1547fba4fb6cfac1bffc6c46b',
'info_dict': {
'id': '2468056',
'ext': 'mp4',
'title': 'When Girls Act Like D-Bags',
}
}
@@ -22,17 +24,16 @@ class BreakIE(InfoExtractor):
embed_url = 'http://www.break.com/embed/%s' % video_id
webpage = self._download_webpage(embed_url, video_id)
info_json = self._search_regex(r'var embedVars = ({.*?});', webpage,
u'info json', flags=re.DOTALL)
'info json', flags=re.DOTALL)
info = json.loads(info_json)
video_url = info['videoUri']
m_youtube = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', video_url)
if m_youtube is not None:
return self.url_result(m_youtube.group(1), 'Youtube')
final_url = video_url + '?' + info['AuthToken']
return [{
'id': video_id,
'url': final_url,
'ext': determine_ext(final_url),
'title': info['contentName'],
return {
'id': video_id,
'url': final_url,
'title': info['contentName'],
'thumbnail': info['thumbUri'],
}]
}

View File

@@ -17,6 +17,7 @@ from ..utils import (
ExtractorError,
unsmuggle_url,
unescapeHTML,
)
@@ -139,7 +140,7 @@ class BrightcoveIE(InfoExtractor):
url_m = re.search(r'<meta\s+property="og:video"\s+content="(http://c.brightcove.com/[^"]+)"', webpage)
if url_m:
return [url_m.group(1)]
return [unescapeHTML(url_m.group(1))]
matches = re.findall(
r'''(?sx)<object

View File

@@ -15,14 +15,15 @@ class Channel9IE(InfoExtractor):
'''
IE_DESC = 'Channel 9'
IE_NAME = 'channel9'
_VALID_URL = r'^https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'
_VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'
_TESTS = [
{
'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
'file': 'Events_TechEd_Australia_2013_KOS002.mp4',
'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
'info_dict': {
'id': 'Events/TechEd/Australia/2013/KOS002',
'ext': 'mp4',
'title': 'Developer Kick-Off Session: Stuff We Love',
'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
'duration': 4576,
@@ -35,9 +36,10 @@ class Channel9IE(InfoExtractor):
},
{
'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
'file': 'posts_Self-service-BI-with-Power-BI-nuclear-testing.mp4',
'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
'info_dict': {
'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
'ext': 'mp4',
'title': 'Self-service BI with Power BI - nuclear testing',
'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
'duration': 1540,

View File

@@ -0,0 +1,97 @@
from __future__ import unicode_literals
import re
import base64
import json
from .common import InfoExtractor
from ..utils import (
clean_html,
ExtractorError
)
class ChilloutzoneIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?chilloutzone\.net/video/(?P<id>[\w|-]+)\.html'
_TESTS = [{
'url': 'http://www.chilloutzone.net/video/enemene-meck-alle-katzen-weg.html',
'md5': 'a76f3457e813ea0037e5244f509e66d1',
'info_dict': {
'id': 'enemene-meck-alle-katzen-weg',
'ext': 'mp4',
'title': 'Enemene Meck - Alle Katzen weg',
'description': 'Ist das der Umkehrschluss des Niesenden Panda-Babys?',
},
}, {
'note': 'Video hosted at YouTube',
'url': 'http://www.chilloutzone.net/video/eine-sekunde-bevor.html',
'info_dict': {
'id': '1YVQaAgHyRU',
'ext': 'mp4',
'title': '16 Photos Taken 1 Second Before Disaster',
'description': 'md5:58a8fcf6a459fe0a08f54140f0ad1814',
'uploader': 'BuzzFeedVideo',
'uploader_id': 'BuzzFeedVideo',
'upload_date': '20131105',
},
}, {
'note': 'Video hosted at Vimeo',
'url': 'http://www.chilloutzone.net/video/icon-blending.html',
'md5': '2645c678b8dc4fefcc0e1b60db18dac1',
'info_dict': {
'id': '85523671',
'ext': 'mp4',
'title': 'The Sunday Times - Icons',
'description': 'md5:3e1c0dc6047498d6728dcdaad0891762',
'uploader': 'Us',
'uploader_id': 'usfilms',
'upload_date': '20140131'
},
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
base64_video_info = self._html_search_regex(
r'var cozVidData = "(.+?)";', webpage, 'video data')
decoded_video_info = base64.b64decode(base64_video_info).decode("utf-8")
video_info_dict = json.loads(decoded_video_info)
# get video information from dict
video_url = video_info_dict['mediaUrl']
description = clean_html(video_info_dict.get('description'))
title = video_info_dict['title']
native_platform = video_info_dict['nativePlatform']
native_video_id = video_info_dict['nativeVideoId']
source_priority = video_info_dict['sourcePriority']
# If nativePlatform is None a fallback mechanism is used (i.e. youtube embed)
if native_platform is None:
youtube_url = self._html_search_regex(
r'<iframe.* src="((?:https?:)?//(?:[^.]+\.)?youtube\.com/.+?)"',
webpage, 'fallback video URL', default=None)
if youtube_url is not None:
return self.url_result(youtube_url, ie='Youtube')
# Non Fallback: Decide to use native source (e.g. youtube or vimeo) or
# the own CDN
if source_priority == 'native':
if native_platform == 'youtube':
return self.url_result(native_video_id, ie='Youtube')
if native_platform == 'vimeo':
return self.url_result(
'http://vimeo.com/' + native_video_id, ie='Vimeo')
if not video_url:
raise ExtractorError('No video found')
return {
'id': video_id,
'url': video_url,
'ext': 'mp4',
'title': title,
'description': description,
}

View File

@@ -6,6 +6,7 @@ from .common import InfoExtractor
from ..utils import (
int_or_none,
parse_duration,
url_basename,
)
@@ -98,3 +99,28 @@ class CNNIE(InfoExtractor):
'duration': duration,
'upload_date': upload_date,
}
class CNNBlogsIE(InfoExtractor):
_VALID_URL = r'https?://[^\.]+\.blogs\.cnn\.com/.+'
_TEST = {
'url': 'http://reliablesources.blogs.cnn.com/2014/02/09/criminalizing-journalism/',
'md5': '3e56f97b0b6ffb4b79f4ea0749551084',
'info_dict': {
'id': 'bestoftv/2014/02/09/criminalizing-journalism.cnn',
'ext': 'mp4',
'title': 'Criminalizing journalism?',
'description': 'Glenn Greenwald responds to comments made this week on Capitol Hill that journalists could be criminal accessories.',
'upload_date': '20140209',
},
'add_ie': ['CNN'],
}
def _real_extract(self, url):
webpage = self._download_webpage(url, url_basename(url))
cnn_url = self._html_search_regex(r'data-url="(.+?)"', webpage, 'cnn url')
return {
'_type': 'url',
'url': cnn_url,
'ie_key': CNNIE.ie_key(),
}

View File

@@ -4,6 +4,7 @@ import json
import re
from .common import InfoExtractor
from ..utils import int_or_none
class CollegeHumorIE(InfoExtractor):
@@ -11,22 +12,25 @@ class CollegeHumorIE(InfoExtractor):
_TESTS = [{
'url': 'http://www.collegehumor.com/video/6902724/comic-con-cosplay-catastrophe',
'file': '6902724.mp4',
'md5': 'dcc0f5c1c8be98dc33889a191f4c26bd',
'info_dict': {
'id': '6902724',
'ext': 'mp4',
'title': 'Comic-Con Cosplay Catastrophe',
'description': 'Fans get creative this year at San Diego. Too',
'description': 'Fans get creative this year',
'age_limit': 13,
},
},
{
'url': 'http://www.collegehumor.com/video/3505939/font-conference',
'file': '3505939.mp4',
'md5': '72fa701d8ef38664a4dbb9e2ab721816',
'info_dict': {
'id': '3505939',
'ext': 'mp4',
'title': 'Font Conference',
'description': 'This video wasn\'t long enough, so we made it double-spaced.',
'description': 'This video wasn\'t long enough,',
'age_limit': 10,
'duration': 179,
},
},
# embedded youtube video
@@ -38,7 +42,7 @@ class CollegeHumorIE(InfoExtractor):
'title': 'Funny Dogs Protecting Babies Compilation 2014 [NEW HD]',
'uploader': 'Funnyplox TV',
'uploader_id': 'funnyploxtv',
'description': 'md5:11812366244110c3523968aa74f02521',
'description': 'md5:7ded37421526d54afdf005e25bc2b7a3',
'upload_date': '20140128',
},
'params': {
@@ -82,6 +86,8 @@ class CollegeHumorIE(InfoExtractor):
})
self._sort_formats(formats)
duration = int_or_none(vdata.get('duration'), 1000)
return {
'id': video_id,
'title': vdata['title'],
@@ -89,4 +95,5 @@ class CollegeHumorIE(InfoExtractor):
'thumbnail': vdata.get('thumbnail'),
'formats': formats,
'age_limit': age_limit,
'duration': duration,
}

View File

@@ -271,8 +271,11 @@ class InfoExtractor(object):
def _download_json(self, url_or_request, video_id,
note=u'Downloading JSON metadata',
errnote=u'Unable to download JSON metadata'):
errnote=u'Unable to download JSON metadata',
transform_source=None):
json_string = self._download_webpage(url_or_request, video_id, note, errnote)
if transform_source:
json_string = transform_source(json_string)
try:
return json.loads(json_string)
except ValueError as ve:

View File

@@ -1,41 +1,42 @@
from __future__ import unicode_literals
import re
import json
import time
from .common import InfoExtractor
class DotsubIE(InfoExtractor):
_VALID_URL = r'(?:http://)?(?:www\.)?dotsub\.com/view/([^/]+)'
_VALID_URL = r'http://(?:www\.)?dotsub\.com/view/(?P<id>[^/]+)'
_TEST = {
u'url': u'http://dotsub.com/view/aed3b8b2-1889-4df5-ae63-ad85f5572f27',
u'file': u'aed3b8b2-1889-4df5-ae63-ad85f5572f27.flv',
u'md5': u'0914d4d69605090f623b7ac329fea66e',
u'info_dict': {
u"title": u"Pyramids of Waste (2010), AKA The Lightbulb Conspiracy - Planned obsolescence documentary",
u"uploader": u"4v4l0n42",
u'description': u'Pyramids of Waste (2010) also known as "The lightbulb conspiracy" is a documentary about how our economic system based on consumerism and planned obsolescence is breaking our planet down.\r\n\r\nSolutions to this can be found at:\r\nhttp://robotswillstealyourjob.com\r\nhttp://www.federicopistono.org\r\n\r\nhttp://opensourceecology.org\r\nhttp://thezeitgeistmovement.com',
u'thumbnail': u'http://dotsub.com/media/aed3b8b2-1889-4df5-ae63-ad85f5572f27/p',
u'upload_date': u'20101213',
'url': 'http://dotsub.com/view/aed3b8b2-1889-4df5-ae63-ad85f5572f27',
'md5': '0914d4d69605090f623b7ac329fea66e',
'info_dict': {
'id': 'aed3b8b2-1889-4df5-ae63-ad85f5572f27',
'ext': 'flv',
'title': 'Pyramids of Waste (2010), AKA The Lightbulb Conspiracy - Planned obsolescence documentary',
'uploader': '4v4l0n42',
'description': 'Pyramids of Waste (2010) also known as "The lightbulb conspiracy" is a documentary about how our economic system based on consumerism and planned obsolescence is breaking our planet down.\r\n\r\nSolutions to this can be found at:\r\nhttp://robotswillstealyourjob.com\r\nhttp://www.federicopistono.org\r\n\r\nhttp://opensourceecology.org\r\nhttp://thezeitgeistmovement.com',
'thumbnail': 'http://dotsub.com/media/aed3b8b2-1889-4df5-ae63-ad85f5572f27/p',
'upload_date': '20101213',
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group(1)
info_url = "https://dotsub.com/api/media/%s/metadata" %(video_id)
webpage = self._download_webpage(info_url, video_id)
info = json.loads(webpage)
video_id = mobj.group('id')
info_url = "https://dotsub.com/api/media/%s/metadata" % video_id
info = self._download_json(info_url, video_id)
date = time.gmtime(info['dateCreated']/1000) # The timestamp is in miliseconds
return [{
'id': video_id,
'url': info['mediaURI'],
'ext': 'flv',
'title': info['title'],
'thumbnail': info['screenshotURI'],
return {
'id': video_id,
'url': info['mediaURI'],
'ext': 'flv',
'title': info['title'],
'thumbnail': info['screenshotURI'],
'description': info['description'],
'uploader': info['user'],
'view_count': info['numberOfViews'],
'upload_date': u'%04i%02i%02i' % (date.tm_year, date.tm_mon, date.tm_mday),
}]
'uploader': info['user'],
'view_count': info['numberOfViews'],
'upload_date': '%04i%02i%02i' % (date.tm_year, date.tm_mon, date.tm_mday),
}

View File

@@ -10,11 +10,12 @@ from .common import InfoExtractor
class DropboxIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?dropbox[.]com/s/(?P<id>[a-zA-Z0-9]{15})/(?P<title>[^?#]*)'
_TEST = {
'url': 'https://www.dropbox.com/s/mcnzehi9wo55th4/20131219_085616.mp4',
'file': 'mcnzehi9wo55th4.mp4',
'md5': 'f6d65b1b326e82fd7ab7720bea3dacae',
'url': 'https://www.dropbox.com/s/0qr9sai2veej4f8/THE_DOCTOR_GAMES.mp4',
'md5': '8ae17c51172fb7f93bdd6a214cc8c896',
'info_dict': {
'title': '20131219_085616'
'id': '0qr9sai2veej4f8',
'ext': 'mp4',
'title': 'THE_DOCTOR_GAMES'
}
}

View File

@@ -0,0 +1,58 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import unified_strdate
class ElPaisIE(InfoExtractor):
_VALID_URL = r'https?://(?:[^.]+\.)?elpais\.com/.*/(?P<id>[^/#?]+)\.html(?:$|[?#])'
IE_DESC = 'El País'
_TEST = {
'url': 'http://blogs.elpais.com/la-voz-de-inaki/2014/02/tiempo-nuevo-recetas-viejas.html',
'md5': '98406f301f19562170ec071b83433d55',
'info_dict': {
'id': 'tiempo-nuevo-recetas-viejas',
'ext': 'mp4',
'title': 'Tiempo nuevo, recetas viejas',
'description': 'De lunes a viernes, a partir de las ocho de la mañana, Iñaki Gabilondo nos cuenta su visión de la actualidad nacional e internacional.',
'upload_date': '20140206',
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
prefix = self._html_search_regex(
r'var url_cache = "([^"]+)";', webpage, 'URL prefix')
video_suffix = self._search_regex(
r"URLMediaFile = url_cache \+ '([^']+)'", webpage, 'video URL')
video_url = prefix + video_suffix
thumbnail_suffix = self._search_regex(
r"URLMediaStill = url_cache \+ '([^']+)'", webpage, 'thumbnail URL',
fatal=False)
thumbnail = (
None if thumbnail_suffix is None
else prefix + thumbnail_suffix)
title = self._html_search_regex(
'<h2 class="entry-header entry-title.*?>(.*?)</h2>',
webpage, 'title')
date_str = self._search_regex(
r'<p class="date-header date-int updated"\s+title="([^"]+)">',
webpage, 'upload date', fatal=False)
upload_date = (None if date_str is None else unified_strdate(date_str))
return {
'id': video_id,
'url': video_url,
'title': title,
'description': self._og_search_description(webpage),
'thumbnail': thumbnail,
'upload_date': upload_date,
}

View File

@@ -1,9 +1,9 @@
import json
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
compat_str,
compat_urllib_parse,
ExtractorError,
@@ -11,70 +11,68 @@ from ..utils import (
class EscapistIE(InfoExtractor):
_VALID_URL = r'^https?://?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
_VALID_URL = r'^https?://?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<id>[0-9]+)-'
_TEST = {
u'url': u'http://www.escapistmagazine.com/videos/view/the-escapist-presents/6618-Breaking-Down-Baldurs-Gate',
u'file': u'6618-Breaking-Down-Baldurs-Gate.mp4',
u'md5': u'ab3a706c681efca53f0a35f1415cf0d1',
u'info_dict': {
u"description": u"Baldur's Gate: Original, Modded or Enhanced Edition? I'll break down what you can expect from the new Baldur's Gate: Enhanced Edition.",
u"uploader": u"the-escapist-presents",
u"title": u"Breaking Down Baldur's Gate"
'url': 'http://www.escapistmagazine.com/videos/view/the-escapist-presents/6618-Breaking-Down-Baldurs-Gate',
'md5': 'ab3a706c681efca53f0a35f1415cf0d1',
'info_dict': {
'id': '6618',
'ext': 'mp4',
'description': "Baldur's Gate: Original, Modded or Enhanced Edition? I'll break down what you can expect from the new Baldur's Gate: Enhanced Edition.",
'uploader': 'the-escapist-presents',
'title': "Breaking Down Baldur's Gate",
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
showName = mobj.group('showname')
videoId = mobj.group('episode')
video_id = mobj.group('id')
self.report_extraction(videoId)
webpage = self._download_webpage(url, videoId)
self.report_extraction(video_id)
webpage = self._download_webpage(url, video_id)
videoDesc = self._html_search_regex(
r'<meta name="description" content="([^"]*)"',
webpage, u'description', fatal=False)
webpage, 'description', fatal=False)
playerUrl = self._og_search_video_url(webpage, name=u'player URL')
title = self._html_search_regex(
r'<meta name="title" content="([^"]*)"',
webpage, u'title').split(' : ')[-1]
webpage, 'title').split(' : ')[-1]
configUrl = self._search_regex('config=(.*)$', playerUrl, u'config URL')
configUrl = self._search_regex('config=(.*)$', playerUrl, 'config URL')
configUrl = compat_urllib_parse.unquote(configUrl)
formats = []
def _add_format(name, cfgurl):
configJSON = self._download_webpage(
cfgurl, videoId,
u'Downloading ' + name + ' configuration',
u'Unable to download ' + name + ' configuration')
def _add_format(name, cfgurl, quality):
config = self._download_json(
cfgurl, video_id,
'Downloading ' + name + ' configuration',
'Unable to download ' + name + ' configuration',
transform_source=lambda s: s.replace("'", '"'))
# Technically, it's JavaScript, not JSON
configJSON = configJSON.replace("'", '"')
try:
config = json.loads(configJSON)
except (ValueError,) as err:
raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
playlist = config['playlist']
formats.append({
'url': playlist[1]['url'],
'format_id': name,
'quality': quality,
})
_add_format(u'normal', configUrl)
_add_format('normal', configUrl, quality=0)
hq_url = (configUrl +
('&hq=1' if '?' in configUrl else configUrl + '?hq=1'))
try:
_add_format(u'hq', hq_url)
_add_format('hq', hq_url, quality=1)
except ExtractorError:
pass # That's fine, we'll just use normal quality
self._sort_formats(formats)
return {
'id': videoId,
'id': video_id,
'formats': formats,
'uploader': showName,
'title': title,

View File

@@ -1,56 +1,58 @@
from __future__ import unicode_literals
import re
import json
from .common import InfoExtractor
class ExfmIE(InfoExtractor):
IE_NAME = u'exfm'
IE_DESC = u'ex.fm'
_VALID_URL = r'(?:http://)?(?:www\.)?ex\.fm/song/([^/]+)'
_SOUNDCLOUD_URL = r'(?:http://)?(?:www\.)?api\.soundcloud\.com/tracks/([^/]+)/stream'
IE_NAME = 'exfm'
IE_DESC = 'ex.fm'
_VALID_URL = r'http://(?:www\.)?ex\.fm/song/(?P<id>[^/]+)'
_SOUNDCLOUD_URL = r'http://(?:www\.)?api\.soundcloud\.com/tracks/([^/]+)/stream'
_TESTS = [
{
u'url': u'http://ex.fm/song/eh359',
u'file': u'44216187.mp3',
u'md5': u'e45513df5631e6d760970b14cc0c11e7',
u'info_dict': {
u"title": u"Test House \"Love Is Not Enough\" (Extended Mix) DeadJournalist Exclusive",
u"uploader": u"deadjournalist",
u'upload_date': u'20120424',
u'description': u'Test House \"Love Is Not Enough\" (Extended Mix) DeadJournalist Exclusive',
'url': 'http://ex.fm/song/eh359',
'md5': 'e45513df5631e6d760970b14cc0c11e7',
'info_dict': {
'id': '44216187',
'ext': 'mp3',
'title': 'Test House "Love Is Not Enough" (Extended Mix) DeadJournalist Exclusive',
'uploader': 'deadjournalist',
'upload_date': '20120424',
'description': 'Test House \"Love Is Not Enough\" (Extended Mix) DeadJournalist Exclusive',
},
u'note': u'Soundcloud song',
u'skip': u'The site is down too often',
'note': 'Soundcloud song',
'skip': 'The site is down too often',
},
{
u'url': u'http://ex.fm/song/wddt8',
u'file': u'wddt8.mp3',
u'md5': u'966bd70741ac5b8570d8e45bfaed3643',
u'info_dict': {
u'title': u'Safe and Sound',
u'uploader': u'Capital Cities',
'url': 'http://ex.fm/song/wddt8',
'md5': '966bd70741ac5b8570d8e45bfaed3643',
'info_dict': {
'id': 'wddt8',
'ext': 'mp3',
'title': 'Safe and Sound',
'uploader': 'Capital Cities',
},
u'skip': u'The site is down too often',
'skip': 'The site is down too often',
},
]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
song_id = mobj.group(1)
info_url = "http://ex.fm/api/v3/song/%s" %(song_id)
webpage = self._download_webpage(info_url, song_id)
info = json.loads(webpage)
song_url = info['song']['url']
song_id = mobj.group('id')
info_url = "http://ex.fm/api/v3/song/%s" % song_id
info = self._download_json(info_url, song_id)['song']
song_url = info['url']
if re.match(self._SOUNDCLOUD_URL, song_url) is not None:
self.to_screen('Soundcloud song detected')
return self.url_result(song_url.replace('/stream',''), 'Soundcloud')
return [{
'id': song_id,
'url': song_url,
'ext': 'mp3',
'title': info['song']['title'],
'thumbnail': info['song']['image']['large'],
'uploader': info['song']['artist'],
'view_count': info['song']['loved_count'],
}]
return self.url_result(song_url.replace('/stream', ''), 'Soundcloud')
return {
'id': song_id,
'url': song_url,
'ext': 'mp3',
'title': info['title'],
'thumbnail': info['image']['large'],
'uploader': info['artist'],
'view_count': info['loved_count'],
}

View File

@@ -0,0 +1,38 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
class FirstpostIE(InfoExtractor):
IE_NAME = 'Firstpost.com'
_VALID_URL = r'http://(?:www\.)?firstpost\.com/[^/]+/.*-(?P<id>[0-9]+)\.html'
_TEST = {
'url': 'http://www.firstpost.com/india/india-to-launch-indigenous-aircraft-carrier-monday-1025403.html',
'md5': 'ee9114957692f01fb1263ed87039112a',
'info_dict': {
'id': '1025403',
'ext': 'mp4',
'title': 'India to launch indigenous aircraft carrier INS Vikrant today',
'description': 'Its flight deck is over twice the size of a football field, its power unit can light up the entire Kochi city and the cabling is enough to cover the distance between here to Delhi.',
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
video_url = self._html_search_regex(
r'<div.*?name="div_video".*?flashvars="([^"]+)">',
webpage, 'video URL')
return {
'id': video_id,
'url': video_url,
'title': self._og_search_title(webpage),
'description': self._og_search_description(webpage),
'thumbnail': self._og_search_thumbnail(webpage),
}

View File

@@ -0,0 +1,60 @@
# encoding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import int_or_none
class FirstTVIE(InfoExtractor):
IE_NAME = 'firsttv'
IE_DESC = 'Видеоархив - Первый канал'
_VALID_URL = r'http://(?:www\.)?1tv\.ru/videoarchive/(?P<id>\d+)'
_TEST = {
'url': 'http://www.1tv.ru/videoarchive/73390',
'md5': '3de6390cf0cca4a5eae1d1d83895e5ad',
'info_dict': {
'id': '73390',
'ext': 'mp4',
'title': 'Олимпийские канатные дороги',
'description': 'md5:cc730d2bf4215463e37fff6a1e277b13',
'thumbnail': 'http://img1.1tv.ru/imgsize640x360/PR20140210114657.JPG',
'duration': 149,
},
'skip': 'Only works from Russia',
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id, 'Downloading page')
video_url = self._html_search_regex(
r'''(?s)jwplayer\('flashvideoportal_1'\)\.setup\({.*?'file': '([^']+)'.*?}\);''', webpage, 'video URL')
title = self._html_search_regex(
r'<div class="tv_translation">\s*<h1><a href="[^"]+">([^<]*)</a>', webpage, 'title')
description = self._html_search_regex(
r'<div class="descr">\s*<div>&nbsp;</div>\s*<p>([^<]*)</p></div>', webpage, 'description', fatal=False)
thumbnail = self._og_search_thumbnail(webpage)
duration = self._og_search_property('video:duration', webpage, 'video duration', fatal=False)
like_count = self._html_search_regex(r'title="Понравилось".*?/></label> \[(\d+)\]',
webpage, 'like count', fatal=False)
dislike_count = self._html_search_regex(r'title="Не понравилось".*?/></label> \[(\d+)\]',
webpage, 'dislike count', fatal=False)
return {
'id': video_id,
'url': video_url,
'thumbnail': thumbnail,
'title': title,
'description': description,
'duration': int_or_none(duration),
'like_count': int_or_none(like_count),
'dislike_count': int_or_none(dislike_count),
}

View File

@@ -0,0 +1,95 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
compat_urllib_request,
unified_strdate,
str_to_int,
parse_duration,
)
from youtube_dl.utils import clean_html
class FourTubeIE(InfoExtractor):
IE_NAME = '4tube'
_VALID_URL = r'https?://(?:www\.)?4tube\.com/videos/(?P<id>\d+)'
_TEST = {
'url': 'http://www.4tube.com/videos/209733/hot-babe-holly-michaels-gets-her-ass-stuffed-by-black',
'md5': '6516c8ac63b03de06bc8eac14362db4f',
'info_dict': {
'id': '209733',
'ext': 'mp4',
'title': 'Hot Babe Holly Michaels gets her ass stuffed by black',
'uploader': 'WCP Club',
'uploader_id': 'wcp-club',
'upload_date': '20131031',
'duration': 583,
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage_url = 'http://www.4tube.com/videos/' + video_id
webpage = self._download_webpage(webpage_url, video_id)
self.report_extraction(video_id)
playlist_json = self._html_search_regex(r'var playerConfigPlaylist\s+=\s+([^;]+)', webpage, 'Playlist')
media_id = self._search_regex(r'idMedia:\s*(\d+)', playlist_json, 'Media Id')
sources = self._search_regex(r'sources:\s*\[([^\]]*)\]', playlist_json, 'Sources').split(',')
title = self._search_regex(r'title:\s*"([^"]*)', playlist_json, 'Title')
thumbnail_url = self._search_regex(r'image:\s*"([^"]*)', playlist_json, 'Thumbnail', fatal=False)
uploader_str = self._search_regex(r'<span>Uploaded by</span>(.*?)<span>', webpage, 'uploader', fatal=False)
mobj = re.search(r'<a href="/sites/(?P<id>[^"]+)"><strong>(?P<name>[^<]+)</strong></a>', uploader_str)
(uploader, uploader_id) = (mobj.group('name'), mobj.group('id')) if mobj else (clean_html(uploader_str), None)
upload_date = None
view_count = None
duration = None
description = self._html_search_meta('description', webpage, 'description')
if description:
upload_date = self._search_regex(r'Published Date: (\d{2} [a-zA-Z]{3} \d{4})', description, 'upload date',
fatal=False)
if upload_date:
upload_date = unified_strdate(upload_date)
view_count = self._search_regex(r'Views: ([\d,\.]+)', description, 'view count', fatal=False)
if view_count:
view_count = str_to_int(view_count)
duration = parse_duration(self._search_regex(r'Length: (\d+m\d+s)', description, 'duration', fatal=False))
token_url = "http://tkn.4tube.com/{0}/desktop/{1}".format(media_id, "+".join(sources))
headers = {
b'Content-Type': b'application/x-www-form-urlencoded',
b'Origin': b'http://www.4tube.com',
}
token_req = compat_urllib_request.Request(token_url, b'{}', headers)
tokens = self._download_json(token_req, video_id)
formats = [{
'url': tokens[format]['token'],
'format_id': format + 'p',
'resolution': format + 'p',
'quality': int(format),
} for format in sources]
self._sort_formats(formats)
return {
'id': video_id,
'title': title,
'formats': formats,
'thumbnail': thumbnail_url,
'uploader': uploader,
'uploader_id': uploader_id,
'upload_date': upload_date,
'view_count': view_count,
'duration': duration,
'age_limit': 18,
'webpage_url': webpage_url,
}

View File

@@ -184,6 +184,7 @@ class GenerationQuoiIE(InfoExtractor):
# It uses Dailymotion
'skip_download': True,
},
'skip': 'Only available from France',
}
def _real_extract(self, url):

View File

@@ -1,18 +1,21 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import determine_ext
class FreesoundIE(InfoExtractor):
_VALID_URL = r'(?:https?://)?(?:www\.)?freesound\.org/people/([^/]+)/sounds/(?P<id>[^/]+)'
_VALID_URL = r'https?://(?:www\.)?freesound\.org/people/([^/]+)/sounds/(?P<id>[^/]+)'
_TEST = {
u'url': u'http://www.freesound.org/people/miklovan/sounds/194503/',
u'file': u'194503.mp3',
u'md5': u'12280ceb42c81f19a515c745eae07650',
u'info_dict': {
u"title": u"gulls in the city.wav",
u"uploader" : u"miklovan",
u'description': u'the sounds of seagulls in the city',
'url': 'http://www.freesound.org/people/miklovan/sounds/194503/',
'md5': '12280ceb42c81f19a515c745eae07650',
'info_dict': {
'id': '194503',
'ext': 'mp3',
'title': 'gulls in the city.wav',
'uploader': 'miklovan',
'description': 'the sounds of seagulls in the city',
}
}
@@ -20,17 +23,17 @@ class FreesoundIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
music_id = mobj.group('id')
webpage = self._download_webpage(url, music_id)
title = self._html_search_regex(r'<div id="single_sample_header">.*?<a href="#">(.+?)</a>',
webpage, 'music title', flags=re.DOTALL)
music_url = self._og_search_property('audio', webpage, 'music url')
description = self._html_search_regex(r'<div id="sound_description">(.*?)</div>',
webpage, 'description', fatal=False, flags=re.DOTALL)
title = self._html_search_regex(
r'<div id="single_sample_header">.*?<a href="#">(.+?)</a>',
webpage, 'music title', flags=re.DOTALL)
description = self._html_search_regex(
r'<div id="sound_description">(.*?)</div>', webpage, 'description',
fatal=False, flags=re.DOTALL)
return [{
'id': music_id,
'title': title,
'url': music_url,
return {
'id': music_id,
'title': title,
'url': self._og_search_property('audio', webpage, 'music url'),
'uploader': self._og_search_property('audio:artist', webpage, 'music uploader'),
'ext': determine_ext(music_url),
'description': description,
}]
}

View File

@@ -7,10 +7,11 @@ class GametrailersIE(MTVServicesInfoExtractor):
_VALID_URL = r'http://www\.gametrailers\.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'
_TEST = {
'url': 'http://www.gametrailers.com/videos/zbvr8i/mirror-s-edge-2-e3-2013--debut-trailer',
'file': '70e9a5d7-cf25-4a10-9104-6f3e7342ae0d.mp4',
'md5': '4c8e67681a0ea7ec241e8c09b3ea8cf7',
'info_dict': {
'title': 'Mirror\'s Edge 2|E3 2013: Debut Trailer',
'id': '70e9a5d7-cf25-4a10-9104-6f3e7342ae0d',
'ext': 'mp4',
'title': 'E3 2013: Debut Trailer',
'description': 'Faith is back! Check out the World Premiere trailer for Mirror\'s Edge 2 straight from the EA Press Conference at E3 2013!',
},
}

View File

@@ -1,4 +1,5 @@
# coding: utf-8
from __future__ import unicode_literals
import datetime
import re
@@ -10,32 +11,28 @@ from ..utils import (
class GooglePlusIE(InfoExtractor):
IE_DESC = u'Google Plus'
_VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
IE_NAME = u'plus.google'
IE_DESC = 'Google Plus'
_VALID_URL = r'https://plus\.google\.com/(?:[^/]+/)*?posts/(?P<id>\w+)'
IE_NAME = 'plus.google'
_TEST = {
u"url": u"https://plus.google.com/u/0/108897254135232129896/posts/ZButuJc6CtH",
u"file": u"ZButuJc6CtH.flv",
u"info_dict": {
u"upload_date": u"20120613",
u"uploader": u"井上ヨシマサ",
u"title": u"嘆きの天使 降臨"
'url': 'https://plus.google.com/u/0/108897254135232129896/posts/ZButuJc6CtH',
'info_dict': {
'id': 'ZButuJc6CtH',
'ext': 'flv',
'upload_date': '20120613',
'uploader': '井上ヨシマサ',
'title': '嘆きの天使 降臨',
}
}
def _real_extract(self, url):
# Extract id from URL
mobj = re.match(self._VALID_URL, url)
if mobj is None:
raise ExtractorError(u'Invalid URL: %s' % url)
post_url = mobj.group(0)
video_id = mobj.group(1)
video_extension = 'flv'
video_id = mobj.group('id')
# Step 1, Retrieve post webpage to extract further information
webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
webpage = self._download_webpage(url, video_id, 'Downloading entry webpage')
self.report_extraction(video_id)
@@ -43,7 +40,7 @@ class GooglePlusIE(InfoExtractor):
upload_date = self._html_search_regex(
r'''(?x)<a.+?class="o-U-s\s[^"]+"\s+style="display:\s*none"\s*>
([0-9]{4}-[0-9]{2}-[0-9]{2})</a>''',
webpage, u'upload date', fatal=False, flags=re.VERBOSE)
webpage, 'upload date', fatal=False, flags=re.VERBOSE)
if upload_date:
# Convert timestring to a format suitable for filename
upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
@@ -51,28 +48,27 @@ class GooglePlusIE(InfoExtractor):
# Extract uploader
uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
webpage, u'uploader', fatal=False)
webpage, 'uploader', fatal=False)
# Extract title
# Get the first line for title
video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
webpage, 'title', default=u'NA')
webpage, 'title', default='NA')
# Step 2, Simulate clicking the image box to launch video
DOMAIN = 'https://plus.google.com/'
video_page = self._search_regex(r'<a href="((?:%s)?photos/.*?)"' % re.escape(DOMAIN),
webpage, u'video page URL')
webpage, 'video page URL')
if not video_page.startswith(DOMAIN):
video_page = DOMAIN + video_page
webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
webpage = self._download_webpage(video_page, video_id, 'Downloading video page')
# Extract video links on video page
"""Extract video links of all sizes"""
# Extract video links all sizes
pattern = r'\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
mobj = re.findall(pattern, webpage)
if len(mobj) == 0:
raise ExtractorError(u'Unable to extract video links')
raise ExtractorError('Unable to extract video links')
# Sort in resolution
links = sorted(mobj)
@@ -87,12 +83,11 @@ class GooglePlusIE(InfoExtractor):
except AttributeError: # Python 3
video_url = bytes(video_url, 'ascii').decode('unicode-escape')
return [{
'id': video_id,
'url': video_url,
return {
'id': video_id,
'url': video_url,
'uploader': uploader,
'upload_date': upload_date,
'title': video_title,
'ext': video_extension,
}]
'upload_date': upload_date,
'title': video_title,
'ext': 'flv',
}

View File

@@ -1,3 +1,5 @@
from __future__ import unicode_literals
import itertools
import re
@@ -8,32 +10,42 @@ from ..utils import (
class GoogleSearchIE(SearchInfoExtractor):
IE_DESC = u'Google Video search'
_MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
IE_DESC = 'Google Video search'
_MAX_RESULTS = 1000
IE_NAME = u'video.google:search'
IE_NAME = 'video.google:search'
_SEARCH_KEY = 'gvsearch'
def _get_n_results(self, query, n):
"""Get a specified number of results for a query"""
entries = []
res = {
'_type': 'playlist',
'id': query,
'entries': []
'title': query,
}
for pagenum in itertools.count(1):
result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
webpage = self._download_webpage(result_url, u'gvsearch:' + query,
note='Downloading result page ' + str(pagenum))
for pagenum in itertools.count():
result_url = (
'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en'
% (compat_urllib_parse.quote_plus(query), pagenum * 10))
for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
e = {
webpage = self._download_webpage(
result_url, 'gvsearch:' + query,
note='Downloading result page ' + str(pagenum + 1))
for hit_idx, mobj in enumerate(re.finditer(
r'<h3 class="r"><a href="([^"]+)"', webpage)):
# Skip playlists
if not re.search(r'id="vidthumb%d"' % (hit_idx + 1), webpage):
continue
entries.append({
'_type': 'url',
'url': mobj.group(1)
}
res['entries'].append(e)
})
if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
if (len(entries) >= n) or not re.search(r'class="pn" id="pnnext"', webpage):
res['entries'] = entries[:n]
return res

View File

@@ -0,0 +1,62 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import re
from .common import InfoExtractor
class HelsinkiIE(InfoExtractor):
IE_DESC = 'helsinki.fi'
_VALID_URL = r'https?://video\.helsinki\.fi/Arkisto/flash\.php\?id=(?P<id>\d+)'
_TEST = {
'url': 'http://video.helsinki.fi/Arkisto/flash.php?id=20258',
'info_dict': {
'id': '20258',
'ext': 'mp4',
'title': 'Tietotekniikkafoorumi-iltapäivä',
'description': 'md5:f5c904224d43c133225130fe156a5ee0',
},
'params': {
'skip_download': True, # RTMP
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
formats = []
mobj = re.search(r'file=((\w+):[^&]+)', webpage)
if mobj:
formats.append({
'ext': mobj.group(2),
'play_path': mobj.group(1),
'url': 'rtmp://flashvideo.it.helsinki.fi/vod/',
'player_url': 'http://video.helsinki.fi/player.swf',
'format_note': 'sd',
'quality': 0,
})
mobj = re.search(r'hd\.file=((\w+):[^&]+)', webpage)
if mobj:
formats.append({
'ext': mobj.group(2),
'play_path': mobj.group(1),
'url': 'rtmp://flashvideo.it.helsinki.fi/vod/',
'player_url': 'http://video.helsinki.fi/player.swf',
'format_note': 'hd',
'quality': 1,
})
self._sort_formats(formats)
return {
'id': video_id,
'title': self._og_search_title(webpage).replace('Video: ', ''),
'description': self._og_search_description(webpage),
'thumbnail': self._og_search_thumbnail(webpage),
'formats': formats,
}

View File

@@ -1,17 +1,20 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
class HowcastIE(InfoExtractor):
_VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
_VALID_URL = r'https?://(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
_TEST = {
u'url': u'http://www.howcast.com/videos/390161-How-to-Tie-a-Square-Knot-Properly',
u'file': u'390161.mp4',
u'md5': u'8b743df908c42f60cf6496586c7f12c3',
u'info_dict': {
u"description": u"The square knot, also known as the reef knot, is one of the oldest, most basic knots to tie, and can be used in many different ways. Here's the proper way to tie a square knot.",
u"title": u"How to Tie a Square Knot Properly"
'url': 'http://www.howcast.com/videos/390161-How-to-Tie-a-Square-Knot-Properly',
'md5': '8b743df908c42f60cf6496586c7f12c3',
'info_dict': {
'id': '390161',
'ext': 'mp4',
'description': 'The square knot, also known as the reef knot, is one of the oldest, most basic knots to tie, and can be used in many different ways. Here\'s the proper way to tie a square knot.',
'title': 'How to Tie a Square Knot Properly',
}
}
@@ -24,22 +27,15 @@ class HowcastIE(InfoExtractor):
self.report_extraction(video_id)
video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
webpage, u'video URL')
video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
webpage, u'title')
webpage, 'video URL')
video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
webpage, u'description', fatal=False)
webpage, 'description', fatal=False)
thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
webpage, u'thumbnail', fatal=False)
return [{
'id': video_id,
'url': video_url,
'ext': 'mp4',
'title': video_title,
return {
'id': video_id,
'url': video_url,
'title': self._og_search_title(webpage),
'description': video_description,
'thumbnail': thumbnail,
}]
'thumbnail': self._og_search_thumbnail(webpage),
}

View File

@@ -7,7 +7,7 @@ from .common import InfoExtractor
class InaIE(InfoExtractor):
_VALID_URL = r'http://(?:www\.)?ina\.fr/video/(?P<id>I?[A-F0-9]+)/.*'
_VALID_URL = r'http://(?:www\.)?ina\.fr/video/(?P<id>I?[A-Z0-9]+)'
_TEST = {
'url': 'http://www.ina.fr/video/I12055569/francois-hollande-je-crois-que-c-est-clair-video.html',
'md5': 'a667021bf2b41f8dc6049479d9bb38a3',

View File

@@ -1,35 +1,39 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
class InstagramIE(InfoExtractor):
_VALID_URL = r'(?:http://)?instagram\.com/p/(.*?)/'
_VALID_URL = r'http://instagram\.com/p/(?P<id>.*?)/'
_TEST = {
u'url': u'http://instagram.com/p/aye83DjauH/?foo=bar#abc',
u'file': u'aye83DjauH.mp4',
u'md5': u'0d2da106a9d2631273e192b372806516',
u'info_dict': {
u"uploader_id": u"naomipq",
u"title": u"Video by naomipq",
u'description': u'md5:1f17f0ab29bd6fe2bfad705f58de3cb8',
'url': 'http://instagram.com/p/aye83DjauH/?foo=bar#abc',
'md5': '0d2da106a9d2631273e192b372806516',
'info_dict': {
'id': 'aye83DjauH',
'ext': 'mp4',
'uploader_id': 'naomipq',
'title': 'Video by naomipq',
'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8',
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group(1)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
uploader_id = self._search_regex(r'"owner":{"username":"(.+?)"',
webpage, u'uploader id', fatal=False)
desc = self._search_regex(r'"caption":"(.*?)"', webpage, u'description',
webpage, 'uploader id', fatal=False)
desc = self._search_regex(r'"caption":"(.*?)"', webpage, 'description',
fatal=False)
return [{
'id': video_id,
'url': self._og_search_video_url(webpage, secure=False),
'ext': 'mp4',
'title': u'Video by %s' % uploader_id,
return {
'id': video_id,
'url': self._og_search_video_url(webpage, secure=False),
'ext': 'mp4',
'title': 'Video by %s' % uploader_id,
'thumbnail': self._og_search_thumbnail(webpage),
'uploader_id' : uploader_id,
'uploader_id': uploader_id,
'description': desc,
}]
}

View File

@@ -14,15 +14,16 @@ from ..utils import (
class IviIE(InfoExtractor):
IE_DESC = 'ivi.ru'
IE_NAME = 'ivi'
_VALID_URL = r'^https?://(?:www\.)?ivi\.ru/watch(?:/(?P<compilationid>[^/]+))?/(?P<videoid>\d+)'
_VALID_URL = r'https?://(?:www\.)?ivi\.ru/watch(?:/(?P<compilationid>[^/]+))?/(?P<videoid>\d+)'
_TESTS = [
# Single movie
{
'url': 'http://www.ivi.ru/watch/53141',
'file': '53141.mp4',
'md5': '6ff5be2254e796ed346251d117196cf4',
'info_dict': {
'id': '53141',
'ext': 'mp4',
'title': 'Иван Васильевич меняет профессию',
'description': 'md5:b924063ea1677c8fe343d8a72ac2195f',
'duration': 5498,
@@ -33,9 +34,10 @@ class IviIE(InfoExtractor):
# Serial's serie
{
'url': 'http://www.ivi.ru/watch/dezhurnyi_angel/74791',
'file': '74791.mp4',
'md5': '3e6cc9a848c1d2ebcc6476444967baa9',
'info_dict': {
'id': '74791',
'ext': 'mp4',
'title': 'Дежурный ангел - 1 серия',
'duration': 2490,
'thumbnail': 'http://thumbs.ivi.ru/f7.vcp.digitalaccess.ru/contents/8/e/bc2f6c2b6e5d291152fdd32c059141.jpg',
@@ -124,7 +126,7 @@ class IviIE(InfoExtractor):
class IviCompilationIE(InfoExtractor):
IE_DESC = 'ivi.ru compilations'
IE_NAME = 'ivi:compilation'
_VALID_URL = r'^https?://(?:www\.)?ivi\.ru/watch/(?!\d+)(?P<compilationid>[a-z\d_-]+)(?:/season(?P<seasonid>\d+))?$'
_VALID_URL = r'https?://(?:www\.)?ivi\.ru/watch/(?!\d+)(?P<compilationid>[a-z\d_-]+)(?:/season(?P<seasonid>\d+))?$'
def _extract_entries(self, html, compilation_id):
return [self.url_result('http://www.ivi.ru/watch/%s/%s' % (compilation_id, serie), 'Ivi')

View File

@@ -0,0 +1,48 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from .youtube import YoutubeIE
class JadoreCettePubIE(InfoExtractor):
_VALID_URL = r'http://(?:www\.)?jadorecettepub\.com/[0-9]{4}/[0-9]{2}/(?P<id>.*?)\.html'
_TEST = {
'url': 'http://www.jadorecettepub.com/2010/12/star-wars-massacre-par-les-japonais.html',
'md5': '401286a06067c70b44076044b66515de',
'info_dict': {
'id': 'jLMja3tr7a4',
'ext': 'mp4',
'title': 'La pire utilisation de Star Wars',
'description': "Jadorecettepub.com vous a gratifié de plusieurs pubs géniales utilisant Star Wars et Dark Vador plus particulièrement... Mais l'heure est venue de vous proposer une version totalement massacrée, venue du Japon. Quand les Japonais détruisent l'image de Star Wars pour vendre du thon en boite, ça promet...",
},
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
display_id = mobj.group('id')
webpage = self._download_webpage(url, display_id)
title = self._html_search_regex(
r'<span style="font-size: x-large;"><b>(.*?)</b></span>',
webpage, 'title')
description = self._html_search_regex(
r'(?s)<div id="fb-root">(.*?)<script>', webpage, 'description',
fatal=False)
real_url = self._search_regex(
r'\[/postlink\](.*)endofvid', webpage, 'video URL')
video_id = YoutubeIE.extract_id(real_url)
return {
'_type': 'url_transparent',
'url': real_url,
'id': video_id,
'title': title,
'description': description,
}

View File

@@ -1,5 +1,7 @@
# coding: utf-8
from __future__ import unicode_literals
import json
import re
@@ -10,12 +12,13 @@ class JeuxVideoIE(InfoExtractor):
_VALID_URL = r'http://.*?\.jeuxvideo\.com/.*/(.*?)-\d+\.htm'
_TEST = {
u'url': u'http://www.jeuxvideo.com/reportages-videos-jeux/0004/00046170/tearaway-playstation-vita-gc-2013-tearaway-nous-presente-ses-papiers-d-identite-00115182.htm',
u'file': u'5182.mp4',
u'md5': u'046e491afb32a8aaac1f44dd4ddd54ee',
u'info_dict': {
u'title': u'GC 2013 : Tearaway nous présente ses papiers d\'identité',
u'description': u'Lorsque les développeurs de LittleBigPlanet proposent un nouveau titre, on ne peut que s\'attendre à un résultat original et fort attrayant.\n',
'url': 'http://www.jeuxvideo.com/reportages-videos-jeux/0004/00046170/tearaway-playstation-vita-gc-2013-tearaway-nous-presente-ses-papiers-d-identite-00115182.htm',
'md5': '046e491afb32a8aaac1f44dd4ddd54ee',
'info_dict': {
'id': '5182',
'ext': 'mp4',
'title': 'GC 2013 : Tearaway nous présente ses papiers d\'identité',
'description': 'Lorsque les développeurs de LittleBigPlanet proposent un nouveau titre, on ne peut que s\'attendre à un résultat original et fort attrayant.\n',
},
}
@@ -25,14 +28,14 @@ class JeuxVideoIE(InfoExtractor):
webpage = self._download_webpage(url, title)
xml_link = self._html_search_regex(
r'<param name="flashvars" value="config=(.*?)" />',
webpage, u'config URL')
webpage, 'config URL')
video_id = self._search_regex(
r'http://www\.jeuxvideo\.com/config/\w+/\d+/(.*?)/\d+_player\.xml',
xml_link, u'video ID')
xml_link, 'video ID')
config = self._download_xml(
xml_link, title, u'Downloading XML config')
xml_link, title, 'Downloading XML config')
info_json = config.find('format.json').text
info = json.loads(info_json)['versions'][0]

View File

@@ -0,0 +1,66 @@
# encoding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
class KontrTubeIE(InfoExtractor):
IE_NAME = 'kontrtube'
IE_DESC = 'KontrTube.ru - Труба зовёт'
_VALID_URL = r'http://(?:www\.)?kontrtube\.ru/videos/(?P<id>\d+)/.+'
_TEST = {
'url': 'http://www.kontrtube.ru/videos/2678/nad-olimpiyskoy-derevney-v-sochi-podnyat-rossiyskiy-flag/',
'md5': '975a991a4926c9a85f383a736a2e6b80',
'info_dict': {
'id': '2678',
'ext': 'mp4',
'title': 'Над олимпийской деревней в Сочи поднят российский флаг',
'description': 'md5:80edc4c613d5887ae8ccf1d59432be41',
'thumbnail': 'http://www.kontrtube.ru/contents/videos_screenshots/2000/2678/preview.mp4.jpg',
'duration': 270,
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id, 'Downloading page')
video_url = self._html_search_regex(r"video_url: '(.+?)/?',", webpage, 'video URL')
thumbnail = self._html_search_regex(r"preview_url: '(.+?)/?',", webpage, 'video thumbnail', fatal=False)
title = self._html_search_regex(r'<title>(.+?) - Труба зовёт - Интересный видеохостинг</title>', webpage,
'video title')
description = self._html_search_meta('description', webpage, 'video description')
mobj = re.search(r'<div class="col_2">Длительность: <span>(?P<minutes>\d+)м:(?P<seconds>\d+)с</span></div>',
webpage)
duration = int(mobj.group('minutes')) * 60 + int(mobj.group('seconds')) if mobj else None
view_count = self._html_search_regex(r'<div class="col_2">Просмотров: <span>(\d+)</span></div>', webpage,
'view count', fatal=False)
view_count = int(view_count) if view_count is not None else None
comment_count = None
comment_str = self._html_search_regex(r'Комментарии: <span>([^<]+)</span>', webpage, 'comment count',
fatal=False)
if comment_str.startswith('комментариев нет'):
comment_count = 0
else:
mobj = re.search(r'\d+ из (?P<total>\d+) комментариев', comment_str)
if mobj:
comment_count = int(mobj.group('total'))
return {
'id': video_id,
'url': video_url,
'thumbnail': thumbnail,
'title': title,
'description': description,
'duration': duration,
'view_count': view_count,
'comment_count': comment_count,
}

View File

@@ -4,19 +4,23 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import unified_strdate
from ..utils import (
int_or_none,
unified_strdate
)
class LifeNewsIE(InfoExtractor):
IE_NAME = 'lifenews'
IE_DESC = 'LIFE | NEWS'
_VALID_URL = r'http://lifenews\.ru/(?:mobile/)?news/(?P<id>\d+)'
_TEST = {
'url': 'http://lifenews.ru/news/126342',
'file': '126342.mp4',
'md5': 'e1b50a5c5fb98a6a544250f2e0db570a',
'info_dict': {
'id': '126342',
'ext': 'mp4',
'title': 'МВД разыскивает мужчин, оставивших в IKEA сумку с автоматом',
'description': 'Камеры наблюдения гипермаркета зафиксировали троих мужчин, спрятавших оружейный арсенал в камере хранения.',
'thumbnail': 'http://lifenews.ru/static/posts/2014/1/126342/.video.jpg',
@@ -32,7 +36,7 @@ class LifeNewsIE(InfoExtractor):
video_url = self._html_search_regex(
r'<video.*?src="([^"]+)".*?></video>', webpage, 'video URL')
thumbnail = self._html_search_regex(
r'<video.*?poster="([^"]+)".*?"></video>', webpage, 'video thumbnail')
@@ -44,12 +48,14 @@ class LifeNewsIE(InfoExtractor):
description = self._og_search_description(webpage)
view_count = self._html_search_regex(
r'<div class=\'views\'>(\d+)</div>', webpage, 'view count')
r'<div class=\'views\'>(\d+)</div>', webpage, 'view count', fatal=False)
comment_count = self._html_search_regex(
r'<div class=\'comments\'>(\d+)</div>', webpage, 'comment count')
r'<div class=\'comments\'>(\d+)</div>', webpage, 'comment count', fatal=False)
upload_date = self._html_search_regex(
r'<time datetime=\'([^\']+)\'>', webpage, 'upload date')
r'<time datetime=\'([^\']+)\'>', webpage, 'upload date',fatal=False)
if upload_date is not None:
upload_date = unified_strdate(upload_date)
return {
'id': video_id,
@@ -57,7 +63,7 @@ class LifeNewsIE(InfoExtractor):
'thumbnail': thumbnail,
'title': title,
'description': description,
'view_count': view_count,
'comment_count': comment_count,
'upload_date': unified_strdate(upload_date),
'view_count': int_or_none(view_count),
'comment_count': int_or_none(comment_count),
'upload_date': upload_date,
}

View File

@@ -0,0 +1,56 @@
# encoding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
class M6IE(InfoExtractor):
IE_NAME = 'm6'
_VALID_URL = r'http://(?:www\.)?m6\.fr/[^/]+/videos/(?P<id>\d+)-[^\.]+\.html'
_TEST = {
'url': 'http://www.m6.fr/emission-les_reines_du_shopping/videos/11323908-emeline_est_la_reine_du_shopping_sur_le_theme_ma_fete_d_8217_anniversaire.html',
'md5': '242994a87de2c316891428e0176bcb77',
'info_dict': {
'id': '11323908',
'ext': 'mp4',
'title': 'Emeline est la Reine du Shopping sur le thème « Ma fête danniversaire ! »',
'description': 'md5:1212ae8fb4b7baa4dc3886c5676007c2',
'duration': 100,
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
rss = self._download_xml('http://ws.m6.fr/v1/video/info/m6/bonus/%s' % video_id, video_id,
'Downloading video RSS')
title = rss.find('./channel/item/title').text
description = rss.find('./channel/item/description').text
thumbnail = rss.find('./channel/item/visuel_clip_big').text
duration = int(rss.find('./channel/item/duration').text)
view_count = int(rss.find('./channel/item/nombre_vues').text)
formats = []
for format_id in ['lq', 'sd', 'hq', 'hd']:
video_url = rss.find('./channel/item/url_video_%s' % format_id)
if video_url is None:
continue
formats.append({
'url': video_url.text,
'format_id': format_id,
})
return {
'id': video_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
'duration': duration,
'view_count': view_count,
'formats': formats,
}

View File

@@ -61,7 +61,7 @@ class MooshareIE(InfoExtractor):
}
request = compat_urllib_request.Request(
'http://mooshare.biz/8dqtk4bjbp8g', compat_urllib_parse.urlencode(download_form))
'http://mooshare.biz/%s' % video_id, compat_urllib_parse.urlencode(download_form))
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
self.to_screen('%s: Waiting for timeout' % video_id)
@@ -111,4 +111,4 @@ class MooshareIE(InfoExtractor):
'thumbnail': thumbnail,
'duration': duration,
'formats': formats,
}
}

View File

@@ -86,6 +86,9 @@ class MTVServicesInfoExtractor(InfoExtractor):
title_el = itemdoc.find('.//{http://search.yahoo.com/mrss/}title')
if title_el is None:
title_el = itemdoc.find('.//title')
if title_el.text is None:
title_el = None
title = title_el.text
if title is None:
raise ExtractorError('Could not find video title')

View File

@@ -0,0 +1,89 @@
# encoding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import ExtractorError
class NDRIE(InfoExtractor):
IE_NAME = 'ndr'
IE_DESC = 'NDR.de - Mediathek'
_VALID_URL = r'https?://www\.ndr\.de/.+?(?P<id>\d+)\.html'
_TESTS = [
{
'url': 'http://www.ndr.de/fernsehen/sendungen/markt/markt7959.html',
'md5': 'e7a6079ca39d3568f4996cb858dd6708',
'note': 'Video file',
'info_dict': {
'id': '7959',
'ext': 'mp4',
'title': 'Markt - die ganze Sendung',
'description': 'md5:af9179cf07f67c5c12dc6d9997e05725',
'duration': 2655,
},
},
{
'url': 'http://www.ndr.de/info/audio51535.html',
'md5': 'bb3cd38e24fbcc866d13b50ca59307b8',
'note': 'Audio file',
'info_dict': {
'id': '51535',
'ext': 'mp3',
'title': 'La Valette entgeht der Hinrichtung',
'description': 'md5:22f9541913a40fe50091d5cdd7c9f536',
'duration': 884,
}
}
]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
page = self._download_webpage(url, video_id, 'Downloading page')
title = self._og_search_title(page)
description = self._og_search_description(page)
mobj = re.search(
r'<div class="duration"><span class="min">(?P<minutes>\d+)</span>:<span class="sec">(?P<seconds>\d+)</span></div>',
page)
duration = int(mobj.group('minutes')) * 60 + int(mobj.group('seconds')) if mobj else None
formats = []
mp3_url = re.search(r'''{src:'(?P<audio>[^']+)', type:"audio/mp3"},''', page)
if mp3_url:
formats.append({
'url': mp3_url.group('audio'),
'format_id': 'mp3',
})
thumbnail = None
video_url = re.search(r'''3: {src:'(?P<video>.+?)\.hi\.mp4', type:"video/mp4"},''', page)
if video_url:
thumbnail = self._html_search_regex(r'(?m)title: "NDR PLAYER",\s*poster: "([^"]+)",',
page, 'thumbnail', fatal=False)
if thumbnail:
thumbnail = 'http://www.ndr.de' + thumbnail
for format_id in ['lo', 'hi', 'hq']:
formats.append({
'url': '%s.%s.mp4' % (video_url.group('video'), format_id),
'format_id': format_id,
})
if not formats:
raise ExtractorError('No media links available for %s' % video_id)
return {
'id': video_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
'duration': duration,
'formats': formats,
}

View File

@@ -0,0 +1,94 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
compat_urllib_request,
compat_urllib_parse,
)
class NFBIE(InfoExtractor):
IE_NAME = 'nfb'
IE_DESC = 'National Film Board of Canada'
_VALID_URL = r'https?://(?:www\.)?(nfb|onf)\.ca/film/(?P<id>[\da-z_-]+)'
_TEST = {
'url': 'https://www.nfb.ca/film/qallunaat_why_white_people_are_funny',
'info_dict': {
'id': 'qallunaat_why_white_people_are_funny',
'ext': 'mp4',
'title': 'Qallunaat! Why White People Are Funny ',
'description': 'md5:836d8aff55e087d04d9f6df554d4e038',
'duration': 3128,
'uploader': 'Mark Sandiford',
'uploader_id': 'mark-sandiford',
},
'params': {
# rtmp download
'skip_download': True,
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
page = self._download_webpage('https://www.nfb.ca/film/%s' % video_id, video_id, 'Downloading film page')
uploader_id = self._html_search_regex(r'<a class="director-link" href="/explore-all-directors/([^/]+)/"',
page, 'director id', fatal=False)
uploader = self._html_search_regex(r'<em class="director-name" itemprop="name">([^<]+)</em>',
page, 'director name', fatal=False)
request = compat_urllib_request.Request('https://www.nfb.ca/film/%s/player_config' % video_id,
compat_urllib_parse.urlencode({'getConfig': 'true'}).encode('ascii'))
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
request.add_header('X-NFB-Referer', 'http://www.nfb.ca/medias/flash/NFBVideoPlayer.swf')
config = self._download_xml(request, video_id, 'Downloading player config XML')
title = None
description = None
thumbnail = None
duration = None
formats = []
def extract_thumbnail(media):
thumbnails = {}
for asset in media.findall('assets/asset'):
thumbnails[asset.get('quality')] = asset.find('default/url').text
if not thumbnails:
return None
if 'high' in thumbnails:
return thumbnails['high']
return list(thumbnails.values())[0]
for media in config.findall('./player/stream/media'):
if media.get('type') == 'posterImage':
thumbnail = extract_thumbnail(media)
elif media.get('type') == 'video':
duration = int(media.get('duration'))
title = media.find('title').text
description = media.find('description').text
# It seems assets always go from lower to better quality, so no need to sort
formats = [{
'url': x.find('default/streamerURI').text,
'app': x.find('default/streamerURI').text.split('/', 3)[3],
'play_path': x.find('default/url').text,
'rtmp_live': False,
'ext': 'mp4',
'format_id': x.get('quality'),
} for x in media.findall('assets/asset')]
return {
'id': video_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
'duration': duration,
'uploader': uploader,
'uploader_id': uploader_id,
'formats': formats,
}

View File

@@ -1,7 +1,6 @@
from __future__ import unicode_literals
import re
import json
from .common import InfoExtractor
@@ -10,7 +9,7 @@ class PBSIE(InfoExtractor):
_VALID_URL = r'''(?x)https?://
(?:
# Direct video URL
video\.pbs\.org/video/(?P<id>[0-9]+)/? |
video\.pbs\.org/(?:viralplayer|video)/(?P<id>[0-9]+)/? |
# Article with embedded player
(?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P<presumptive_id>[^/]+)/?(?:$|[?\#]) |
# Player

View File

@@ -1,3 +1,5 @@
from __future__ import unicode_literals
import re
import json
@@ -12,11 +14,12 @@ class SlideshareIE(InfoExtractor):
_VALID_URL = r'https?://www\.slideshare\.net/[^/]+?/(?P<title>.+?)($|\?)'
_TEST = {
u'url': u'http://www.slideshare.net/Dataversity/keynote-presentation-managing-scale-and-complexity',
u'file': u'25665706.mp4',
u'info_dict': {
u'title': u'Managing Scale and Complexity',
u'description': u'This was a keynote presentation at the NoSQL Now! 2013 Conference & Expo (http://www.nosqlnow.com). This presentation was given by Adrian Cockcroft from Netflix',
'url': 'http://www.slideshare.net/Dataversity/keynote-presentation-managing-scale-and-complexity',
'info_dict': {
'id': '25665706',
'ext': 'mp4',
'title': 'Managing Scale and Complexity',
'description': 'This was a keynote presentation at the NoSQL Now! 2013 Conference & Expo (http://www.nosqlnow.com). This presentation was given by Adrian Cockcroft from Netflix.',
},
}
@@ -26,15 +29,17 @@ class SlideshareIE(InfoExtractor):
webpage = self._download_webpage(url, page_title)
slideshare_obj = self._search_regex(
r'var slideshare_object = ({.*?}); var user_info =',
webpage, u'slideshare object')
webpage, 'slideshare object')
info = json.loads(slideshare_obj)
if info['slideshow']['type'] != u'video':
raise ExtractorError(u'Webpage type is "%s": only video extraction is supported for Slideshare' % info['slideshow']['type'], expected=True)
if info['slideshow']['type'] != 'video':
raise ExtractorError('Webpage type is "%s": only video extraction is supported for Slideshare' % info['slideshow']['type'], expected=True)
doc = info['doc']
bucket = info['jsplayer']['video_bucket']
ext = info['jsplayer']['video_extension']
video_url = compat_urlparse.urljoin(bucket, doc + '-SD.' + ext)
description = self._html_search_regex(
r'<p class="description.*?"[^>]*>(.*?)</p>', webpage, 'description')
return {
'_type': 'video',
@@ -43,5 +48,5 @@ class SlideshareIE(InfoExtractor):
'ext': ext,
'url': video_url,
'thumbnail': info['slideshow']['pin_image_url'],
'description': self._og_search_description(webpage),
'description': description,
}

View File

@@ -20,6 +20,7 @@ class SmotriIE(InfoExtractor):
IE_DESC = 'Smotri.com'
IE_NAME = 'smotri'
_VALID_URL = r'^https?://(?:www\.)?(?P<url>smotri\.com/video/view/\?id=(?P<videoid>v(?P<realvideoid>[0-9]+)[a-z0-9]{4}))'
_NETRC_MACHINE = 'smotri'
_TESTS = [
# real video id 2610366

View File

@@ -17,6 +17,7 @@ class SohuIE(InfoExtractor):
u'info_dict': {
u'title': u'MVFar East Movement《The Illest》',
},
u'skip': u'Only available from China',
}
def _real_extract(self, url):

View File

@@ -0,0 +1,67 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import re
import json
from .common import InfoExtractor
from ..utils import int_or_none
class StreamCZIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?stream\.cz/.+/(?P<videoid>.+)'
_TEST = {
'url': 'http://www.stream.cz/peklonataliri/765767-ecka-pro-deti',
'md5': '6d3ca61a8d0633c9c542b92fcb936b0c',
'info_dict': {
'id': '765767',
'ext': 'mp4',
'title': 'Peklo na talíři: Éčka pro děti',
'description': 'md5:49ace0df986e95e331d0fe239d421519',
'thumbnail': 'http://im.stream.cz/episode/52961d7e19d423f8f06f0100',
'duration': 256,
},
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('videoid')
webpage = self._download_webpage(url, video_id)
data = self._html_search_regex(r'Stream\.Data\.Episode\((.+?)\);', webpage, 'stream data')
jsonData = json.loads(data)
formats = []
for video in jsonData['instances']:
for video_format in video['instances']:
format_id = video_format['quality']
if format_id == '240p':
quality = 0
elif format_id == '360p':
quality = 1
elif format_id == '480p':
quality = 2
elif format_id == '720p':
quality = 3
formats.append({
'format_id': '%s-%s' % (video_format['type'].split('/')[1], format_id),
'url': video_format['source'],
'quality': quality,
})
self._sort_formats(formats)
return {
'id': str(jsonData['id']),
'title': self._og_search_title(webpage),
'thumbnail': jsonData['episode_image_original_url'].replace('//', 'http://'),
'formats': formats,
'description': self._og_search_description(webpage),
'duration': int_or_none(jsonData['duration']),
'view_count': int_or_none(jsonData['stats_total']),
}

View File

@@ -0,0 +1,27 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
class SyfyIE(InfoExtractor):
_VALID_URL = r'https?://www\.syfy\.com/videos/.+?vid:(?P<id>\d+)'
_TEST = {
'url': 'http://www.syfy.com/videos/Robot%20Combat%20League/Behind%20the%20Scenes/vid:2631458',
'md5': 'e07de1d52c7278adbb9b9b1c93a66849',
'info_dict': {
'id': 'NmqMrGnXvmO1',
'ext': 'flv',
'title': 'George Lucas has Advice for his Daughter',
'description': 'Listen to what insights George Lucas give his daughter Amanda.',
},
'add_ie': ['ThePlatform'],
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
return self.url_result(self._og_search_video_url(webpage))

View File

@@ -11,7 +11,10 @@ _x = lambda p: xpath_with_ns(p, {'smil': 'http://www.w3.org/2005/SMIL21/Language
class ThePlatformIE(InfoExtractor):
_VALID_URL = r'(?:https?://link\.theplatform\.com/s/[^/]+/|theplatform:)(?P<id>[^/\?]+)'
_VALID_URL = r'''(?x)
(?:https?://(?:link|player)\.theplatform\.com/[sp]/[^/]+/
(?P<config>[^/\?]+/(?:swf|config)/select/)?
|theplatform:)(?P<id>[^/\?&]+)'''
_TEST = {
# from http://www.metacafe.com/watch/cb-e9I_cZgTgIPd/blackberrys_big_bold_z30/
@@ -29,9 +32,7 @@ class ThePlatformIE(InfoExtractor):
},
}
def _get_info(self, video_id):
smil_url = ('http://link.theplatform.com/s/dJ5BDC/{0}/meta.smil?'
'format=smil&mbr=true'.format(video_id))
def _get_info(self, video_id, smil_url):
meta = self._download_xml(smil_url, video_id)
try:
@@ -50,26 +51,34 @@ class ThePlatformIE(InfoExtractor):
head = meta.find(_x('smil:head'))
body = meta.find(_x('smil:body'))
base_url = head.find(_x('smil:meta')).attrib['base']
switch = body.find(_x('smil:switch'))
formats = []
for f in switch.findall(_x('smil:video')):
attr = f.attrib
width = int(attr['width'])
height = int(attr['height'])
vbr = int(attr['system-bitrate']) // 1000
format_id = '%dx%d_%dk' % (width, height, vbr)
formats.append({
'format_id': format_id,
'url': base_url,
'play_path': 'mp4:' + attr['src'],
'ext': 'flv',
'width': width,
'height': height,
'vbr': vbr,
})
self._sort_formats(formats)
f4m_node = body.find(_x('smil:seq/smil:video'))
if f4m_node is not None:
formats = [{
'ext': 'flv',
# the parameters are from syfy.com, other sites may use others
'url': f4m_node.attrib['src'] + '?g=UXWGVKRWHFSP&hdcore=3.0.3',
}]
else:
base_url = head.find(_x('smil:meta')).attrib['base']
switch = body.find(_x('smil:switch'))
formats = []
for f in switch.findall(_x('smil:video')):
attr = f.attrib
width = int(attr['width'])
height = int(attr['height'])
vbr = int(attr['system-bitrate']) // 1000
format_id = '%dx%d_%dk' % (width, height, vbr)
formats.append({
'format_id': format_id,
'url': base_url,
'play_path': 'mp4:' + attr['src'],
'ext': 'flv',
'width': width,
'height': height,
'vbr': vbr,
})
self._sort_formats(formats)
return {
'id': video_id,
@@ -83,4 +92,13 @@ class ThePlatformIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
return self._get_info(video_id)
if mobj.group('config'):
config_url = url+ '&form=json'
config_url = config_url.replace('swf/', 'config/')
config_json = self._download_webpage(config_url, video_id, u'Downloading config')
config = json.loads(config_json)
smil_url = config['releaseUrl'] + '&format=SMIL&formats=MPEG4'
else:
smil_url = ('http://link.theplatform.com/s/dJ5BDC/{0}/meta.smil?'
'format=smil&mbr=true'.format(video_id))
return self._get_info(video_id, smil_url)

View File

@@ -1,22 +1,23 @@
#coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
determine_ext,
)
from ..utils import determine_ext
class ThisAVIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?thisav\.com/video/(?P<id>[0-9]+)/.*'
_TEST = {
u"url": u"http://www.thisav.com/video/47734/%98%26sup1%3B%83%9E%83%82---just-fit.html",
u"file": u"47734.flv",
u"md5": u"0480f1ef3932d901f0e0e719f188f19b",
u"info_dict": {
u"title": u"高樹マリア - Just fit",
u"uploader": u"dj7970",
u"uploader_id": u"dj7970"
'url': 'http://www.thisav.com/video/47734/%98%26sup1%3B%83%9E%83%82---just-fit.html',
'md5': '0480f1ef3932d901f0e0e719f188f19b',
'info_dict': {
'id': '47734',
'ext': 'flv',
'title': '高樹マリア - Just fit',
'uploader': 'dj7970',
'uploader_id': 'dj7970'
}
}
@@ -25,19 +26,18 @@ class ThisAVIE(InfoExtractor):
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
title = self._html_search_regex(r'<h1>([^<]*)</h1>', webpage, u'title')
title = self._html_search_regex(r'<h1>([^<]*)</h1>', webpage, 'title')
video_url = self._html_search_regex(
r"addVariable\('file','([^']+)'\);", webpage, u'video url')
r"addVariable\('file','([^']+)'\);", webpage, 'video url')
uploader = self._html_search_regex(
r': <a href="http://www.thisav.com/user/[0-9]+/(?:[^"]+)">([^<]+)</a>',
webpage, u'uploader name', fatal=False)
webpage, 'uploader name', fatal=False)
uploader_id = self._html_search_regex(
r': <a href="http://www.thisav.com/user/[0-9]+/([^"]+)">(?:[^<]+)</a>',
webpage, u'uploader id', fatal=False)
webpage, 'uploader id', fatal=False)
ext = determine_ext(video_url)
return {
'_type': 'video',
'id': video_id,
'url': video_url,
'uploader': uploader,

View File

@@ -1,4 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
@@ -9,25 +11,25 @@ from ..utils import (
class TouTvIE(InfoExtractor):
IE_NAME = u'tou.tv'
IE_NAME = 'tou.tv'
_VALID_URL = r'https?://www\.tou\.tv/(?P<id>[a-zA-Z0-9_-]+(?:/(?P<episode>S[0-9]+E[0-9]+)))'
_TEST = {
u'url': u'http://www.tou.tv/30-vies/S04E41',
u'file': u'30-vies_S04E41.mp4',
u'info_dict': {
u'title': u'30 vies Saison 4 / Épisode 41',
u'description': u'md5:da363002db82ccbe4dafeb9cab039b09',
u'age_limit': 8,
u'uploader': u'Groupe des Nouveaux Médias',
u'duration': 1296,
u'upload_date': u'20131118',
u'thumbnail': u'http://static.tou.tv/medias/images/2013-11-18_19_00_00_30VIES_0341_01_L.jpeg',
'url': 'http://www.tou.tv/30-vies/S04E41',
'file': '30-vies_S04E41.mp4',
'info_dict': {
'title': '30 vies Saison 4 / Épisode 41',
'description': 'md5:da363002db82ccbe4dafeb9cab039b09',
'age_limit': 8,
'uploader': 'Groupe des Nouveaux Médias',
'duration': 1296,
'upload_date': '20131118',
'thumbnail': 'http://static.tou.tv/medias/images/2013-11-18_19_00_00_30VIES_0341_01_L.jpeg',
},
u'params': {
u'skip_download': True, # Requires rtmpdump
'params': {
'skip_download': True, # Requires rtmpdump
},
u'skip': 'Only available in Canada'
'skip': 'Only available in Canada'
}
def _real_extract(self, url):
@@ -36,25 +38,25 @@ class TouTvIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
mediaId = self._search_regex(
r'"idMedia":\s*"([^"]+)"', webpage, u'media ID')
r'"idMedia":\s*"([^"]+)"', webpage, 'media ID')
streams_url = u'http://release.theplatform.com/content.select?pid=' + mediaId
streams_url = 'http://release.theplatform.com/content.select?pid=' + mediaId
streams_doc = self._download_xml(
streams_url, video_id, note=u'Downloading stream list')
streams_url, video_id, note='Downloading stream list')
video_url = next(n.text
for n in streams_doc.findall('.//choice/url')
if u'//ad.doubleclick' not in n.text)
if '//ad.doubleclick' not in n.text)
if video_url.endswith('/Unavailable.flv'):
raise ExtractorError(
u'Access to this video is blocked from outside of Canada',
'Access to this video is blocked from outside of Canada',
expected=True)
duration_str = self._html_search_meta(
'video:duration', webpage, u'duration')
'video:duration', webpage, 'duration')
duration = int(duration_str) if duration_str else None
upload_date_str = self._html_search_meta(
'video:release_date', webpage, u'upload date')
'video:release_date', webpage, 'upload date')
upload_date = unified_strdate(upload_date_str) if upload_date_str else None
return {

View File

@@ -11,7 +11,7 @@ from ..aes import (
)
class Tube8IE(InfoExtractor):
_VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>tube8\.com/[^/]+/[^/]+/(?P<videoid>[0-9]+)/?)'
_VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>tube8\.com/.+?/(?P<videoid>\d+)/?)$'
_TEST = {
u'url': u'http://www.tube8.com/teen/kasia-music-video/229795/',
u'file': u'229795.mp4',

View File

@@ -0,0 +1,170 @@
# encoding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
int_or_none
)
class VestiIE(InfoExtractor):
IE_NAME = 'vesti'
IE_DESC = 'Вести.Ru'
_VALID_URL = r'http://(?:.+?\.)?vesti\.ru/(?P<id>.+)'
_TESTS = [
{
'url': 'http://www.vesti.ru/videos?vid=575582&cid=1',
'info_dict': {
'id': '765035',
'ext': 'mp4',
'title': 'Вести.net: биткоины в России не являются законными',
'description': 'md5:d4bb3859dc1177b28a94c5014c35a36b',
'duration': 302,
},
'params': {
# m3u8 download
'skip_download': True,
},
},
{
'url': 'http://www.vesti.ru/only_video.html?vid=576180',
'info_dict': {
'id': '766048',
'ext': 'mp4',
'title': 'США заморозило, Британию затопило',
'description': 'md5:f0ed0695ec05aed27c56a70a58dc4cc1',
'duration': 87,
},
'params': {
# m3u8 download
'skip_download': True,
},
},
{
'url': 'http://sochi2014.vesti.ru/video/index/video_id/766403',
'info_dict': {
'id': '766403',
'ext': 'mp4',
'title': 'XXII зимние Олимпийские игры. Российские хоккеисты стартовали на Олимпиаде с победы',
'description': 'md5:55805dfd35763a890ff50fa9e35e31b3',
'duration': 271,
},
'params': {
# m3u8 download
'skip_download': True,
},
'skip': 'Blocked outside Russia'
},
{
'url': 'http://sochi2014.vesti.ru/live/play/live_id/301',
'info_dict': {
'id': '51499',
'ext': 'flv',
'title': 'Сочи-2014. Биатлон. Индивидуальная гонка. Мужчины ',
'description': 'md5:9e0ed5c9d2fa1efbfdfed90c9a6d179c',
},
'params': {
# rtmp download
'skip_download': True,
},
'skip': 'Translation has finished'
}
]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
page = self._download_webpage(url, video_id, 'Downloading page')
mobj = re.search(r'<meta property="og:video" content=".+?\.swf\?v?id=(?P<id>\d+).*?" />', page)
if mobj:
video_type = 'video'
video_id = mobj.group('id')
else:
mobj = re.search(
r'<iframe.+?src="http://player\.rutv\.ru/iframe/(?P<type>[^/]+)/id/(?P<id>\d+)[^"]*".*?></iframe>', page)
if not mobj:
raise ExtractorError('No media found')
video_type = mobj.group('type')
video_id = mobj.group('id')
json_data = self._download_json(
'http://player.rutv.ru/iframe/%splay/id/%s' % ('live-' if video_type == 'live' else '', video_id),
video_id, 'Downloading JSON')
if json_data['errors']:
raise ExtractorError('vesti returned error: %s' % json_data['errors'], expected=True)
playlist = json_data['data']['playlist']
medialist = playlist['medialist']
media = medialist[0]
if media['errors']:
raise ExtractorError('vesti returned error: %s' % media['errors'], expected=True)
view_count = playlist.get('count_views')
priority_transport = playlist['priority_transport']
thumbnail = media['picture']
width = media['width']
height = media['height']
description = media['anons']
title = media['title']
duration = int_or_none(media.get('duration'))
formats = []
for transport, links in media['sources'].items():
for quality, url in links.items():
if transport == 'rtmp':
mobj = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>.+))/(?P<playpath>.+)$', url)
if not mobj:
continue
fmt = {
'url': mobj.group('url'),
'play_path': mobj.group('playpath'),
'app': mobj.group('app'),
'page_url': 'http://player.rutv.ru',
'player_url': 'http://player.rutv.ru/flash2v/osmf.swf?i=22',
'rtmp_live': True,
'ext': 'flv',
'vbr': int(quality),
}
elif transport == 'm3u8':
fmt = {
'url': url,
'ext': 'mp4',
}
else:
fmt = {
'url': url
}
fmt.update({
'width': width,
'height': height,
'format_id': '%s-%s' % (transport, quality),
'preference': -1 if priority_transport == transport else -2,
})
formats.append(fmt)
if not formats:
raise ExtractorError('No media links available for %s' % video_id)
self._sort_formats(formats)
return {
'id': video_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
'view_count': view_count,
'duration': duration,
'formats': formats,
}

View File

@@ -6,6 +6,9 @@ import json
from .common import InfoExtractor
from ..utils import (
ExtractorError,
compat_urllib_request,
compat_urllib_parse,
compat_str,
unescapeHTML,
)
@@ -14,31 +17,80 @@ from ..utils import (
class VKIE(InfoExtractor):
IE_NAME = 'vk.com'
_VALID_URL = r'https?://vk\.com/(?:videos.*?\?.*?z=)?video(?P<id>.*?)(?:\?|%2F|$)'
_NETRC_MACHINE = 'vk'
_TESTS = [{
'url': 'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521',
'file': '162222515.flv',
'md5': '0deae91935c54e00003c2a00646315f0',
'info_dict': {
'title': 'ProtivoGunz - Хуёвая песня',
'uploader': 'Noize MC',
_TESTS = [
{
'url': 'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521',
'md5': '0deae91935c54e00003c2a00646315f0',
'info_dict': {
'id': '162222515',
'ext': 'flv',
'title': 'ProtivoGunz - Хуёвая песня',
'uploader': 'Noize MC',
'duration': 195,
},
},
},
{
'url': 'http://vk.com/video4643923_163339118',
'file': '163339118.mp4',
'md5': 'f79bccb5cd182b1f43502ca5685b2b36',
'info_dict': {
'uploader': 'Elvira Dzhonik',
'title': 'Dream Theater - Hollow Years Live at Budokan 720*',
{
'url': 'http://vk.com/video4643923_163339118',
'md5': 'f79bccb5cd182b1f43502ca5685b2b36',
'info_dict': {
'id': '163339118',
'ext': 'mp4',
'uploader': 'Elvira Dzhonik',
'title': 'Dream Theater - Hollow Years Live at Budokan 720*',
'duration': 558,
}
},
{
'url': 'http://vk.com/video-8871596_164049491',
'md5': 'a590bcaf3d543576c9bd162812387666',
'note': 'Only available for registered users',
'info_dict': {
'id': '164049491',
'ext': 'mp4',
'uploader': 'Триллеры',
'title': '► Бойцовский клуб / Fight Club 1999 [HD 720]\u00a0',
'duration': 8352,
},
'skip': 'Requires vk account credentials',
}
}]
]
def _login(self):
(username, password) = self._get_login_info()
if username is None:
return
login_form = {
'act': 'login',
'role': 'al_frame',
'expire': '1',
'email': username,
'pass': password,
}
request = compat_urllib_request.Request('https://login.vk.com/?act=login',
compat_urllib_parse.urlencode(login_form).encode('utf-8'))
login_page = self._download_webpage(request, None, note='Logging in as %s' % username)
if re.search(r'onLoginFailed', login_page):
raise ExtractorError('Unable to login, incorrect username and/or password', expected=True)
def _real_initialize(self):
self._login()
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
info_url = 'http://vk.com/al_video.php?act=show&al=1&video=%s' % video_id
info_page = self._download_webpage(info_url, video_id)
if re.search(r'<!>Please log in or <', info_page):
raise ExtractorError('This video is only available for registered users, '
'use --username and --password options to provide account credentials.', expected=True)
m_yt = re.search(r'src="(http://www.youtube.com/.*?)"', info_page)
if m_yt is not None:
self.to_screen(u'Youtube video detected')
@@ -60,4 +112,5 @@ class VKIE(InfoExtractor):
'title': unescapeHTML(data['md_title']),
'thumbnail': data.get('jpg'),
'uploader': data.get('md_author'),
'duration': data.get('duration')
}

View File

@@ -1,3 +1,5 @@
from __future__ import unicode_literals
import os
import re
@@ -10,14 +12,14 @@ from ..utils import (
class XTubeIE(InfoExtractor):
_VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>xtube\.com/watch\.php\?v=(?P<videoid>[^/?&]+))'
_TEST = {
u'url': u'http://www.xtube.com/watch.php?v=kVTUy_G222_',
u'file': u'kVTUy_G222_.mp4',
u'md5': u'092fbdd3cbe292c920ef6fc6a8a9cdab',
u'info_dict': {
u"title": u"strange erotica",
u"description": u"surreal gay themed erotica...almost an ET kind of thing",
u"uploader": u"greenshowers",
u"age_limit": 18,
'url': 'http://www.xtube.com/watch.php?v=kVTUy_G222_',
'file': 'kVTUy_G222_.mp4',
'md5': '092fbdd3cbe292c920ef6fc6a8a9cdab',
'info_dict': {
"title": "strange erotica",
"description": "surreal gay themed erotica...almost an ET kind of thing",
"uploader": "greenshowers",
"age_limit": 18,
}
}
@@ -30,10 +32,10 @@ class XTubeIE(InfoExtractor):
req.add_header('Cookie', 'age_verified=1')
webpage = self._download_webpage(req, video_id)
video_title = self._html_search_regex(r'<div class="p_5px[^>]*>([^<]+)', webpage, u'title')
video_uploader = self._html_search_regex(r'so_s\.addVariable\("owner_u", "([^"]+)', webpage, u'uploader', fatal=False)
video_description = self._html_search_regex(r'<p class="video_description">([^<]+)', webpage, u'description', fatal=False)
video_url= self._html_search_regex(r'var videoMp4 = "([^"]+)', webpage, u'video_url').replace('\\/', '/')
video_title = self._html_search_regex(r'<div class="p_5px[^>]*>([^<]+)', webpage, 'title')
video_uploader = self._html_search_regex(r'so_s\.addVariable\("owner_u", "([^"]+)', webpage, 'uploader', fatal=False)
video_description = self._html_search_regex(r'<p class="video_description">([^<]+)', webpage, 'description', fatal=False)
video_url= self._html_search_regex(r'var videoMp4 = "([^"]+)', webpage, 'video_url').replace('\\/', '/')
path = compat_urllib_parse_urlparse(video_url).path
extension = os.path.splitext(path)[1][1:]
format = path.split('/')[5].split('_')[:2]

View File

@@ -1,3 +1,5 @@
from __future__ import unicode_literals
import itertools
import json
import re
@@ -12,25 +14,25 @@ from ..utils import (
class YahooIE(InfoExtractor):
IE_DESC = u'Yahoo screen'
IE_DESC = 'Yahoo screen'
_VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
_TESTS = [
{
u'url': u'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
u'file': u'214727115.mp4',
u'md5': u'4962b075c08be8690a922ee026d05e69',
u'info_dict': {
u'title': u'Julian Smith & Travis Legg Watch Julian Smith',
u'description': u'Julian and Travis watch Julian Smith',
'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
'file': '214727115.mp4',
'md5': '4962b075c08be8690a922ee026d05e69',
'info_dict': {
'title': 'Julian Smith & Travis Legg Watch Julian Smith',
'description': 'Julian and Travis watch Julian Smith',
},
},
{
u'url': u'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html',
u'file': u'103000935.mp4',
u'md5': u'd6e6fc6e1313c608f316ddad7b82b306',
u'info_dict': {
u'title': u'Codefellas - The Cougar Lies with Spanish Moss',
u'description': u'Agent Topple\'s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? And if your waffles have nothing to hide, what are they so worried about?',
'url': 'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html',
'file': '103000935.mp4',
'md5': 'd6e6fc6e1313c608f316ddad7b82b306',
'info_dict': {
'title': 'Codefellas - The Cougar Lies with Spanish Moss',
'description': 'Agent Topple\'s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? And if your waffles have nothing to hide, what are they so worried about?',
},
},
]
@@ -41,7 +43,7 @@ class YahooIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
items_json = self._search_regex(r'mediaItems: ({.*?})$',
webpage, u'items', flags=re.MULTILINE)
webpage, 'items', flags=re.MULTILINE)
items = json.loads(items_json)
info = items['mediaItems']['query']['results']['mediaObj'][0]
# The 'meta' field is not always in the video webpage, we request it
@@ -60,7 +62,7 @@ class YahooIE(InfoExtractor):
})
query_result_json = self._download_webpage(
'http://video.query.yahoo.com/v1/public/yql?' + data,
video_id, u'Downloading video info')
video_id, 'Downloading video info')
query_result = json.loads(query_result_json)
info = query_result['query']['results']['mediaObj'][0]
meta = info['meta']
@@ -103,13 +105,13 @@ class YahooNewsIE(YahooIE):
_VALID_URL = r'http://news\.yahoo\.com/video/.*?-(?P<id>\d*?)\.html'
_TEST = {
u'url': u'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html',
u'md5': u'67010fdf3a08d290e060a4dd96baa07b',
u'info_dict': {
u'id': u'104538833',
u'ext': u'mp4',
u'title': u'China Moses Is Crazy About the Blues',
u'description': u'md5:9900ab8cd5808175c7b3fe55b979bed0',
'url': 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html',
'md5': '67010fdf3a08d290e060a4dd96baa07b',
'info_dict': {
'id': '104538833',
'ext': 'mp4',
'title': 'China Moses Is Crazy About the Blues',
'description': 'md5:9900ab8cd5808175c7b3fe55b979bed0',
},
}
@@ -120,14 +122,14 @@ class YahooNewsIE(YahooIE):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
long_id = self._search_regex(r'contentId: \'(.+?)\',', webpage, u'long id')
long_id = self._search_regex(r'contentId: \'(.+?)\',', webpage, 'long id')
return self._get_info(long_id, video_id)
class YahooSearchIE(SearchInfoExtractor):
IE_DESC = u'Yahoo screen search'
IE_DESC = 'Yahoo screen search'
_MAX_RESULTS = 1000
IE_NAME = u'screen.yahoo:search'
IE_NAME = 'screen.yahoo:search'
_SEARCH_KEY = 'yvsearch'
def _get_n_results(self, query, n):
@@ -139,12 +141,12 @@ class YahooSearchIE(SearchInfoExtractor):
'entries': []
}
for pagenum in itertools.count(0):
result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
result_url = 'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
webpage = self._download_webpage(result_url, query,
note='Downloading results page '+str(pagenum+1))
info = json.loads(webpage)
m = info[u'm']
results = info[u'results']
m = info['m']
results = info['results']
for (i, r) in enumerate(results):
if (pagenum * 30) +i >= n:
@@ -152,7 +154,7 @@ class YahooSearchIE(SearchInfoExtractor):
mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
res['entries'].append(e)
if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1)):
if (pagenum * 30 +i >= n) or (m['last'] >= (m['total'] -1)):
break
return res

View File

@@ -34,6 +34,7 @@ from ..utils import (
unified_strdate,
orderedSet,
write_json_file,
uppercase_escape,
)
class YoutubeBaseInfoExtractor(InfoExtractor):
@@ -136,7 +137,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
(?:https?://|//)? # http(s):// or protocol-independent URL (optional)
(?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
(?:www\.)?deturl\.com/www\.youtube\.com/|
(?:www\.)?pwnyoutube\.com|
(?:www\.)?pwnyoutube\.com/|
tube\.majestyc\.net/|
youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
(?:.*?\#/)? # handle anchor (#/) redirect urls
@@ -1085,8 +1086,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
self._downloader.report_warning(err_msg)
return {}
def _extract_id(self, url):
mobj = re.match(self._VALID_URL, url, re.VERBOSE)
@classmethod
def extract_id(cls, url):
mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
if mobj is None:
raise ExtractorError(u'Invalid URL: %s' % url)
video_id = mobj.group(2)
@@ -1115,7 +1117,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
mobj = re.search(self._NEXT_URL_RE, url)
if mobj:
url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
video_id = self._extract_id(url)
video_id = self.extract_id(url)
# Get video webpage
url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
@@ -1422,7 +1424,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
IE_DESC = u'YouTube.com playlists'
_VALID_URL = r"""(?:
_VALID_URL = r"""(?x)(?:
(?:https?://)?
(?:\w+\.)?
youtube\.com/
@@ -1431,7 +1433,11 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
\? (?:.*?&)*? (?:p|a|list)=
| p/
)
((?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,})
(
(?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
# Top tracks, they can also include dots
|(?:MC)[\w\.]*
)
.*
|
((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
@@ -1441,11 +1447,6 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
_VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
IE_NAME = u'youtube:playlist'
@classmethod
def suitable(cls, url):
"""Receives a URL and returns True if suitable for this IE."""
return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
def _real_initialize(self):
self._login()
@@ -1469,7 +1470,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
def _real_extract(self, url):
# Extract playlist id
mobj = re.match(self._VALID_URL, url, re.VERBOSE)
mobj = re.match(self._VALID_URL, url)
if mobj is None:
raise ExtractorError(u'Invalid URL: %s' % url)
playlist_id = mobj.group(1) or mobj.group(2)
@@ -1590,11 +1591,10 @@ class YoutubeChannelIE(InfoExtractor):
# Download all channel pages using the json-based channel_ajax query
for pagenum in itertools.count(1):
url = self._MORE_PAGES_URL % (pagenum, channel_id)
page = self._download_webpage(url, channel_id,
u'Downloading page #%s' % pagenum)
page = json.loads(page)
page = self._download_json(
url, channel_id, note=u'Downloading page #%s' % pagenum,
transform_source=uppercase_escape)
ids_in_page = self.extract_videos_from_page(page['content_html'])
video_ids.extend(ids_in_page)
@@ -1694,7 +1694,8 @@ class YoutubeSearchIE(SearchInfoExtractor):
api_response = data['data']
if 'items' not in api_response:
raise ExtractorError(u'[youtube] No video results')
raise ExtractorError(
u'[youtube] No video results', expected=True)
new_ids = list(video['id'] for video in api_response['items'])
video_ids += new_ids

View File

@@ -17,6 +17,7 @@ import platform
import re
import ssl
import socket
import struct
import subprocess
import sys
import traceback
@@ -751,15 +752,17 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
https_request = http_request
https_response = http_response
def unified_strdate(date_str):
"""Return a string with the date in the format YYYYMMDD"""
upload_date = None
#Replace commas
date_str = date_str.replace(',',' ')
date_str = date_str.replace(',', ' ')
# %z (UTC offset) is only supported in python>=3.2
date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
format_expressions = [
'%d %B %Y',
'%d %b %Y',
'%B %d %Y',
'%b %d %Y',
'%Y-%m-%d',
@@ -771,11 +774,12 @@ def unified_strdate(date_str):
'%Y-%m-%dT%H:%M:%S.%fZ',
'%Y-%m-%dT%H:%M:%S.%f0Z',
'%Y-%m-%dT%H:%M:%S',
'%Y-%m-%dT%H:%M',
]
for expression in format_expressions:
try:
upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
except:
except ValueError:
pass
if upload_date is None:
timetuple = email.utils.parsedate_tz(date_str)
@@ -1141,7 +1145,7 @@ def parse_duration(s):
return None
m = re.match(
r'(?:(?:(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)$', s)
r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?$', s)
if not m:
return None
res = int(m.group('secs'))
@@ -1212,3 +1216,26 @@ class PagedList(object):
if end == nextfirstid:
break
return res
def uppercase_escape(s):
return re.sub(
r'\\U([0-9a-fA-F]{8})',
lambda m: compat_chr(int(m.group(1), base=16)), s)
try:
struct.pack(u'!I', 0)
except TypeError:
# In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
def struct_pack(spec, *args):
if isinstance(spec, compat_str):
spec = spec.encode('ascii')
return struct.pack(spec, *args)
def struct_unpack(spec, *args):
if isinstance(spec, compat_str):
spec = spec.encode('ascii')
return struct.unpack(spec, *args)
else:
struct_pack = struct.pack
struct_unpack = struct.unpack

View File

@@ -1,2 +1,2 @@
__version__ = '2014.02.04.1'
__version__ = '2014.02.17'