Compare commits

..

13 Commits

Author SHA1 Message Date
Ricardo Garcia
44e16fa17f Bump version number 2010-10-31 11:26:34 +01:00
Ricardo Garcia
d983524781 Add --no-progress option (fixes issue #98) 2010-10-31 11:26:34 +01:00
Ricardo Garcia
1392f3f52c Give preference to format 34 before format 5 in quality list 2010-10-31 11:26:34 +01:00
Ricardo Garcia
43ab0ca432 Do not error out on problems printing the file name 2010-10-31 11:26:34 +01:00
Ricardo Garcia
31cbdaafd4 Properly support simple titles in the newest InfoExtractors 2010-10-31 11:26:34 +01:00
Ricardo Garcia
bd3cdf6dc4 Do not pass URLs around in Unicode form (fixes issue #92) 2010-10-31 11:26:34 +01:00
Ricardo Garcia
8cc468de75 Bump version number 2010-10-31 11:26:31 +01:00
Ricardo Garcia
31bcb48001 Tweak final filename in the open attempt, to be platform and filename-agnostic 2010-10-31 11:26:30 +01:00
Ricardo Garcia
c201ebc915 Fix SyntaxError triggered by mistake in user-agent commit 2010-10-31 11:26:30 +01:00
Ricardo Garcia
ce9c6a3097 Fix problem with sanitize_title not replacing Windows directory separator 2010-10-31 11:26:30 +01:00
Ricardo Garcia
4cfeb46544 Update user-agent string 2010-10-31 11:26:30 +01:00
Ricardo Garcia
490fd7aea7 Cherry-pick obeythepenguin's changes and merge them into main branch 2010-10-31 11:26:30 +01:00
Ricardo Garcia
c05fc6a345 Support simplest new URLs in YouTube 2010-10-31 11:26:30 +01:00
2 changed files with 202 additions and 48 deletions

View File

@@ -1 +1 @@
2010.01.19
2010.03.07

View File

@@ -27,7 +27,7 @@ except ImportError:
from cgi import parse_qs
std_headers = {
'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2',
'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
'Accept-Language': 'en-us,en;q=0.5',
@@ -51,6 +51,59 @@ def preferredencoding():
yield pref
return yield_preferredencoding().next()
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a Unicode character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity text
    without the surrounding '&' and ';'.

    Named entities are resolved through htmlentitydefs.name2codepoint.
    Numeric references are accepted in decimal ('#233') and hexadecimal
    ('#xe9' or '#XE9') form. Anything that cannot be resolved, including
    numeric references outside the range unichr() supports, is returned in
    its literal '&...;' form so no text is lost.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Numeric character reference. The pattern is anchored at the end so
    # that text such as '#1 fan' is not mistaken for a reference, and the
    # hexadecimal branch accepts the digits a-f (the previous '\d+' did not,
    # which turned '#x1f' into code point 1).
    mobj = re.match(u'#(?:[xX]([0-9a-fA-F]+)|([0-9]+))$', entity)
    if mobj is not None:
        hexstr, decstr = mobj.groups()
        try:
            if hexstr is not None:
                return unichr(int(hexstr, 16))
            return unichr(int(decstr, 10))
        except (ValueError, OverflowError):
            # Code point not representable: keep the literal text below
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
def sanitize_title(utitle):
"""Sanitizes a video title so it could be used as part of a filename."""
utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
return utitle.replace(unicode(os.sep), u'%')
def sanitize_open(filename, open_mode):
"""Try to open the given filename, and slightly tweak it if this fails.
Attempts to open the given filename. If this fails, it tries to change
the filename slightly, step by step, until it's either able to open it
or it fails and raises a final exception, like the standard open()
function.
It returns the tuple (stream, definitive_file_name).
"""
try:
stream = open(filename, open_mode)
return (stream, filename)
except (IOError, OSError), err:
# In case of error, try to remove win32 forbidden chars
filename = re.sub(ur'[<>:"\|\?\*]', u'#', filename)
# An exception here should be caught in the caller
stream = open(filename, open_mode)
return (stream, filename)
class DownloadError(Exception):
"""Download Error exception.
@@ -139,6 +192,7 @@ class FileDownloader(object):
ratelimit: Download speed limit, in bytes/sec.
nooverwrites: Prevent overwriting files.
continuedl: Try to continue downloads if possible.
noprogress: Do not print the progress bar.
"""
params = None
@@ -247,11 +301,15 @@ class FileDownloader(object):
self._pps.append(pp)
pp.set_downloader(self)
def to_stdout(self, message, skip_eol=False):
def to_stdout(self, message, skip_eol=False, ignore_encoding_errors=False):
"""Print message to stdout if not in quiet mode."""
if not self.params.get('quiet', False):
print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
try:
if not self.params.get('quiet', False):
print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
sys.stdout.flush()
except (UnicodeEncodeError), err:
if not ignore_encoding_errors:
raise
def to_stderr(self, message):
"""Print message to stderr."""
@@ -289,10 +347,12 @@ class FileDownloader(object):
def report_destination(self, filename):
"""Report destination filename."""
self.to_stdout(u'[download] Destination: %s' % filename)
self.to_stdout(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
    """Report download progress, unless the 'noprogress' param is set."""
    if not self.params.get('noprogress', False):
        fields = (percent_str, data_len_str, speed_str, eta_str)
        # The message starts with '\r' and is sent with skip_eol=True
        self.to_stdout(u'\r[download] %s of %s at %s ETA %s' % fields, skip_eol=True)
@@ -302,7 +362,10 @@ class FileDownloader(object):
def report_file_already_downloaded(self, file_name):
"""Report file has already been fully downloaded."""
self.to_stdout(u'[download] %s has already been downloaded' % file_name)
try:
self.to_stdout(u'[download] %s has already been downloaded' % file_name)
except (UnicodeEncodeError), err:
self.to_stdout(u'[download] The file has already been downloaded')
def report_unable_to_resume(self):
"""Report it was impossible to resume download."""
@@ -310,7 +373,10 @@ class FileDownloader(object):
def report_finish(self):
"""Report download finished."""
self.to_stdout(u'')
if self.params.get('noprogress', False):
self.to_stdout(u'[download] Download completed')
else:
self.to_stdout(u'')
def process_info(self, info_dict):
"""Process a single dictionary returned by an InfoExtractor."""
@@ -325,9 +391,9 @@ class FileDownloader(object):
# Forced printings
if self.params.get('forcetitle', False):
print info_dict['title'].encode(preferredencoding())
print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
if self.params.get('forceurl', False):
print info_dict['url'].encode(preferredencoding())
print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
return
@@ -485,7 +551,7 @@ class FileDownloader(object):
# Open file just in time
if stream is None:
try:
stream = open(filename, open_mode)
(stream, filename) = sanitize_open(filename, open_mode)
self.report_destination(filename)
except (OSError, IOError), err:
self.trouble('ERROR: unable to open for writing: %s' % str(err))
@@ -571,12 +637,12 @@ class InfoExtractor(object):
class YoutubeIE(InfoExtractor):
"""Information extractor for youtube.com."""
_VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
_VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?[\?#](?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
_LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
_LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
_NETRC_MACHINE = 'youtube'
_available_formats = ['37', '22', '35', '18', '5', '17', '13', None] # listed in order of priority for -b flag
_available_formats = ['37', '22', '35', '18', '34', '5', '17', '13', None] # listed in order of priority for -b flag
_video_extensions = {
'13': '3gp',
'17': 'mp4',
@@ -589,29 +655,6 @@ class YoutubeIE(InfoExtractor):
def suitable(url):
return (re.match(YoutubeIE._VALID_URL, url) is not None)
@staticmethod
def htmlentity_transform(matchobj):
"""Transforms an HTML entity to a Unicode character."""
entity = matchobj.group(1)
# Known non-numeric HTML entity
if entity in htmlentitydefs.name2codepoint:
return unichr(htmlentitydefs.name2codepoint[entity])
# Unicode character
mobj = re.match(ur'(?u)#(x?\d+)', entity)
if mobj is not None:
numstr = mobj.group(1)
if numstr.startswith(u'x'):
base = 16
numstr = u'0%s' % numstr
else:
base = 10
return unichr(long(numstr, base))
# Unknown entity in name, return its literal representation
return (u'&%s;' % entity)
def report_lang(self):
"""Report attempt to set language."""
self._downloader.to_stdout(u'[youtube] Setting language')
@@ -778,8 +821,7 @@ class YoutubeIE(InfoExtractor):
return
video_title = urllib.unquote_plus(video_info['title'][0])
video_title = video_title.decode('utf-8')
video_title = re.sub(ur'(?u)&(.+?);', self.htmlentity_transform, video_title)
video_title = video_title.replace(os.sep, u'%')
video_title = sanitize_title(video_title)
# simplified title
simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
@@ -919,6 +961,7 @@ class MetacafeIE(InfoExtractor):
self._downloader.trouble(u'ERROR: unable to extract title')
return
video_title = mobj.group(1).decode('utf-8')
video_title = sanitize_title(video_title)
mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
if mobj is None:
@@ -943,7 +986,7 @@ class MetacafeIE(InfoExtractor):
class GoogleIE(InfoExtractor):
"""Information extractor for video.google.com."""
_VALID_URL = r'(?:http://)?video\.google\.com/videoplay\?docid=([^\&]+).*'
_VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
def __init__(self, downloader=None):
InfoExtractor.__init__(self, downloader)
@@ -975,7 +1018,7 @@ class GoogleIE(InfoExtractor):
video_extension = 'mp4'
# Retrieve video webpage to extract further information
request = urllib2.Request('http://video.google.com/videoplay?docid=%s' % video_id)
request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
try:
self.report_download_webpage(video_id)
webpage = urllib2.urlopen(request).read()
@@ -985,7 +1028,10 @@ class GoogleIE(InfoExtractor):
# Extract URL, uploader, and title from webpage
self.report_extraction(video_id)
mobj = re.search(r"download_url:'(.*)'", webpage)
mobj = re.search(r"download_url:'([^']+)'", webpage)
if mobj is None:
video_extension = 'flv'
mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract media URL')
return
@@ -1000,9 +1046,11 @@ class GoogleIE(InfoExtractor):
self._downloader.trouble(u'ERROR: unable to extract title')
return
video_title = mobj.group(1).decode('utf-8')
video_title = sanitize_title(video_title)
simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
# Google Video doesn't show uploader nicknames?
video_uploader = 'uploader'
video_uploader = 'NA'
try:
# Process video information
@@ -1010,8 +1058,8 @@ class GoogleIE(InfoExtractor):
'id': video_id.decode('utf-8'),
'url': video_url.decode('utf-8'),
'uploader': video_uploader.decode('utf-8'),
'title': video_title.decode('utf-8'),
'stitle': video_title.decode('utf-8'),
'title': video_title,
'stitle': simple_title,
'ext': video_extension.decode('utf-8'),
})
except UnavailableFormatError:
@@ -1076,6 +1124,8 @@ class PhotobucketIE(InfoExtractor):
self._downloader.trouble(u'ERROR: unable to extract title')
return
video_title = mobj.group(1).decode('utf-8')
video_title = sanitize_title(video_title)
simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
video_uploader = mobj.group(2).decode('utf-8')
@@ -1084,9 +1134,103 @@ class PhotobucketIE(InfoExtractor):
self._downloader.process_info({
'id': video_id.decode('utf-8'),
'url': video_url.decode('utf-8'),
'uploader': video_uploader.decode('utf-8'),
'title': video_title.decode('utf-8'),
'stitle': video_title.decode('utf-8'),
'uploader': video_uploader,
'title': video_title,
'stitle': simple_title,
'ext': video_extension.decode('utf-8'),
})
except UnavailableFormatError:
self._downloader.trouble(u'ERROR: format not available for video')
class GenericIE(InfoExtractor):
"""Generic last-resort information extractor."""
def __init__(self, downloader=None):
InfoExtractor.__init__(self, downloader)
@staticmethod
def suitable(url):
return True
def report_download_webpage(self, video_id):
"""Report webpage download."""
self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)
def report_extraction(self, video_id):
"""Report information extraction."""
self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)
def _real_initialize(self):
return
def _real_extract(self, url):
video_id = url.split('/')[-1]
request = urllib2.Request(url)
try:
self.report_download_webpage(video_id)
webpage = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
return
except ValueError, err:
# since this is the last-resort InfoExtractor, if
# this error is thrown, it'll be thrown here
self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
return
# Start with something easy: JW Player in SWFObject
mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
if mobj is None:
# Broaden the search a little bit
mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
return
# It's possible that one of the regexes
# matched, but returned an empty group:
if mobj.group(1) is None:
self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
return
video_url = urllib.unquote(mobj.group(1))
video_id = os.path.basename(video_url)
# here's a fun little line of code for you:
video_extension = os.path.splitext(video_id)[1][1:]
video_id = os.path.splitext(video_id)[0]
# it's tempting to parse this further, but you would
# have to take into account all the variations like
# Video Title - Site Name
# Site Name | Video Title
# Video Title - Tagline | Site Name
# and so on and so forth; it's just not practical
mobj = re.search(r'<title>(.*)</title>', webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract title')
return
video_title = mobj.group(1).decode('utf-8')
video_title = sanitize_title(video_title)
simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
# video uploader is domain name
mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract title')
return
video_uploader = mobj.group(1).decode('utf-8')
try:
# Process video information
self._downloader.process_info({
'id': video_id.decode('utf-8'),
'url': video_url.decode('utf-8'),
'uploader': video_uploader,
'title': video_title,
'stitle': simple_title,
'ext': video_extension.decode('utf-8'),
})
except UnavailableFormatError:
@@ -1112,6 +1256,7 @@ class YoutubeSearchIE(InfoExtractor):
def report_download_page(self, query, pagenum):
    """Report attempt to download playlist page with given number.

    The query is decoded with the encoding returned by preferredencoding()
    before it is interpolated into the message.
    """
    decoded_query = query.decode(preferredencoding())
    message = u'[youtube] query "%s": Downloading page %s' % (decoded_query, pagenum)
    self._downloader.to_stdout(message)
def _real_initialize(self):
@@ -1125,6 +1270,7 @@ class YoutubeSearchIE(InfoExtractor):
prefix, query = query.split(':')
prefix = prefix[8:]
query = query.encode('utf-8')
if prefix == '':
self._download_n_results(query, 1)
return
@@ -1374,7 +1520,7 @@ if __name__ == '__main__':
# Parse command line
parser = optparse.OptionParser(
usage='Usage: %prog [options] url...',
version='2010.01.19',
version='2010.03.07',
conflict_handler='resolve',
)
@@ -1418,6 +1564,8 @@ if __name__ == '__main__':
action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
verbosity.add_option('-e', '--get-title',
action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
verbosity.add_option('--no-progress',
action='store_true', dest='noprogress', help='do not print progress bar', default=False)
parser.add_option_group(verbosity)
filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
@@ -1473,6 +1621,7 @@ if __name__ == '__main__':
youtube_search_ie = YoutubeSearchIE(youtube_ie)
google_ie = GoogleIE()
photobucket_ie = PhotobucketIE()
generic_ie = GenericIE()
# File downloader
fd = FileDownloader({
@@ -1492,6 +1641,7 @@ if __name__ == '__main__':
'ratelimit': opts.ratelimit,
'nooverwrites': opts.nooverwrites,
'continuedl': opts.continue_dl,
'noprogress': opts.noprogress,
})
fd.add_info_extractor(youtube_search_ie)
fd.add_info_extractor(youtube_pl_ie)
@@ -1501,6 +1651,10 @@ if __name__ == '__main__':
fd.add_info_extractor(google_ie)
fd.add_info_extractor(photobucket_ie)
# This must come last since it's the
# fallback if none of the others work
fd.add_info_extractor(generic_ie)
# Update version
if opts.update_self:
update_self(fd, sys.argv[0])