X-Git-Url: http://git.jankratochvil.net/?a=blobdiff_plain;f=youtube-dl;h=1b381d7b75307008b3686e036417edf155d896e7;hb=597e7b18054b7632db6f8ba316e2410ccf748023;hp=d7e9c50c0076016d2dcabfec86d49c8cdab1b04e;hpb=c379c181e057491171a43228752fcb7e20c86d5f;p=youtube-dl.git
diff --git a/youtube-dl b/youtube-dl
index d7e9c50..1b381d7 100755
--- a/youtube-dl
+++ b/youtube-dl
@@ -1,7 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
-__author__ = (
+__authors__ = (
'Ricardo Garcia Gonzalez',
'Danny Colligan',
'Benjamin Johnson',
@@ -18,12 +18,14 @@ __author__ = (
)
__license__ = 'Public Domain'
-__version__ = '2011.11.23'
+__version__ = '2012.02.27'
UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
+
import cookielib
import datetime
+import getpass
import gzip
import htmlentitydefs
import HTMLParser
@@ -31,9 +33,11 @@ import httplib
import locale
import math
import netrc
+import optparse
import os
import os.path
import re
+import shlex
import socket
import string
import subprocess
@@ -259,14 +263,14 @@ def sanitize_open(filename, open_mode):
import msvcrt
msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
return (sys.stdout, filename)
- stream = open(filename, open_mode)
+ stream = open(_encodeFilename(filename), open_mode)
return (stream, filename)
except (IOError, OSError), err:
# In case of error, try to remove win32 forbidden chars
filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
# An exception here should be caught in the caller
- stream = open(filename, open_mode)
+ stream = open(_encodeFilename(filename), open_mode)
return (stream, filename)
@@ -282,6 +286,38 @@ def _simplify_title(title):
expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
return expr.sub(u'_', title).strip(u'_')
+def _orderedSet(iterable):
+ """ Return a new list with the elements of iterable, duplicates dropped, keeping each first occurrence in order """
+ res = []
+ for el in iterable:
+ if el not in res:
+ res.append(el)
+ return res
+
+def _unescapeHTML(s):
+ """ Return s as unescaped by HTMLParser.HTMLParser().unescape().
+ @param s a string (of type unicode; checked with an assert)
+ """
+ assert type(s) == type(u'')
+
+ htmlParser = HTMLParser.HTMLParser()
+ return htmlParser.unescape(s)
+
+def _encodeFilename(s):
+ """ Return s unchanged on win32 with major version >= 5, else s encoded in the filesystem encoding (unencodable characters are dropped).
+ @param s The name of the file (of type unicode; checked with an assert)
+ """
+
+ assert type(s) == type(u'')
+
+ if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
+ # Pass u'' directly to use Unicode APIs on Windows 2000 and up
+ # (Detecting Windows NT 4 is tricky because 'major >= 4' would
+ # match Windows 9x series as well. Besides, NT 4 is obsolete.)
+ return s
+ else:
+ return s.encode(sys.getfilesystemencoding(), 'ignore')
+
class DownloadError(Exception):
"""Download Error exception.
@@ -309,6 +345,10 @@ class PostProcessingError(Exception):
"""
pass
+class MaxDownloadsReached(Exception):
+ """ --max-downloads limit has been reached; raised by process_info() when the download count exceeds 'max_downloads'. """
+ pass
+
class UnavailableVideoError(Exception):
"""Unavailable Format exception.
@@ -542,16 +582,17 @@ class FileDownloader(object):
self._pps.append(pp)
pp.set_downloader(self)
- def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
+ def to_screen(self, message, skip_eol=False):
"""Print message to stdout if not in quiet mode."""
- try:
- if not self.params.get('quiet', False):
- terminator = [u'\n', u''][skip_eol]
- print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
+ assert type(message) == type(u'')
+ if not self.params.get('quiet', False):
+ terminator = [u'\n', u''][skip_eol]
+ output = message + terminator
+
+ if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
+ output = output.encode(preferredencoding(), 'ignore')
+ self._screen_file.write(output)
self._screen_file.flush()
- except (UnicodeEncodeError), err:
- if not ignore_encoding_errors:
- raise
def to_stderr(self, message):
"""Print message to stderr."""
@@ -601,7 +642,7 @@ class FileDownloader(object):
def temp_name(self, filename):
"""Returns a temporary filename for the given filename."""
if self.params.get('nopart', False) or filename == u'-' or \
- (os.path.exists(filename) and not os.path.isfile(filename)):
+ (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
return filename
return filename + u'.part'
@@ -614,7 +655,7 @@ class FileDownloader(object):
try:
if old_filename == new_filename:
return
- os.rename(old_filename, new_filename)
+ os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
except (IOError, OSError), err:
self.trouble(u'ERROR: unable to rename file')
@@ -622,7 +663,7 @@ class FileDownloader(object):
"""Try to set the last-modified time of the given file."""
if last_modified_hdr is None:
return
- if not os.path.isfile(filename):
+ if not os.path.isfile(_encodeFilename(filename)):
return
timestr = last_modified_hdr
if timestr is None:
@@ -638,15 +679,15 @@ class FileDownloader(object):
def report_writedescription(self, descfn):
""" Report that the description file is being written """
- self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
+ self.to_screen(u'[info] Writing video description to: ' + descfn)
def report_writeinfojson(self, infofn):
""" Report that the metadata file has been written """
- self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
+ self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
def report_destination(self, filename):
"""Report destination filename."""
- self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
+ self.to_screen(u'[download] Destination: ' + filename)
def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
"""Report download progress."""
@@ -699,15 +740,31 @@ class FileDownloader(object):
self.trouble(u'ERROR: invalid system charset or erroneous output template')
return None
+ def _match_entry(self, info_dict):
+ """ Return None iff the file should be downloaded, else the reason it is skipped (without '[download] ': process_info() adds that prefix) """
+
+ title = info_dict['title']
+ matchtitle = self.params.get('matchtitle', False)
+ if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
+ return u'"' + title + '" title did not match pattern "' + matchtitle + '"'
+ rejecttitle = self.params.get('rejecttitle', False)
+ if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
+ return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
+ return None
+
def process_info(self, info_dict):
"""Process a single dictionary returned by an InfoExtractor."""
+ reason = self._match_entry(info_dict)
+ if reason is not None:
+ self.to_screen(u'[download] ' + reason)
+ return
+
max_downloads = self.params.get('max_downloads')
if max_downloads is not None:
if self._num_downloads > int(max_downloads):
- self.to_screen(u'[download] Maximum number of downloads reached. Skipping ' + info_dict['title'])
- return
-
+ raise MaxDownloadsReached()
+
filename = self.prepare_filename(info_dict)
# Forced printings
@@ -731,23 +788,9 @@ class FileDownloader(object):
if filename is None:
return
- matchtitle=self.params.get('matchtitle',False)
- rejecttitle=self.params.get('rejecttitle',False)
- title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
- if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
- self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
- return
- if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
- self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
- return
-
- if self.params.get('nooverwrites', False) and os.path.exists(filename):
- self.to_stderr(u'WARNING: file exists and will be skipped')
- return
-
try:
- dn = os.path.dirname(filename)
- if dn != '' and not os.path.exists(dn):
+ dn = os.path.dirname(_encodeFilename(filename))
+ if dn != '' and not os.path.exists(dn): # dn is already encoded
os.makedirs(dn)
except (OSError, IOError), err:
self.trouble(u'ERROR: unable to create directory ' + unicode(err))
@@ -755,9 +798,9 @@ class FileDownloader(object):
if self.params.get('writedescription', False):
try:
- descfn = filename + '.description'
+ descfn = filename + u'.description'
self.report_writedescription(descfn)
- descfile = open(descfn, 'wb')
+ descfile = open(_encodeFilename(descfn), 'wb')
try:
descfile.write(info_dict['description'].encode('utf-8'))
finally:
@@ -767,7 +810,7 @@ class FileDownloader(object):
return
if self.params.get('writeinfojson', False):
- infofn = filename + '.info.json'
+ infofn = filename + u'.info.json'
self.report_writeinfojson(infofn)
try:
json.dump
@@ -775,7 +818,7 @@ class FileDownloader(object):
self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
return
try:
- infof = open(infofn, 'wb')
+ infof = open(_encodeFilename(infofn), 'wb')
try:
json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
json.dump(json_info_dict, infof)
@@ -786,16 +829,19 @@ class FileDownloader(object):
return
if not self.params.get('skip_download', False):
- try:
- success = self._do_download(filename, info_dict)
- except (OSError, IOError), err:
- raise UnavailableVideoError
- except (urllib2.URLError, httplib.HTTPException, socket.error), err:
- self.trouble(u'ERROR: unable to download video data: %s' % str(err))
- return
- except (ContentTooShortError, ), err:
- self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
- return
+ if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
+ success = True
+ else:
+ try:
+ success = self._do_download(filename, info_dict)
+ except (OSError, IOError), err:
+ raise UnavailableVideoError
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+ self.trouble(u'ERROR: unable to download video data: %s' % str(err))
+ return
+ except (ContentTooShortError, ), err:
+ self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
+ return
if success:
try:
@@ -854,13 +900,21 @@ class FileDownloader(object):
# the connection was interrumpted and resuming appears to be
# possible. This is part of rtmpdump's normal usage, AFAIK.
basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
- retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
+ args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
+ if self.params.get('verbose', False):
+ try:
+ import pipes
+ shell_quote = lambda args: ' '.join(map(pipes.quote, args))
+ except ImportError:
+ shell_quote = repr
+ self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
+ retval = subprocess.call(args)
while retval == 2 or retval == 1:
- prevsize = os.path.getsize(tmpfilename)
+ prevsize = os.path.getsize(_encodeFilename(tmpfilename))
self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
time.sleep(5.0) # This seems to be needed
retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
- cursize = os.path.getsize(tmpfilename)
+ cursize = os.path.getsize(_encodeFilename(tmpfilename))
if prevsize == cursize and retval == 1:
break
# Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
@@ -869,7 +923,7 @@ class FileDownloader(object):
retval = 0
break
if retval == 0:
- self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
+ self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
self.try_rename(tmpfilename, filename)
return True
else:
@@ -881,7 +935,7 @@ class FileDownloader(object):
player_url = info_dict.get('player_url', None)
# Check file already present
- if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
+ if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
self.report_file_already_downloaded(filename)
return True
@@ -898,8 +952,8 @@ class FileDownloader(object):
request = urllib2.Request(url, None, headers)
# Establish possible resume length
- if os.path.isfile(tmpfilename):
- resume_len = os.path.getsize(tmpfilename)
+ if os.path.isfile(_encodeFilename(tmpfilename)):
+ resume_len = os.path.getsize(_encodeFilename(tmpfilename))
else:
resume_len = 0
@@ -1103,6 +1157,7 @@ class YoutubeIE(InfoExtractor):
_NETRC_MACHINE = 'youtube'
# Listed in order of quality
_available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
+ _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
_video_extensions = {
'13': '3gp',
'17': 'mp4',
@@ -1326,10 +1381,9 @@ class YoutubeIE(InfoExtractor):
lxml.etree
except NameError:
video_description = u'No description available.'
- if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
- mobj = re.search(r'', video_webpage)
- if mobj is not None:
- video_description = mobj.group(1).decode('utf-8')
+ mobj = re.search(r'', video_webpage)
+ if mobj is not None:
+ video_description = mobj.group(1).decode('utf-8')
else:
html_parser = lxml.etree.HTMLParser(encoding='utf-8')
vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
@@ -1352,10 +1406,11 @@ class YoutubeIE(InfoExtractor):
url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
format_limit = self._downloader.params.get('format_limit', None)
- if format_limit is not None and format_limit in self._available_formats:
- format_list = self._available_formats[self._available_formats.index(format_limit):]
+ available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
+ if format_limit is not None and format_limit in available_formats:
+ format_list = available_formats[available_formats.index(format_limit):]
else:
- format_list = self._available_formats
+ format_list = available_formats
existing_formats = [x for x in format_list if x in url_map]
if len(existing_formats) == 0:
self._downloader.trouble(u'ERROR: no known formats available for video')
@@ -1581,7 +1636,6 @@ class DailymotionIE(InfoExtractor):
self._downloader.increment_downloads()
video_id = mobj.group(1)
- simple_title = mobj.group(2).decode('utf-8')
video_extension = 'flv'
# Retrieve video webpage to extract further information
@@ -1611,12 +1665,13 @@ class DailymotionIE(InfoExtractor):
video_url = mediaURL
- mobj = re.search(r'(?im)
Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?', webpage)
+ mobj = re.search(r'', webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract title')
return
- video_title = mobj.group(1).decode('utf-8')
+ video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
video_title = sanitize_title(video_title)
+ simple_title = _simplify_title(video_title)
mobj = re.search(r'(?im)[^<]+?]+?>([^<]+?)', webpage)
if mobj is None:
@@ -2003,7 +2058,7 @@ class VimeoIE(InfoExtractor):
video_id = mobj.group(1)
# Retrieve video webpage to extract further information
- request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
+ request = urllib2.Request(url, None, std_headers)
try:
self.report_download_webpage(video_id)
webpage = urllib2.urlopen(request).read()
@@ -2016,77 +2071,75 @@ class VimeoIE(InfoExtractor):
# and latter we extract those that are Vimeo specific.
self.report_extraction(video_id)
- # Extract title
- mobj = re.search(r'(.*?)', webpage)
- if mobj is None:
- self._downloader.trouble(u'ERROR: unable to extract video title')
+ # Extract the config JSON
+ config = webpage.split(' = {config:')[1].split(',assets:')[0]
+ try:
+ config = json.loads(config)
+ except:
+ self._downloader.trouble(u'ERROR: unable to extract info section')
return
- video_title = mobj.group(1).decode('utf-8')
+
+ # Extract title
+ video_title = config["video"]["title"]
simple_title = _simplify_title(video_title)
# Extract uploader
- mobj = re.search(r'http://vimeo.com/(.*?)', webpage)
- if mobj is None:
- self._downloader.trouble(u'ERROR: unable to extract video uploader')
- return
- video_uploader = mobj.group(1).decode('utf-8')
+ video_uploader = config["video"]["owner"]["name"]
# Extract video thumbnail
- mobj = re.search(r'(.*?)', webpage)
- if mobj is None:
- self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
- return
- video_thumbnail = mobj.group(1).decode('utf-8')
-
- # # Extract video description
- # mobj = re.search(r'', webpage)
- # if mobj is None:
- # self._downloader.trouble(u'ERROR: unable to extract video description')
- # return
- # video_description = mobj.group(1).decode('utf-8')
- # if not video_description: video_description = 'No description available.'
- video_description = 'Foo.'
-
- # Vimeo specific: extract request signature
- mobj = re.search(r'(.*?)', webpage)
- if mobj is None:
- self._downloader.trouble(u'ERROR: unable to extract request signature')
- return
- sig = mobj.group(1).decode('utf-8')
-
- # Vimeo specific: extract video quality information
- mobj = re.search(r'(\d+)', webpage)
- if mobj is None:
- self._downloader.trouble(u'ERROR: unable to extract video quality information')
- return
- quality = mobj.group(1).decode('utf-8')
+ video_thumbnail = config["video"]["thumbnail"]
- if int(quality) == 1:
- quality = 'hd'
+ # Extract video description
+ try:
+ lxml.etree
+ except NameError:
+ video_description = u'No description available.'
+ mobj = re.search(r'', webpage, re.MULTILINE)
+ if mobj is not None:
+ video_description = mobj.group(1)
else:
- quality = 'sd'
+ html_parser = lxml.etree.HTMLParser()
+ vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
+ video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
+ # TODO use another parser
- # Vimeo specific: Extract request signature expiration
- mobj = re.search(r'(.*?)', webpage)
- if mobj is None:
- self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
+ # Extract upload date
+ video_upload_date = u'NA'
+ mobj = re.search(r'[^:]*: (.*?)( \([^\(]*\))?', webpage)
+ if mobj is not None:
+ video_upload_date = mobj.group(1)
+
+ # Vimeo specific: extract request signature and timestamp
+ sig = config['request']['signature']
+ timestamp = config['request']['timestamp']
+
+ # Vimeo specific: extract video codec and quality information
+ # TODO bind to format param
+ codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
+ for codec in codecs:
+ if codec[0] in config["video"]["files"]:
+ video_codec = codec[0]
+ video_extension = codec[1]
+ if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
+ else: quality = 'sd'
+ break
+ else:
+ self._downloader.trouble(u'ERROR: no known codec found')
return
- sig_exp = mobj.group(1).decode('utf-8')
- video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)
+ video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
+ %(video_id, sig, timestamp, quality, video_codec.upper())
try:
# Process video information
self._downloader.process_info({
- 'id': video_id.decode('utf-8'),
+ 'id': video_id,
'url': video_url,
'uploader': video_uploader,
- 'upload_date': u'NA',
+ 'upload_date': video_upload_date,
'title': video_title,
'stitle': simple_title,
- 'ext': u'mp4',
- 'thumbnail': video_thumbnail.decode('utf-8'),
- 'description': video_description,
+ 'ext': video_extension,
'thumbnail': video_thumbnail,
'description': video_description,
'player_url': None,
@@ -2285,8 +2338,8 @@ class GoogleSearchIE(InfoExtractor):
"""Information Extractor for Google Video search queries."""
_VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
_TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
- _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
- _MORE_PAGES_INDICATOR = r'Next'
+ _VIDEO_INDICATOR = r'', html)
+ mMovieParams = re.findall('(?:/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P[^&]+)(&video=(?P