Removed inaccurate warning
[youtube-dl.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # Author: Vasyl' Vavrychuk
7 # Author: Witold Baryluk
8 # Author: PaweÅ‚ Paprota
9 # Author: Gergely Imreh
10 # Author: Philipp Hagemeister <phihag@phihag.de>
11 # License: Public domain code
12 from __future__ import with_statement
13 import contextlib
14 import cookielib
15 import ctypes
16 import datetime
17 import email.utils
18 import gzip
19 import htmlentitydefs
20 import httplib
21 import locale
22 import math
23 import netrc
24 import os
25 import os.path
26 import re
27 import socket
28 import string
29 import subprocess
30 import sys
31 import time
32 import urllib
33 import urllib2
34 import warnings
35 import zlib
36
37 try:
38         import cStringIO as StringIO
39 except ImportError:
40         import StringIO
41
42 # parse_qs was moved from the cgi module to the urlparse module recently.
43 try:
44         from urlparse import parse_qs
45 except ImportError:
46         from cgi import parse_qs
47
48 try:
49         import lxml.etree
50 except ImportError: # Python < 2.6
51         pass # Handled below
52
# HTTP headers sent with every request, imitating a contemporary Firefox 4
# browser so sites serve the same content they would to a regular user.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:2.0b11) Gecko/20100101 Firefox/4.0b11',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	'Accept-Encoding': 'gzip, deflate',
	'Accept-Language': 'en-us,en;q=0.5',
}

# Characters allowed in "simple" titles: ASCII letters and digits, decoded so
# the result is a unicode string under Python 2.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
62
try:
	import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
	import re
	class json(object):
		# Minimal stand-in for the stdlib json module: implements only
		# loads(), a recursive-descent parser dispatching on the first
		# significant character of each value.
		@staticmethod
		def loads(s):
			# Decode up front so all index arithmetic below operates on a
			# unicode string.
			s = s.decode('UTF-8')
			def raiseError(msg, i):
				# Include position and remaining input to ease debugging.
				raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
			def skipSpace(i, expectMore=True):
				# Advance i past whitespace; with expectMore, reaching the
				# end of input is an error.
				while i < len(s) and s[i] in ' \t\r\n':
					i += 1
				if expectMore:
					if i >= len(s):
						raiseError('Premature end', i)
				return i
			def decodeEscape(match):
				# Translate one backslash escape (including \uXXXX and
				# UTF-16 surrogate pairs) into the character it denotes.
				esc = match.group(1)
				_STATIC = {
					'"': '"',
					'\\': '\\',
					'/': '/',
					'b': unichr(0x8),
					'f': unichr(0xc),
					'n': '\n',
					'r': '\r',
					't': '\t',
				}
				if esc in _STATIC:
					return _STATIC[esc]
				if esc[0] == 'u':
					if len(esc) == 1+4:
						return unichr(int(esc[1:5], 16))
					if len(esc) == 5+6 and esc[5:7] == '\\u':
						# Surrogate pair: combine high and low halves into
						# one code point.
						hi = int(esc[1:5], 16)
						low = int(esc[7:11], 16)
						return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
				raise ValueError('Unknown escape ' + str(esc))
			def parseString(i):
				i += 1
				e = i
				while True:
					# Find the closing quote, skipping quotes preceded by
					# an odd number of backslashes (i.e. escaped quotes).
					e = s.index('"', e)
					bslashes = 0
					while s[e-bslashes-1] == '\\':
						bslashes += 1
					if bslashes % 2 == 1:
						e += 1
						continue
					break
				rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
				stri = rexp.sub(decodeEscape, s[i:e])
				return (e+1,stri)
			def parseObj(i):
				i += 1
				res = {}
				i = skipSpace(i)
				if s[i] == '}': # Empty dictionary
					return (i+1,res)
				while True:
					if s[i] != '"':
						raiseError('Expected a string object key', i)
					i,key = parseString(i)
					i = skipSpace(i)
					if i >= len(s) or s[i] != ':':
						raiseError('Expected a colon', i)
					i,val = parse(i+1)
					res[key] = val
					i = skipSpace(i)
					if s[i] == '}':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected comma or closing curly brace', i)
					i = skipSpace(i+1)
			def parseArray(i):
				res = []
				i = skipSpace(i+1)
				if s[i] == ']': # Empty array
					return (i+1,res)
				while True:
					i,val = parse(i)
					res.append(val)
					i = skipSpace(i) # Raise exception if premature end
					if s[i] == ']':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected a comma or closing bracket', i)
					i = skipSpace(i+1)
			def parseDiscrete(i):
				# Parse the bare literals true/false/null.
				for k,v in {'true': True, 'false': False, 'null': None}.items():
					if s.startswith(k, i):
						return (i+len(k), v)
				raiseError('Not a boolean (or null)', i)
			def parseNumber(i):
				mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
				if mobj is None:
					raiseError('Not a number', i)
				nums = mobj.group(1)
				# A fraction or exponent marker means a float; otherwise int.
				if '.' in nums or 'e' in nums or 'E' in nums:
					return (i+len(nums), float(nums))
				return (i+len(nums), int(nums))
			# Dispatch table keyed on the first character of a value;
			# anything not listed is assumed to start a number.
			CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
			def parse(i):
				i = skipSpace(i)
				i,res = CHARMAP.get(s[i], parseNumber)(i)
				i = skipSpace(i, False)
				return (i,res)
			i,res = parse(0)
			# Trailing non-whitespace after the top-level value is an error.
			if i < len(s):
				raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
			return res
175
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	# The original wrapped this logic in a generator only to pull a single
	# value out of it, and used a bare `except:` that would also swallow
	# KeyboardInterrupt/SystemExit. A plain try/except is equivalent and safer.
	try:
		pref = locale.getpreferredencoding()
		# Verify the reported codec actually exists and works; some
		# platforms report names the codec machinery cannot handle.
		u'TEST'.encode(pref)
	except Exception:
		pref = 'UTF-8'
	return pref
191
def htmlentity_transform(matchobj):
	"""Transforms an HTML entity to a Unicode character.

	This function receives a match object and is intended to be used with
	the re.sub() function.
	"""
	entity = matchobj.group(1)

	# Known non-numeric HTML entity
	if entity in htmlentitydefs.name2codepoint:
		return unichr(htmlentitydefs.name2codepoint[entity])

	# Numeric character reference, decimal (&#160;) or hexadecimal (&#xA0;).
	# The previous pattern `#(x?\d+)` only allowed decimal digits after the
	# "x", so hex references containing letters (e.g. &#xA0;) were not
	# recognized and fell through to the literal branch below.
	mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
	if mobj is not None:
		numstr = mobj.group(1)
		if numstr.startswith(u'x'):
			base = 16
			# Prefix with '0' so long() accepts the '0x...' form.
			numstr = u'0%s' % numstr
		else:
			base = 10
		return unichr(long(numstr, base))

	# Unknown entity in name, return its literal representation
	return (u'&%s;' % entity)
217
218 def sanitize_title(utitle):
219         """Sanitizes a video title so it could be used as part of a filename."""
220         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
221         return utitle.replace(unicode(os.sep), u'%')
222
223 def sanitize_open(filename, open_mode):
224         """Try to open the given filename, and slightly tweak it if this fails.
225
226         Attempts to open the given filename. If this fails, it tries to change
227         the filename slightly, step by step, until it's either able to open it
228         or it fails and raises a final exception, like the standard open()
229         function.
230
231         It returns the tuple (stream, definitive_file_name).
232         """
233         try:
234                 if filename == u'-':
235                         if sys.platform == 'win32':
236                                 import msvcrt
237                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
238                         return (sys.stdout, filename)
239                 stream = open(filename, open_mode)
240                 return (stream, filename)
241         except (IOError, OSError), err:
242                 # In case of error, try to remove win32 forbidden chars
243                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
244
245                 # An exception here should be caught in the caller
246                 stream = open(filename, open_mode)
247                 return (stream, filename)
248
def timeconvert(timestr):
	"""Convert RFC 2822 defined time string into system timestamp"""
	parsed = email.utils.parsedate_tz(timestr)
	if parsed is None:
		# Unparseable date string.
		return None
	return email.utils.mktime_tz(parsed)
256
class DownloadError(Exception):
	"""Download Error exception.

	Raised by FileDownloader objects that are not configured to continue
	on errors; carries the appropriate error message.
	"""
265
class SameFileError(Exception):
	"""Same File exception.

	Raised by FileDownloader objects when they detect that multiple files
	would have to be downloaded to the same file on disk.
	"""
273
class PostProcessingError(Exception):
	"""Post Processing exception.

	Raised by a PostProcessor's .run() method to indicate an error in the
	postprocessing task.
	"""
281
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	Raised when a video is requested in a format that is not available for
	that video.
	"""
289
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	Raised by FileDownloader objects when a downloaded file is smaller
	than the size the server announced first, which usually indicates the
	connection was interrupted.
	"""
	# Byte counts: how much actually arrived vs. what the server promised.
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		self.expected = expected
		self.downloaded = downloaded
304
class YoutubeDLHandler(urllib2.HTTPHandler):
	"""Handler for HTTP requests and responses.

	When installed on an OpenerDirector, this handler adds the standard
	headers to every HTTP request and transparently decodes gzipped and
	deflated response bodies. A request can opt out of compression by
	carrying the HTTP header "Youtubedl-No-Compression", which is stripped
	before the request is actually sent.

	Part of this code was copied from:

	  http://techknack.net/python-urllib2-handlers/

	Andrew Rowls, the author of that code, agreed to release it to the
	public domain.
	"""

	@staticmethod
	def deflate(data):
		# Try raw deflate first; fall back to a zlib-wrapped stream.
		try:
			return zlib.decompress(data, -zlib.MAX_WBITS)
		except zlib.error:
			return zlib.decompress(data)

	@staticmethod
	def addinfourl_wrapper(stream, headers, url, code):
		# Newer Python versions accept the response code as a constructor
		# argument; older ones need the attribute patched on afterwards.
		if hasattr(urllib2.addinfourl, 'getcode'):
			return urllib2.addinfourl(stream, headers, url, code)
		ret = urllib2.addinfourl(stream, headers, url)
		ret.code = code
		return ret

	def http_request(self, req):
		# Force the standard headers, overwriting caller-supplied values.
		for name in std_headers:
			if name in req.headers:
				del req.headers[name]
			req.add_header(name, std_headers[name])
		# Honor the compression opt-out marker, removing it before sending.
		if 'Youtubedl-no-compression' in req.headers:
			if 'Accept-encoding' in req.headers:
				del req.headers['Accept-encoding']
			del req.headers['Youtubedl-no-compression']
		return req

	def http_response(self, req, resp):
		old_resp = resp
		encoding = resp.headers.get('Content-encoding', '')
		# gzip
		if encoding == 'gzip':
			body = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
			resp = self.addinfourl_wrapper(body, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		# deflate
		if encoding == 'deflate':
			body = StringIO.StringIO(self.deflate(resp.read()))
			resp = self.addinfourl_wrapper(body, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		return resp
362
363 class FileDownloader(object):
364         """File Downloader class.
365
366         File downloader objects are the ones responsible of downloading the
367         actual video file and writing it to disk if the user has requested
368         it, among some other tasks. In most cases there should be one per
369         program. As, given a video URL, the downloader doesn't know how to
370         extract all the needed information, task that InfoExtractors do, it
371         has to pass the URL to one of them.
372
373         For this, file downloader objects have a method that allows
374         InfoExtractors to be registered in a given order. When it is passed
375         a URL, the file downloader handles it to the first InfoExtractor it
376         finds that reports being able to handle it. The InfoExtractor extracts
377         all the information about the video or videos the URL refers to, and
378         asks the FileDownloader to process the video information, possibly
379         downloading the video.
380
381         File downloaders accept a lot of parameters. In order not to saturate
382         the object constructor with arguments, it receives a dictionary of
383         options instead. These options are available through the params
384         attribute for the InfoExtractors to use. The FileDownloader also
385         registers itself as the downloader in charge for the InfoExtractors
386         that are added to it, so this is a "mutual registration".
387
388         Available options:
389
390         username:         Username for authentication purposes.
391         password:         Password for authentication purposes.
392         usenetrc:         Use netrc for authentication instead.
393         quiet:            Do not print messages to stdout.
394         forceurl:         Force printing final URL.
395         forcetitle:       Force printing title.
396         forcethumbnail:   Force printing thumbnail URL.
397         forcedescription: Force printing description.
398         forcefilename:    Force printing final filename.
399         simulate:         Do not download the video files.
400         format:           Video format code.
401         format_limit:     Highest quality format to try.
402         outtmpl:          Template for output names.
403         ignoreerrors:     Do not stop on download errors.
404         ratelimit:        Download speed limit, in bytes/sec.
405         nooverwrites:     Prevent overwriting files.
406         retries:          Number of times to retry for HTTP error 5xx
407         continuedl:       Try to continue downloads if possible.
408         noprogress:       Do not print the progress bar.
409         playliststart:    Playlist item to start at.
410         playlistend:      Playlist item to end at.
411         logtostderr:      Log messages to stderr instead of stdout.
412         consoletitle:     Display progress in console window's titlebar.
413         nopart:           Do not use temporary .part files.
414         updatetime:       Use the Last-modified header to set output file timestamps.
415         writedescription: Write the video description to a .description file
416         writeinfojson:    Write the video description to a .info.json file
417         """
418
	# Class-level defaults so the attributes always exist; __init__ assigns
	# the real per-instance values.
	params = None
	_ies = []
	_pps = []
	_download_retcode = None
	_num_downloads = None
	_screen_file = None
425
426         def __init__(self, params):
427                 """Create a FileDownloader object with the given options."""
428                 self._ies = []
429                 self._pps = []
430                 self._download_retcode = 0
431                 self._num_downloads = 0
432                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
433                 self.params = params
434
435         @staticmethod
436         def pmkdir(filename):
437                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
438                 components = filename.split(os.sep)
439                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
440                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
441                 for dir in aggregate:
442                         if not os.path.exists(dir):
443                                 os.mkdir(dir)
444
445         @staticmethod
446         def format_bytes(bytes):
447                 if bytes is None:
448                         return 'N/A'
449                 if type(bytes) is str:
450                         bytes = float(bytes)
451                 if bytes == 0.0:
452                         exponent = 0
453                 else:
454                         exponent = long(math.log(bytes, 1024.0))
455                 suffix = 'bkMGTPEZY'[exponent]
456                 converted = float(bytes) / float(1024**exponent)
457                 return '%.2f%s' % (converted, suffix)
458
459         @staticmethod
460         def calc_percent(byte_counter, data_len):
461                 if data_len is None:
462                         return '---.-%'
463                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
464
465         @staticmethod
466         def calc_eta(start, now, total, current):
467                 if total is None:
468                         return '--:--'
469                 dif = now - start
470                 if current == 0 or dif < 0.001: # One millisecond
471                         return '--:--'
472                 rate = float(current) / dif
473                 eta = long((float(total) - float(current)) / rate)
474                 (eta_mins, eta_secs) = divmod(eta, 60)
475                 if eta_mins > 99:
476                         return '--:--'
477                 return '%02d:%02d' % (eta_mins, eta_secs)
478
479         @staticmethod
480         def calc_speed(start, now, bytes):
481                 dif = now - start
482                 if bytes == 0 or dif < 0.001: # One millisecond
483                         return '%10s' % '---b/s'
484                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
485
486         @staticmethod
487         def best_block_size(elapsed_time, bytes):
488                 new_min = max(bytes / 2.0, 1.0)
489                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
490                 if elapsed_time < 0.001:
491                         return long(new_max)
492                 rate = bytes / elapsed_time
493                 if rate > new_max:
494                         return long(new_max)
495                 if rate < new_min:
496                         return long(new_min)
497                 return long(rate)
498
499         @staticmethod
500         def parse_bytes(bytestr):
501                 """Parse a string indicating a byte quantity into a long integer."""
502                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
503                 if matchobj is None:
504                         return None
505                 number = float(matchobj.group(1))
506                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
507                 return long(round(number * multiplier))
508
509         def add_info_extractor(self, ie):
510                 """Add an InfoExtractor object to the end of the list."""
511                 self._ies.append(ie)
512                 ie.set_downloader(self)
513
514         def add_post_processor(self, pp):
515                 """Add a PostProcessor object to the end of the chain."""
516                 self._pps.append(pp)
517                 pp.set_downloader(self)
518
	def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode."""
		try:
			if not self.params.get('quiet', False):
				terminator = [u'\n', u''][skip_eol]
				# The trailing comma suppresses print's own newline; the
				# newline (if any) comes from `terminator` instead.
				print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
			self._screen_file.flush()
		except (UnicodeEncodeError), err:
			# Optionally swallow messages the console encoding cannot
			# represent rather than aborting the download.
			if not ignore_encoding_errors:
				raise
529
	def to_stderr(self, message):
		"""Print message to stderr."""
		# Encode explicitly: stderr may not accept unicode directly.
		print >>sys.stderr, message.encode(preferredencoding())
533
	def to_cons_title(self, message):
		"""Set console/terminal window title to message."""
		if not self.params.get('consoletitle', False):
			return
		if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
			# c_wchar_p() might not be necessary if `message` is
			# already of type unicode()
			ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
		elif 'TERM' in os.environ:
			# xterm OSC 0 escape sequence: sets icon name and window title.
			sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
544
545         def fixed_template(self):
546                 """Checks if the output template is fixed."""
547                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
548
549         def trouble(self, message=None):
550                 """Determine action to take when a download problem appears.
551
552                 Depending on if the downloader has been configured to ignore
553                 download errors or not, this method may throw an exception or
554                 not when errors are found, after printing the message.
555                 """
556                 if message is not None:
557                         self.to_stderr(message)
558                 if not self.params.get('ignoreerrors', False):
559                         raise DownloadError(message)
560                 self._download_retcode = 1
561
562         def slow_down(self, start_time, byte_counter):
563                 """Sleep if the download speed is over the rate limit."""
564                 rate_limit = self.params.get('ratelimit', None)
565                 if rate_limit is None or byte_counter == 0:
566                         return
567                 now = time.time()
568                 elapsed = now - start_time
569                 if elapsed <= 0.0:
570                         return
571                 speed = float(byte_counter) / elapsed
572                 if speed > rate_limit:
573                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
574
575         def temp_name(self, filename):
576                 """Returns a temporary filename for the given filename."""
577                 if self.params.get('nopart', False) or filename == u'-' or \
578                                 (os.path.exists(filename) and not os.path.isfile(filename)):
579                         return filename
580                 return filename + u'.part'
581
582         def undo_temp_name(self, filename):
583                 if filename.endswith(u'.part'):
584                         return filename[:-len(u'.part')]
585                 return filename
586
587         def try_rename(self, old_filename, new_filename):
588                 try:
589                         if old_filename == new_filename:
590                                 return
591                         os.rename(old_filename, new_filename)
592                 except (IOError, OSError), err:
593                         self.trouble(u'ERROR: unable to rename file')
594         
595         def try_utime(self, filename, last_modified_hdr):
596                 """Try to set the last-modified time of the given file."""
597                 if last_modified_hdr is None:
598                         return
599                 if not os.path.isfile(filename):
600                         return
601                 timestr = last_modified_hdr
602                 if timestr is None:
603                         return
604                 filetime = timeconvert(timestr)
605                 if filetime is None:
606                         return
607                 try:
608                         os.utime(filename,(time.time(), filetime))
609                 except:
610                         pass
611
612         def report_writedescription(self, descfn):
613                 """ Report that the description file is being written """
614                 self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
615
616         def report_writeinfojson(self, infofn):
617                 """ Report that the metadata file has been written """
618                 self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
619
620         def report_destination(self, filename):
621                 """Report destination filename."""
622                 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
623
624         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
625                 """Report download progress."""
626                 if self.params.get('noprogress', False):
627                         return
628                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
629                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
630                 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
631                                 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
632
633         def report_resuming_byte(self, resume_len):
634                 """Report attempt to resume at given byte."""
635                 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
636
637         def report_retry(self, count, retries):
638                 """Report retry in case of HTTP error 5xx"""
639                 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
640
	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		try:
			self.to_screen(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			# The file name may not be representable in the console
			# encoding; fall back to a generic message.
			self.to_screen(u'[download] The file has already been downloaded')
647
648         def report_unable_to_resume(self):
649                 """Report it was impossible to resume download."""
650                 self.to_screen(u'[download] Unable to resume')
651
652         def report_finish(self):
653                 """Report download finished."""
654                 if self.params.get('noprogress', False):
655                         self.to_screen(u'[download] Download completed')
656                 else:
657                         self.to_screen(u'')
658
659         def increment_downloads(self):
660                 """Increment the ordinal that assigns a number to each file."""
661                 self._num_downloads += 1
662
	def prepare_filename(self, info_dict):
		"""Generate the output filename.

		Expands the 'outtmpl' template from self.params with the fields
		of info_dict plus two synthetic fields: 'epoch' (current Unix
		time) and 'autonumber' (zero-padded download ordinal).  Returns
		the filename, or None if template expansion fails.
		"""
		try:
			# Copy so the caller's info_dict is not mutated.
			template_dict = dict(info_dict)
			template_dict['epoch'] = unicode(long(time.time()))
			template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
			filename = self.params['outtmpl'] % template_dict
			return filename
		except (ValueError, KeyError), err:
			# KeyError: template references a missing field;
			# ValueError: malformed %-template.
			self.trouble(u'ERROR: invalid system charset or erroneous output template')
			return None
674
	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor.

		Handles simulate/forced-printing mode, creates target
		directories, optionally writes the .description and .info.json
		side files, downloads the video data and finally runs the
		postprocessing chain.
		"""
		filename = self.prepare_filename(info_dict)
		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			# Forced printings
			if self.params.get('forcetitle', False):
				print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forceurl', False):
				print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
				print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcedescription', False) and 'description' in info_dict:
				print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcefilename', False) and filename is not None:
				print filename.encode(preferredencoding(), 'xmlcharrefreplace')

			return

		if filename is None:
			return
		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists and will be skipped')
			return

		try:
			# Create the destination directory hierarchy if needed.
			self.pmkdir(filename)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to create directories: %s' % str(err))
			return

		if self.params.get('writedescription', False):
			try:
				descfn = filename + '.description'
				self.report_writedescription(descfn)
				with contextlib.closing(open(descfn, 'wb')) as descfile:
					descfile.write(info_dict['description'].encode('utf-8'))
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write description file: %s' % str(descfn))
				return

		if self.params.get('writeinfojson', False):
			infofn = filename + '.info.json'
			self.report_writeinfojson(infofn)
			try:
				# Probe for a JSON encoder; the fallback `json` class at
				# the top of the file may lack dump() on old Pythons.
				json.dump
			except (NameError,AttributeError):
				self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
				return
			try:
				with contextlib.closing(open(infofn, 'wb')) as infof:
					json.dump(info_dict, infof)
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write metadata to JSON file: %s' % str(infofn))
				return

		try:
			success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
		except (OSError, IOError), err:
			# Local I/O failure: treat the video as unavailable.
			raise UnavailableVideoError
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self.trouble(u'ERROR: unable to download video data: %s' % str(err))
			return
		except (ContentTooShortError, ), err:
			self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
			return

		if success:
			try:
				self.post_process(filename, info_dict)
			except (PostProcessingError), err:
				self.trouble(u'ERROR: postprocessing: %s' % str(err))
				return
748
749         def download(self, url_list):
750                 """Download a given list of URLs."""
751                 if len(url_list) > 1 and self.fixed_template():
752                         raise SameFileError(self.params['outtmpl'])
753
754                 for url in url_list:
755                         suitable_found = False
756                         for ie in self._ies:
757                                 # Go to next InfoExtractor if not suitable
758                                 if not ie.suitable(url):
759                                         continue
760
761                                 # Suitable InfoExtractor found
762                                 suitable_found = True
763
764                                 # Extract information from URL and process it
765                                 ie.extract(url)
766
767                                 # Suitable InfoExtractor had been found; go to next URL
768                                 break
769
770                         if not suitable_found:
771                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
772
773                 return self._download_retcode
774
775         def post_process(self, filename, ie_info):
776                 """Run the postprocessing chain on the given file."""
777                 info = dict(ie_info)
778                 info['filepath'] = filename
779                 for pp in self._pps:
780                         info = pp.run(info)
781                         if info is None:
782                                 break
783
784         def _download_with_rtmpdump(self, filename, url, player_url):
785                 self.report_destination(filename)
786                 tmpfilename = self.temp_name(filename)
787
788                 # Check for rtmpdump first
789                 try:
790                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
791                 except (OSError, IOError):
792                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
793                         return False
794
795                 # Download using rtmpdump. rtmpdump returns exit code 2 when
796                 # the connection was interrumpted and resuming appears to be
797                 # possible. This is part of rtmpdump's normal usage, AFAIK.
798                 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
799                 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
800                 while retval == 2 or retval == 1:
801                         prevsize = os.path.getsize(tmpfilename)
802                         self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
803                         time.sleep(5.0) # This seems to be needed
804                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
805                         cursize = os.path.getsize(tmpfilename)
806                         if prevsize == cursize and retval == 1:
807                                 break
808                 if retval == 0:
809                         self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
810                         self.try_rename(tmpfilename, filename)
811                         return True
812                 else:
813                         self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
814                         return False
815
	def _do_download(self, filename, url, player_url):
		"""Download url to filename; return True on success.

		RTMP URLs are delegated to rtmpdump.  HTTP downloads go through
		a temporary ".part"-style file (see temp_name) and support
		resuming, bounded retries and rate limiting.
		"""
		# Check file already present
		if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
			self.report_file_already_downloaded(filename)
			return True

		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url, player_url)

		tmpfilename = self.temp_name(filename)
		stream = None
		open_mode = 'wb'

		# Do not include the Accept-Encoding header
		headers = {'Youtubedl-no-compression': 'True'}
		# basic_request has no Range header; used to probe the full
		# length when a resume attempt is rejected.
		basic_request = urllib2.Request(url, None, headers)
		request = urllib2.Request(url, None, headers)

		# Establish possible resume length
		if os.path.isfile(tmpfilename):
			resume_len = os.path.getsize(tmpfilename)
		else:
			resume_len = 0

		# Request parameters in case of being able to resume
		if self.params.get('continuedl', False) and resume_len != 0:
			self.report_resuming_byte(resume_len)
			request.add_header('Range','bytes=%d-' % resume_len)
			open_mode = 'ab'

		count = 0
		retries = self.params.get('retries', 0)
		while count <= retries:
			# Establish connection
			try:
				data = urllib2.urlopen(request)
				break
			except (urllib2.HTTPError, ), err:
				if (err.code < 500 or err.code >= 600) and err.code != 416:
					# Unexpected HTTP error
					raise
				elif err.code == 416:
					# Unable to resume (requested range not satisfiable)
					try:
						# Open the connection again without the range header
						data = urllib2.urlopen(basic_request)
						content_length = data.info()['Content-Length']
					except (urllib2.HTTPError, ), err:
						if err.code < 500 or err.code >= 600:
							raise
					else:
						# Examine the reported length
						if (content_length is not None and
						    (resume_len - 100 < long(content_length) < resume_len + 100)):
							# The file had already been fully downloaded.
							# Explanation to the above condition: in issue #175 it was revealed that
							# YouTube sometimes adds or removes a few bytes from the end of the file,
							# changing the file size slightly and causing problems for some users. So
							# I decided to implement a suggested change and consider the file
							# completely downloaded if the file size differs less than 100 bytes from
							# the one in the hard drive.
							self.report_file_already_downloaded(filename)
							self.try_rename(tmpfilename, filename)
							return True
						else:
							# The length does not match, we start the download over
							self.report_unable_to_resume()
							open_mode = 'wb'
							break
			# Retry
			count += 1
			if count <= retries:
				self.report_retry(count, retries)

		if count > retries:
			self.trouble(u'ERROR: giving up after %s retries' % retries)
			return False

		data_len = data.info().get('Content-length', None)
		if data_len is not None:
			data_len = long(data_len) + resume_len
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0 + resume_len
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			if len(data_block) == 0:
				break
			byte_counter += len(data_block)

			# Open file just in time (so a failed connection leaves no
			# empty file behind)
			if stream is None:
				try:
					(stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
					filename = self.undo_temp_name(tmpfilename)
					self.report_destination(filename)
				except (OSError, IOError), err:
					self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
					return False
			try:
				stream.write(data_block)
			except (IOError, OSError), err:
				self.trouble(u'\nERROR: unable to write data: %s' % str(err))
				return False
			# Adapt the block size to the measured throughput.
			block_size = self.best_block_size(after - before, len(data_block))

			# Progress message
			percent_str = self.calc_percent(byte_counter, data_len)
			eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
			speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
			self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter - resume_len)

		stream.close()
		self.report_finish()
		if data_len is not None and byte_counter != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		self.try_rename(tmpfilename, filename)

		# Update file modification time
		if self.params.get('updatetime', True):
			self.try_utime(filename, data.info().get('last-modified', None))

		return True
947
class InfoExtractor(object):
	"""Information Extractor class.

	An information extractor turns a URL into one or more dictionaries
	describing the video(s) behind it: the real video URL, the title,
	a simplified title, the uploader and so on.  Each dictionary is
	handed to the FileDownloader, which may then download the video to
	the file system among other possible outcomes.  Mandatory fields
	in every dictionary:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	Optional fields, used only by their respective forced-printing
	functions (their primary purpose is to let youtube-dl serve as the
	backend of a video search function, such as the one in youtube2mp3):

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Subclasses should redefine _real_initialize(), _real_extract() and
	the static suitable() method, and will normally be instantiated
	and registered with the main downloader.
	"""

	_ready = False
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	@staticmethod
	def suitable(url):
		"""Receives a URL and returns True if suitable for this IE."""
		return False

	def initialize(self):
		"""Initializes an instance (authentication, etc), at most once."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
1018
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Accepts bare video ids as well as youtu.be / watch / embed / v
	# URLs; group 2 of the match is the video id.
	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
	# Forces the site language to English so scraping is predictable.
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	# Machine name looked up in ~/.netrc for stored credentials.
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
	# Maps format codes to container/file extensions.
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'45': 'webm',
	}
1039
1040         @staticmethod
1041         def suitable(url):
1042                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
1043
1044         def report_lang(self):
1045                 """Report attempt to set language."""
1046                 self._downloader.to_screen(u'[youtube] Setting language')
1047
1048         def report_login(self):
1049                 """Report attempt to log in."""
1050                 self._downloader.to_screen(u'[youtube] Logging in')
1051
1052         def report_age_confirmation(self):
1053                 """Report attempt to confirm age."""
1054                 self._downloader.to_screen(u'[youtube] Confirming age')
1055
1056         def report_video_webpage_download(self, video_id):
1057                 """Report attempt to download video webpage."""
1058                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1059
1060         def report_video_info_webpage_download(self, video_id):
1061                 """Report attempt to download video info webpage."""
1062                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1063
1064         def report_information_extraction(self, video_id):
1065                 """Report attempt to extract video information."""
1066                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1067
	def report_unavailable_format(self, video_id, format):
		"""Report that the requested video format is not available."""
		self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1071
1072         def report_rtmp_download(self):
1073                 """Indicate the download will use the RTMP protocol."""
1074                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1075
	def _real_initialize(self):
		"""Set site language to English, then optionally log in and confirm age.

		Credentials come from the downloader parameters or, failing
		that, from ~/.netrc.  Missing credentials are not an error:
		the extractor then proceeds anonymously.
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':		'/',
				'action_login':	'Log In',
				'username':	username,
				'password':	password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the login form is still present in the response, the
			# credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':		'/',
				'action_confirm':	'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return
1144
1145         def _real_extract(self, url):
1146                 # Extract video id from URL
1147                 mobj = re.match(self._VALID_URL, url)
1148                 if mobj is None:
1149                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1150                         return
1151                 video_id = mobj.group(2)
1152
1153                 # Get video webpage
1154                 self.report_video_webpage_download(video_id)
1155                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id)
1156                 try:
1157                         video_webpage = urllib2.urlopen(request).read()
1158                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1159                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1160                         return
1161
1162                 # Attempt to extract SWF player URL
1163                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1164                 if mobj is not None:
1165                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1166                 else:
1167                         player_url = None
1168
1169                 # Get video info
1170                 self.report_video_info_webpage_download(video_id)
1171                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1172                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1173                                            % (video_id, el_type))
1174                         request = urllib2.Request(video_info_url)
1175                         try:
1176                                 video_info_webpage = urllib2.urlopen(request).read()
1177                                 video_info = parse_qs(video_info_webpage)
1178                                 if 'token' in video_info:
1179                                         break
1180                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1181                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1182                                 return
1183                 if 'token' not in video_info:
1184                         if 'reason' in video_info:
1185                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1186                         else:
1187                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1188                         return
1189
1190                 # Start extracting information
1191                 self.report_information_extraction(video_id)
1192
1193                 # uploader
1194                 if 'author' not in video_info:
1195                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1196                         return
1197                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1198
1199                 # title
1200                 if 'title' not in video_info:
1201                         self._downloader.trouble(u'ERROR: unable to extract video title')
1202                         return
1203                 video_title = urllib.unquote_plus(video_info['title'][0])
1204                 video_title = video_title.decode('utf-8')
1205                 video_title = sanitize_title(video_title)
1206
1207                 # simplified title
1208                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1209                 simple_title = simple_title.strip(ur'_')
1210
1211                 # thumbnail image
1212                 if 'thumbnail_url' not in video_info:
1213                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1214                         video_thumbnail = ''
1215                 else:   # don't panic if we can't find it
1216                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1217
1218                 # upload date
1219                 upload_date = u'NA'
1220                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1221                 if mobj is not None:
1222                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1223                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1224                         for expression in format_expressions:
1225                                 try:
1226                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1227                                 except:
1228                                         pass
1229
1230                 # description
1231                 try:
1232                         lxml.etree
1233                 except NameError:
1234                         video_description = u'No description available.'
1235                         if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1236                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1237                                 if mobj is not None:
1238                                         video_description = mobj.group(1).decode('utf-8')
1239                 else:
1240                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1241                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1242                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1243                         # TODO use another parser
1244
1245                 # token
1246                 video_token = urllib.unquote_plus(video_info['token'][0])
1247
1248                 # Decide which formats to download
1249                 req_format = self._downloader.params.get('format', None)
1250
1251                 if 'fmt_url_map' in video_info and len(video_info['fmt_url_map']) >= 1:
1252                         url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
1253                         format_limit = self._downloader.params.get('format_limit', None)
1254                         if format_limit is not None and format_limit in self._available_formats:
1255                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1256                         else:
1257                                 format_list = self._available_formats
1258                         existing_formats = [x for x in format_list if x in url_map]
1259                         if len(existing_formats) == 0:
1260                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1261                                 return
1262                         if req_format is None:
1263                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1264                         elif req_format == '-1':
1265                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1266                         else:
1267                                 # Specific format
1268                                 if req_format not in url_map:
1269                                         self._downloader.trouble(u'ERROR: requested format not available')
1270                                         return
1271                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
1272
1273                 elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1274                         self.report_rtmp_download()
1275                         video_url_list = [(None, video_info['conn'][0])]
1276
1277                 else:
1278                         self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
1279                         return
1280
1281                 for format_param, video_real_url in video_url_list:
1282                         # At this point we have a new video
1283                         self._downloader.increment_downloads()
1284
1285                         # Extension
1286                         video_extension = self._video_extensions.get(format_param, 'flv')
1287
1288                         # Find the video URL in fmt_url_map or conn paramters
1289                         try:
1290                                 # Process video information
1291                                 self._downloader.process_info({
1292                                         'id':           video_id.decode('utf-8'),
1293                                         'url':          video_real_url.decode('utf-8'),
1294                                         'uploader':     video_uploader.decode('utf-8'),
1295                                         'upload_date':  upload_date,
1296                                         'title':        video_title,
1297                                         'stitle':       simple_title,
1298                                         'ext':          video_extension.decode('utf-8'),
1299                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1300                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1301                                         'description':  video_description,
1302                                         'player_url':   player_url,
1303                                 })
1304                         except UnavailableVideoError, err:
1305                                 self._downloader.trouble(u'\nERROR: unable to download video')
1306
1307
1308 class MetacafeIE(InfoExtractor):
1309         """Information Extractor for metacafe.com."""
1310
1311         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1312         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1313         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1314         _youtube_ie = None
1315
1316         def __init__(self, youtube_ie, downloader=None):
1317                 InfoExtractor.__init__(self, downloader)
1318                 self._youtube_ie = youtube_ie
1319
1320         @staticmethod
1321         def suitable(url):
1322                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1323
1324         def report_disclaimer(self):
1325                 """Report disclaimer retrieval."""
1326                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1327
1328         def report_age_confirmation(self):
1329                 """Report attempt to confirm age."""
1330                 self._downloader.to_screen(u'[metacafe] Confirming age')
1331
1332         def report_download_webpage(self, video_id):
1333                 """Report webpage download."""
1334                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1335
1336         def report_extraction(self, video_id):
1337                 """Report information extraction."""
1338                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1339
1340         def _real_initialize(self):
1341                 # Retrieve disclaimer
1342                 request = urllib2.Request(self._DISCLAIMER)
1343                 try:
1344                         self.report_disclaimer()
1345                         disclaimer = urllib2.urlopen(request).read()
1346                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1347                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1348                         return
1349
1350                 # Confirm age
1351                 disclaimer_form = {
1352                         'filters': '0',
1353                         'submit': "Continue - I'm over 18",
1354                         }
1355                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1356                 try:
1357                         self.report_age_confirmation()
1358                         disclaimer = urllib2.urlopen(request).read()
1359                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1360                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1361                         return
1362
1363         def _real_extract(self, url):
1364                 # Extract id and simplified title from URL
1365                 mobj = re.match(self._VALID_URL, url)
1366                 if mobj is None:
1367                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1368                         return
1369
1370                 video_id = mobj.group(1)
1371
1372                 # Check if video comes from YouTube
1373                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1374                 if mobj2 is not None:
1375                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1376                         return
1377
1378                 # At this point we have a new video
1379                 self._downloader.increment_downloads()
1380
1381                 simple_title = mobj.group(2).decode('utf-8')
1382
1383                 # Retrieve video webpage to extract further information
1384                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1385                 try:
1386                         self.report_download_webpage(video_id)
1387                         webpage = urllib2.urlopen(request).read()
1388                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1389                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1390                         return
1391
1392                 # Extract URL, uploader and title from webpage
1393                 self.report_extraction(video_id)
1394                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1395                 if mobj is not None:
1396                         mediaURL = urllib.unquote(mobj.group(1))
1397                         video_extension = mediaURL[-3:]
1398
1399                         # Extract gdaKey if available
1400                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1401                         if mobj is None:
1402                                 video_url = mediaURL
1403                         else:
1404                                 gdaKey = mobj.group(1)
1405                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1406                 else:
1407                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1408                         if mobj is None:
1409                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1410                                 return
1411                         vardict = parse_qs(mobj.group(1))
1412                         if 'mediaData' not in vardict:
1413                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1414                                 return
1415                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1416                         if mobj is None:
1417                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1418                                 return
1419                         mediaURL = mobj.group(1).replace('\\/', '/')
1420                         video_extension = mediaURL[-3:]
1421                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1422
1423                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1424                 if mobj is None:
1425                         self._downloader.trouble(u'ERROR: unable to extract title')
1426                         return
1427                 video_title = mobj.group(1).decode('utf-8')
1428                 video_title = sanitize_title(video_title)
1429
1430                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1431                 if mobj is None:
1432                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1433                         return
1434                 video_uploader = mobj.group(1)
1435
1436                 try:
1437                         # Process video information
1438                         self._downloader.process_info({
1439                                 'id':           video_id.decode('utf-8'),
1440                                 'url':          video_url.decode('utf-8'),
1441                                 'uploader':     video_uploader.decode('utf-8'),
1442                                 'upload_date':  u'NA',
1443                                 'title':        video_title,
1444                                 'stitle':       simple_title,
1445                                 'ext':          video_extension.decode('utf-8'),
1446                                 'format':       u'NA',
1447                                 'player_url':   None,
1448                         })
1449                 except UnavailableVideoError:
1450                         self._downloader.trouble(u'\nERROR: unable to download video')
1451
1452
1453 class DailymotionIE(InfoExtractor):
1454         """Information Extractor for Dailymotion"""
1455
1456         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1457
1458         def __init__(self, downloader=None):
1459                 InfoExtractor.__init__(self, downloader)
1460
1461         @staticmethod
1462         def suitable(url):
1463                 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1464
1465         def report_download_webpage(self, video_id):
1466                 """Report webpage download."""
1467                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1468
1469         def report_extraction(self, video_id):
1470                 """Report information extraction."""
1471                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1472
1473         def _real_initialize(self):
1474                 return
1475
1476         def _real_extract(self, url):
1477                 # Extract id and simplified title from URL
1478                 mobj = re.match(self._VALID_URL, url)
1479                 if mobj is None:
1480                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1481                         return
1482
1483                 # At this point we have a new video
1484                 self._downloader.increment_downloads()
1485                 video_id = mobj.group(1)
1486
1487                 simple_title = mobj.group(2).decode('utf-8')
1488                 video_extension = 'flv'
1489
1490                 # Retrieve video webpage to extract further information
1491                 request = urllib2.Request(url)
1492                 try:
1493                         self.report_download_webpage(video_id)
1494                         webpage = urllib2.urlopen(request).read()
1495                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1496                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1497                         return
1498
1499                 # Extract URL, uploader and title from webpage
1500                 self.report_extraction(video_id)
1501                 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1502                 if mobj is None:
1503                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1504                         return
1505                 mediaURL = urllib.unquote(mobj.group(1))
1506
1507                 # if needed add http://www.dailymotion.com/ if relative URL
1508
1509                 video_url = mediaURL
1510
1511                 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1512                 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1513                 if mobj is None:
1514                         self._downloader.trouble(u'ERROR: unable to extract title')
1515                         return
1516                 video_title = mobj.group(1).decode('utf-8')
1517                 video_title = sanitize_title(video_title)
1518
1519                 mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
1520                 if mobj is None:
1521                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1522                         return
1523                 video_uploader = mobj.group(1)
1524
1525                 try:
1526                         # Process video information
1527                         self._downloader.process_info({
1528                                 'id':           video_id.decode('utf-8'),
1529                                 'url':          video_url.decode('utf-8'),
1530                                 'uploader':     video_uploader.decode('utf-8'),
1531                                 'upload_date':  u'NA',
1532                                 'title':        video_title,
1533                                 'stitle':       simple_title,
1534                                 'ext':          video_extension.decode('utf-8'),
1535                                 'format':       u'NA',
1536                                 'player_url':   None,
1537                         })
1538                 except UnavailableVideoError:
1539                         self._downloader.trouble(u'\nERROR: unable to download video')
1540
1541 class GoogleIE(InfoExtractor):
1542         """Information extractor for video.google.com."""
1543
1544         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1545
1546         def __init__(self, downloader=None):
1547                 InfoExtractor.__init__(self, downloader)
1548
1549         @staticmethod
1550         def suitable(url):
1551                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1552
1553         def report_download_webpage(self, video_id):
1554                 """Report webpage download."""
1555                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1556
1557         def report_extraction(self, video_id):
1558                 """Report information extraction."""
1559                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1560
1561         def _real_initialize(self):
1562                 return
1563
1564         def _real_extract(self, url):
1565                 # Extract id from URL
1566                 mobj = re.match(self._VALID_URL, url)
1567                 if mobj is None:
1568                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1569                         return
1570
1571                 # At this point we have a new video
1572                 self._downloader.increment_downloads()
1573                 video_id = mobj.group(1)
1574
1575                 video_extension = 'mp4'
1576
1577                 # Retrieve video webpage to extract further information
1578                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1579                 try:
1580                         self.report_download_webpage(video_id)
1581                         webpage = urllib2.urlopen(request).read()
1582                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1583                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1584                         return
1585
1586                 # Extract URL, uploader, and title from webpage
1587                 self.report_extraction(video_id)
1588                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1589                 if mobj is None:
1590                         video_extension = 'flv'
1591                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1592                 if mobj is None:
1593                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1594                         return
1595                 mediaURL = urllib.unquote(mobj.group(1))
1596                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1597                 mediaURL = mediaURL.replace('\\x26', '\x26')
1598
1599                 video_url = mediaURL
1600
1601                 mobj = re.search(r'<title>(.*)</title>', webpage)
1602                 if mobj is None:
1603                         self._downloader.trouble(u'ERROR: unable to extract title')
1604                         return
1605                 video_title = mobj.group(1).decode('utf-8')
1606                 video_title = sanitize_title(video_title)
1607                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1608
1609                 # Extract video description
1610                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1611                 if mobj is None:
1612                         self._downloader.trouble(u'ERROR: unable to extract video description')
1613                         return
1614                 video_description = mobj.group(1).decode('utf-8')
1615                 if not video_description:
1616                         video_description = 'No description available.'
1617
1618                 # Extract video thumbnail
1619                 if self._downloader.params.get('forcethumbnail', False):
1620                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1621                         try:
1622                                 webpage = urllib2.urlopen(request).read()
1623                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1624                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1625                                 return
1626                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1627                         if mobj is None:
1628                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1629                                 return
1630                         video_thumbnail = mobj.group(1)
1631                 else:   # we need something to pass to process_info
1632                         video_thumbnail = ''
1633
1634
1635                 try:
1636                         # Process video information
1637                         self._downloader.process_info({
1638                                 'id':           video_id.decode('utf-8'),
1639                                 'url':          video_url.decode('utf-8'),
1640                                 'uploader':     u'NA',
1641                                 'upload_date':  u'NA',
1642                                 'title':        video_title,
1643                                 'stitle':       simple_title,
1644                                 'ext':          video_extension.decode('utf-8'),
1645                                 'format':       u'NA',
1646                                 'player_url':   None,
1647                         })
1648                 except UnavailableVideoError:
1649                         self._downloader.trouble(u'\nERROR: unable to download video')
1650
1651
1652 class PhotobucketIE(InfoExtractor):
1653         """Information extractor for photobucket.com."""
1654
1655         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1656
1657         def __init__(self, downloader=None):
1658                 InfoExtractor.__init__(self, downloader)
1659
1660         @staticmethod
1661         def suitable(url):
1662                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1663
1664         def report_download_webpage(self, video_id):
1665                 """Report webpage download."""
1666                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1667
1668         def report_extraction(self, video_id):
1669                 """Report information extraction."""
1670                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1671
1672         def _real_initialize(self):
1673                 return
1674
1675         def _real_extract(self, url):
1676                 # Extract id from URL
1677                 mobj = re.match(self._VALID_URL, url)
1678                 if mobj is None:
1679                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1680                         return
1681
1682                 # At this point we have a new video
1683                 self._downloader.increment_downloads()
1684                 video_id = mobj.group(1)
1685
1686                 video_extension = 'flv'
1687
1688                 # Retrieve video webpage to extract further information
1689                 request = urllib2.Request(url)
1690                 try:
1691                         self.report_download_webpage(video_id)
1692                         webpage = urllib2.urlopen(request).read()
1693                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1694                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1695                         return
1696
1697                 # Extract URL, uploader, and title from webpage
1698                 self.report_extraction(video_id)
1699                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1700                 if mobj is None:
1701                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1702                         return
1703                 mediaURL = urllib.unquote(mobj.group(1))
1704
1705                 video_url = mediaURL
1706
1707                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1708                 if mobj is None:
1709                         self._downloader.trouble(u'ERROR: unable to extract title')
1710                         return
1711                 video_title = mobj.group(1).decode('utf-8')
1712                 video_title = sanitize_title(video_title)
1713                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1714
1715                 video_uploader = mobj.group(2).decode('utf-8')
1716
1717                 try:
1718                         # Process video information
1719                         self._downloader.process_info({
1720                                 'id':           video_id.decode('utf-8'),
1721                                 'url':          video_url.decode('utf-8'),
1722                                 'uploader':     video_uploader,
1723                                 'upload_date':  u'NA',
1724                                 'title':        video_title,
1725                                 'stitle':       simple_title,
1726                                 'ext':          video_extension.decode('utf-8'),
1727                                 'format':       u'NA',
1728                                 'player_url':   None,
1729                         })
1730                 except UnavailableVideoError:
1731                         self._downloader.trouble(u'\nERROR: unable to download video')
1732
1733
1734 class YahooIE(InfoExtractor):
1735         """Information extractor for video.yahoo.com."""
1736
1737         # _VALID_URL matches all Yahoo! Video URLs
1738         # _VPAGE_URL matches only the extractable '/watch/' URLs
1739         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1740         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1741
1742         def __init__(self, downloader=None):
1743                 InfoExtractor.__init__(self, downloader)
1744
1745         @staticmethod
1746         def suitable(url):
1747                 return (re.match(YahooIE._VALID_URL, url) is not None)
1748
1749         def report_download_webpage(self, video_id):
1750                 """Report webpage download."""
1751                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1752
1753         def report_extraction(self, video_id):
1754                 """Report information extraction."""
1755                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1756
1757         def _real_initialize(self):
1758                 return
1759
1760         def _real_extract(self, url, new_video=True):
1761                 # Extract ID from URL
1762                 mobj = re.match(self._VALID_URL, url)
1763                 if mobj is None:
1764                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1765                         return
1766
1767                 # At this point we have a new video
1768                 self._downloader.increment_downloads()
1769                 video_id = mobj.group(2)
1770                 video_extension = 'flv'
1771
1772                 # Rewrite valid but non-extractable URLs as
1773                 # extractable English language /watch/ URLs
1774                 if re.match(self._VPAGE_URL, url) is None:
1775                         request = urllib2.Request(url)
1776                         try:
1777                                 webpage = urllib2.urlopen(request).read()
1778                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1779                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1780                                 return
1781
1782                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1783                         if mobj is None:
1784                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1785                                 return
1786                         yahoo_id = mobj.group(1)
1787
1788                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1789                         if mobj is None:
1790                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1791                                 return
1792                         yahoo_vid = mobj.group(1)
1793
1794                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1795                         return self._real_extract(url, new_video=False)
1796
1797                 # Retrieve video webpage to extract further information
1798                 request = urllib2.Request(url)
1799                 try:
1800                         self.report_download_webpage(video_id)
1801                         webpage = urllib2.urlopen(request).read()
1802                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1803                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1804                         return
1805
1806                 # Extract uploader and title from webpage
1807                 self.report_extraction(video_id)
1808                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1809                 if mobj is None:
1810                         self._downloader.trouble(u'ERROR: unable to extract video title')
1811                         return
1812                 video_title = mobj.group(1).decode('utf-8')
1813                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1814
1815                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1816                 if mobj is None:
1817                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1818                         return
1819                 video_uploader = mobj.group(1).decode('utf-8')
1820
1821                 # Extract video thumbnail
1822                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1823                 if mobj is None:
1824                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1825                         return
1826                 video_thumbnail = mobj.group(1).decode('utf-8')
1827
1828                 # Extract video description
1829                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1830                 if mobj is None:
1831                         self._downloader.trouble(u'ERROR: unable to extract video description')
1832                         return
1833                 video_description = mobj.group(1).decode('utf-8')
1834                 if not video_description: video_description = 'No description available.'
1835
1836                 # Extract video height and width
1837                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1838                 if mobj is None:
1839                         self._downloader.trouble(u'ERROR: unable to extract video height')
1840                         return
1841                 yv_video_height = mobj.group(1)
1842
1843                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1844                 if mobj is None:
1845                         self._downloader.trouble(u'ERROR: unable to extract video width')
1846                         return
1847                 yv_video_width = mobj.group(1)
1848
1849                 # Retrieve video playlist to extract media URL
1850                 # I'm not completely sure what all these options are, but we
1851                 # seem to need most of them, otherwise the server sends a 401.
1852                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1853                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1854                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1855                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1856                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1857                 try:
1858                         self.report_download_webpage(video_id)
1859                         webpage = urllib2.urlopen(request).read()
1860                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1861                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1862                         return
1863
1864                 # Extract media URL from playlist XML
1865                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1866                 if mobj is None:
1867                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1868                         return
1869                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1870                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1871
1872                 try:
1873                         # Process video information
1874                         self._downloader.process_info({
1875                                 'id':           video_id.decode('utf-8'),
1876                                 'url':          video_url,
1877                                 'uploader':     video_uploader,
1878                                 'upload_date':  u'NA',
1879                                 'title':        video_title,
1880                                 'stitle':       simple_title,
1881                                 'ext':          video_extension.decode('utf-8'),
1882                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1883                                 'description':  video_description,
1884                                 'thumbnail':    video_thumbnail,
1885                                 'description':  video_description,
1886                                 'player_url':   None,
1887                         })
1888                 except UnavailableVideoError:
1889                         self._downloader.trouble(u'\nERROR: unable to download video')
1890
1891
1892 class GenericIE(InfoExtractor):
1893         """Generic last-resort information extractor."""
1894
1895         def __init__(self, downloader=None):
1896                 InfoExtractor.__init__(self, downloader)
1897
1898         @staticmethod
1899         def suitable(url):
1900                 return True
1901
1902         def report_download_webpage(self, video_id):
1903                 """Report webpage download."""
1904                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1905                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1906
1907         def report_extraction(self, video_id):
1908                 """Report information extraction."""
1909                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1910
1911         def _real_initialize(self):
1912                 return
1913
1914         def _real_extract(self, url):
1915                 # At this point we have a new video
1916                 self._downloader.increment_downloads()
1917
1918                 video_id = url.split('/')[-1]
1919                 request = urllib2.Request(url)
1920                 try:
1921                         self.report_download_webpage(video_id)
1922                         webpage = urllib2.urlopen(request).read()
1923                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1924                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1925                         return
1926                 except ValueError, err:
1927                         # since this is the last-resort InfoExtractor, if
1928                         # this error is thrown, it'll be thrown here
1929                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1930                         return
1931
1932                 self.report_extraction(video_id)
1933                 # Start with something easy: JW Player in SWFObject
1934                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1935                 if mobj is None:
1936                         # Broaden the search a little bit
1937                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1938                 if mobj is None:
1939                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1940                         return
1941
1942                 # It's possible that one of the regexes
1943                 # matched, but returned an empty group:
1944                 if mobj.group(1) is None:
1945                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1946                         return
1947
1948                 video_url = urllib.unquote(mobj.group(1))
1949                 video_id  = os.path.basename(video_url)
1950
1951                 # here's a fun little line of code for you:
1952                 video_extension = os.path.splitext(video_id)[1][1:]
1953                 video_id        = os.path.splitext(video_id)[0]
1954
1955                 # it's tempting to parse this further, but you would
1956                 # have to take into account all the variations like
1957                 #   Video Title - Site Name
1958                 #   Site Name | Video Title
1959                 #   Video Title - Tagline | Site Name
1960                 # and so on and so forth; it's just not practical
1961                 mobj = re.search(r'<title>(.*)</title>', webpage)
1962                 if mobj is None:
1963                         self._downloader.trouble(u'ERROR: unable to extract title')
1964                         return
1965                 video_title = mobj.group(1).decode('utf-8')
1966                 video_title = sanitize_title(video_title)
1967                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1968
1969                 # video uploader is domain name
1970                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1971                 if mobj is None:
1972                         self._downloader.trouble(u'ERROR: unable to extract title')
1973                         return
1974                 video_uploader = mobj.group(1).decode('utf-8')
1975
1976                 try:
1977                         # Process video information
1978                         self._downloader.process_info({
1979                                 'id':           video_id.decode('utf-8'),
1980                                 'url':          video_url.decode('utf-8'),
1981                                 'uploader':     video_uploader,
1982                                 'upload_date':  u'NA',
1983                                 'title':        video_title,
1984                                 'stitle':       simple_title,
1985                                 'ext':          video_extension.decode('utf-8'),
1986                                 'format':       u'NA',
1987                                 'player_url':   None,
1988                         })
1989                 except UnavailableVideoError, err:
1990                         self._downloader.trouble(u'\nERROR: unable to download video')
1991
1992
class YoutubeSearchIE(InfoExtractor):
	"""Information Extractor for YouTube search queries."""
	# Queries look like "ytsearch:foo", "ytsearch5:foo" or "ytsearchall:foo".
	_VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
	_TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
	# Matches the href of each result link on a search results page.
	_VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
	# Presence of a "Next" link means more result pages follow.
	_MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
	_youtube_ie = None
	# Hard cap applied when the query uses the "all" prefix.
	_max_youtube_results = 1000

	def __init__(self, youtube_ie, downloader=None):
		"""Store the YoutubeIE instance used to download each result."""
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	@staticmethod
	def suitable(url):
		return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)

	def report_download_page(self, query, pagenum):
		"""Report attempt to download playlist page with given number."""
		query = query.decode(preferredencoding())
		self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

	def _real_initialize(self):
		self._youtube_ie.initialize()

	def _real_extract(self, query):
		"""Parse the ytsearch prefix and download the requested number of results."""
		mobj = re.match(self._VALID_QUERY, query)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
			return

		prefix, query = query.split(':')
		prefix = prefix[8:]
		query  = query.encode('utf-8')
		if prefix == '':
			# Bare "ytsearch:": download the first result only.
			self._download_n_results(query, 1)
			return
		elif prefix == 'all':
			self._download_n_results(query, self._max_youtube_results)
			return
		else:
			try:
				n = long(prefix)
				if n <= 0:
					self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
					return
				elif n > self._max_youtube_results:
					# Clamp oversized requests to the supported maximum.
					self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
					n = self._max_youtube_results
				self._download_n_results(query, n)
				return
			except ValueError: # parsing prefix as integer fails
				self._download_n_results(query, 1)
				return

	def _download_n_results(self, query, n):
		"""Downloads a specified number of results for a query"""

		video_ids = []
		already_seen = set()
		pagenum = 1

		while True:
			self.report_download_page(query, pagenum)
			result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
			request = urllib2.Request(result_url)
			try:
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
				return

			# Extract video identifiers
			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				# Slice out the matched 'href="/watch?v=ID"' text and keep
				# the id between the second '=' and the trailing quote.
				video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
				if video_id not in already_seen:
					video_ids.append(video_id)
					already_seen.add(video_id)
					if len(video_ids) == n:
						# Specified n videos reached
						for id in video_ids:
							self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
						return

			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
				# No "Next" link: download whatever was collected so far.
				for id in video_ids:
					self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
				return

			pagenum = pagenum + 1
2083
2084 class GoogleSearchIE(InfoExtractor):
2085         """Information Extractor for Google Video search queries."""
2086         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
2087         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2088         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2089         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2090         _google_ie = None
2091         _max_google_results = 1000
2092
2093         def __init__(self, google_ie, downloader=None):
2094                 InfoExtractor.__init__(self, downloader)
2095                 self._google_ie = google_ie
2096
2097         @staticmethod
2098         def suitable(url):
2099                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
2100
2101         def report_download_page(self, query, pagenum):
2102                 """Report attempt to download playlist page with given number."""
2103                 query = query.decode(preferredencoding())
2104                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2105
2106         def _real_initialize(self):
2107                 self._google_ie.initialize()
2108
2109         def _real_extract(self, query):
2110                 mobj = re.match(self._VALID_QUERY, query)
2111                 if mobj is None:
2112                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2113                         return
2114
2115                 prefix, query = query.split(':')
2116                 prefix = prefix[8:]
2117                 query  = query.encode('utf-8')
2118                 if prefix == '':
2119                         self._download_n_results(query, 1)
2120                         return
2121                 elif prefix == 'all':
2122                         self._download_n_results(query, self._max_google_results)
2123                         return
2124                 else:
2125                         try:
2126                                 n = long(prefix)
2127                                 if n <= 0:
2128                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2129                                         return
2130                                 elif n > self._max_google_results:
2131                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
2132                                         n = self._max_google_results
2133                                 self._download_n_results(query, n)
2134                                 return
2135                         except ValueError: # parsing prefix as integer fails
2136                                 self._download_n_results(query, 1)
2137                                 return
2138
2139         def _download_n_results(self, query, n):
2140                 """Downloads a specified number of results for a query"""
2141
2142                 video_ids = []
2143                 already_seen = set()
2144                 pagenum = 1
2145
2146                 while True:
2147                         self.report_download_page(query, pagenum)
2148                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2149                         request = urllib2.Request(result_url)
2150                         try:
2151                                 page = urllib2.urlopen(request).read()
2152                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2153                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2154                                 return
2155
2156                         # Extract video identifiers
2157                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2158                                 video_id = mobj.group(1)
2159                                 if video_id not in already_seen:
2160                                         video_ids.append(video_id)
2161                                         already_seen.add(video_id)
2162                                         if len(video_ids) == n:
2163                                                 # Specified n videos reached
2164                                                 for id in video_ids:
2165                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2166                                                 return
2167
2168                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2169                                 for id in video_ids:
2170                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2171                                 return
2172
2173                         pagenum = pagenum + 1
2174
class YahooSearchIE(InfoExtractor):
	"""Information Extractor for Yahoo! Video search queries."""
	# Queries look like "yvsearch:foo", "yvsearch5:foo" or "yvsearchall:foo".
	_VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
	_TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
	# Captures the "uploader_id/video_id" pair of each watch-page link.
	_VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
	# Presence of a "Next" label means more result pages follow.
	_MORE_PAGES_INDICATOR = r'\s*Next'
	_yahoo_ie = None
	# Hard cap applied when the query uses the "all" prefix.
	_max_yahoo_results = 1000

	def __init__(self, yahoo_ie, downloader=None):
		"""Store the YahooIE instance used to download each result."""
		InfoExtractor.__init__(self, downloader)
		self._yahoo_ie = yahoo_ie

	@staticmethod
	def suitable(url):
		return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)

	def report_download_page(self, query, pagenum):
		"""Report attempt to download playlist page with given number."""
		query = query.decode(preferredencoding())
		self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

	def _real_initialize(self):
		self._yahoo_ie.initialize()

	def _real_extract(self, query):
		"""Parse the yvsearch prefix and download the requested number of results."""
		mobj = re.match(self._VALID_QUERY, query)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
			return

		prefix, query = query.split(':')
		prefix = prefix[8:]
		query  = query.encode('utf-8')
		if prefix == '':
			# Bare "yvsearch:": download the first result only.
			self._download_n_results(query, 1)
			return
		elif prefix == 'all':
			self._download_n_results(query, self._max_yahoo_results)
			return
		else:
			try:
				n = long(prefix)
				if n <= 0:
					self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
					return
				elif n > self._max_yahoo_results:
					# Clamp oversized requests to the supported maximum.
					self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
					n = self._max_yahoo_results
				self._download_n_results(query, n)
				return
			except ValueError: # parsing prefix as integer fails
				self._download_n_results(query, 1)
				return

	def _download_n_results(self, query, n):
		"""Downloads a specified number of results for a query"""

		video_ids = []
		already_seen = set()
		pagenum = 1

		while True:
			self.report_download_page(query, pagenum)
			result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
			request = urllib2.Request(result_url)
			try:
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
				return

			# Extract video identifiers
			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				video_id = mobj.group(1)
				if video_id not in already_seen:
					video_ids.append(video_id)
					already_seen.add(video_id)
					if len(video_ids) == n:
						# Specified n videos reached
						for id in video_ids:
							self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
						return

			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
				# No "Next" link: download whatever was collected so far.
				for id in video_ids:
					self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
				return

			pagenum = pagenum + 1
2265
2266 class YoutubePlaylistIE(InfoExtractor):
2267         """Information Extractor for YouTube playlists."""
2268
2269         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist)\?.*?(p|a)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2270         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2271         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2272         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2273         _youtube_ie = None
2274
2275         def __init__(self, youtube_ie, downloader=None):
2276                 InfoExtractor.__init__(self, downloader)
2277                 self._youtube_ie = youtube_ie
2278
2279         @staticmethod
2280         def suitable(url):
2281                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2282
2283         def report_download_page(self, playlist_id, pagenum):
2284                 """Report attempt to download playlist page with given number."""
2285                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2286
2287         def _real_initialize(self):
2288                 self._youtube_ie.initialize()
2289
2290         def _real_extract(self, url):
2291                 # Extract playlist id
2292                 mobj = re.match(self._VALID_URL, url)
2293                 if mobj is None:
2294                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2295                         return
2296
2297                 # Single video case
2298                 if mobj.group(3) is not None:
2299                         self._youtube_ie.extract(mobj.group(3))
2300                         return
2301
2302                 # Download playlist pages
2303                 # prefix is 'p' as default for playlists but there are other types that need extra care
2304                 playlist_prefix = mobj.group(1)
2305                 if playlist_prefix == 'a':
2306                         playlist_access = 'artist'
2307                 else:
2308                         playlist_prefix = 'p'
2309                         playlist_access = 'view_play_list'
2310                 playlist_id = mobj.group(2)
2311                 video_ids = []
2312                 pagenum = 1
2313
2314                 while True:
2315                         self.report_download_page(playlist_id, pagenum)
2316                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2317                         try:
2318                                 page = urllib2.urlopen(request).read()
2319                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2320                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2321                                 return
2322
2323                         # Extract video identifiers
2324                         ids_in_page = []
2325                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2326                                 if mobj.group(1) not in ids_in_page:
2327                                         ids_in_page.append(mobj.group(1))
2328                         video_ids.extend(ids_in_page)
2329
2330                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2331                                 break
2332                         pagenum = pagenum + 1
2333
2334                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2335                 playlistend = self._downloader.params.get('playlistend', -1)
2336                 video_ids = video_ids[playliststart:playlistend]
2337
2338                 for id in video_ids:
2339                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2340                 return
2341
2342 class YoutubeUserIE(InfoExtractor):
2343         """Information Extractor for YouTube users."""
2344
2345         _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2346         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2347         _GDATA_PAGE_SIZE = 50
2348         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2349         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2350         _youtube_ie = None
2351
2352         def __init__(self, youtube_ie, downloader=None):
2353                 InfoExtractor.__init__(self, downloader)
2354                 self._youtube_ie = youtube_ie
2355
2356         @staticmethod
2357         def suitable(url):
2358                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2359
2360         def report_download_page(self, username, start_index):
2361                 """Report attempt to download user page."""
2362                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2363                                            (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2364
2365         def _real_initialize(self):
2366                 self._youtube_ie.initialize()
2367
2368         def _real_extract(self, url):
2369                 # Extract username
2370                 mobj = re.match(self._VALID_URL, url)
2371                 if mobj is None:
2372                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2373                         return
2374
2375                 username = mobj.group(1)
2376
2377                 # Download video ids using YouTube Data API. Result size per
2378                 # query is limited (currently to 50 videos) so we need to query
2379                 # page by page until there are no video ids - it means we got
2380                 # all of them.
2381
2382                 video_ids = []
2383                 pagenum = 0
2384
2385                 while True:
2386                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2387                         self.report_download_page(username, start_index)
2388
2389                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2390
2391                         try:
2392                                 page = urllib2.urlopen(request).read()
2393                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2394                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2395                                 return
2396
2397                         # Extract video identifiers
2398                         ids_in_page = []
2399
2400                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2401                                 if mobj.group(1) not in ids_in_page:
2402                                         ids_in_page.append(mobj.group(1))
2403
2404                         video_ids.extend(ids_in_page)
2405
2406                         # A little optimization - if current page is not
2407                         # "full", ie. does not contain PAGE_SIZE video ids then
2408                         # we can assume that this page is the last one - there
2409                         # are no more ids on further pages - no need to query
2410                         # again.
2411
2412                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2413                                 break
2414
2415                         pagenum += 1
2416
2417                 all_ids_count = len(video_ids)
2418                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2419                 playlistend = self._downloader.params.get('playlistend', -1)
2420
2421                 if playlistend == -1:
2422                         video_ids = video_ids[playliststart:]
2423                 else:
2424                         video_ids = video_ids[playliststart:playlistend]
2425                         
2426                 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2427                                            (username, all_ids_count, len(video_ids)))
2428
2429                 for video_id in video_ids:
2430                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2431
2432
2433 class DepositFilesIE(InfoExtractor):
2434         """Information extractor for depositfiles.com"""
2435
2436         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2437
2438         def __init__(self, downloader=None):
2439                 InfoExtractor.__init__(self, downloader)
2440
2441         @staticmethod
2442         def suitable(url):
2443                 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2444
2445         def report_download_webpage(self, file_id):
2446                 """Report webpage download."""
2447                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2448
2449         def report_extraction(self, file_id):
2450                 """Report information extraction."""
2451                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2452
2453         def _real_initialize(self):
2454                 return
2455
2456         def _real_extract(self, url):
2457                 # At this point we have a new file
2458                 self._downloader.increment_downloads()
2459
2460                 file_id = url.split('/')[-1]
2461                 # Rebuild url in english locale
2462                 url = 'http://depositfiles.com/en/files/' + file_id
2463
2464                 # Retrieve file webpage with 'Free download' button pressed
2465                 free_download_indication = { 'gateway_result' : '1' }
2466                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2467                 try:
2468                         self.report_download_webpage(file_id)
2469                         webpage = urllib2.urlopen(request).read()
2470                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2471                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2472                         return
2473
2474                 # Search for the real file URL
2475                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2476                 if (mobj is None) or (mobj.group(1) is None):
2477                         # Try to figure out reason of the error.
2478                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2479                         if (mobj is not None) and (mobj.group(1) is not None):
2480                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2481                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2482                         else:
2483                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2484                         return
2485
2486                 file_url = mobj.group(1)
2487                 file_extension = os.path.splitext(file_url)[1][1:]
2488
2489                 # Search for file title
2490                 mobj = re.search(r'<b title="(.*?)">', webpage)
2491                 if mobj is None:
2492                         self._downloader.trouble(u'ERROR: unable to extract title')
2493                         return
2494                 file_title = mobj.group(1).decode('utf-8')
2495
2496                 try:
2497                         # Process file information
2498                         self._downloader.process_info({
2499                                 'id':           file_id.decode('utf-8'),
2500                                 'url':          file_url.decode('utf-8'),
2501                                 'uploader':     u'NA',
2502                                 'upload_date':  u'NA',
2503                                 'title':        file_title,
2504                                 'stitle':       file_title,
2505                                 'ext':          file_extension.decode('utf-8'),
2506                                 'format':       u'NA',
2507                                 'player_url':   None,
2508                         })
2509                 except UnavailableVideoError, err:
2510                         self._downloader.trouble(u'ERROR: unable to download file')
2511
2512 class FacebookIE(InfoExtractor):
2513         """Information Extractor for Facebook"""
2514
2515         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2516         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2517         _NETRC_MACHINE = 'facebook'
2518         _available_formats = ['highqual', 'lowqual']
2519         _video_extensions = {
2520                 'highqual': 'mp4',
2521                 'lowqual': 'mp4',
2522         }
2523
2524         def __init__(self, downloader=None):
2525                 InfoExtractor.__init__(self, downloader)
2526
2527         @staticmethod
2528         def suitable(url):
2529                 return (re.match(FacebookIE._VALID_URL, url) is not None)
2530
2531         def _reporter(self, message):
2532                 """Add header and report message."""
2533                 self._downloader.to_screen(u'[facebook] %s' % message)
2534
2535         def report_login(self):
2536                 """Report attempt to log in."""
2537                 self._reporter(u'Logging in')
2538
2539         def report_video_webpage_download(self, video_id):
2540                 """Report attempt to download video webpage."""
2541                 self._reporter(u'%s: Downloading video webpage' % video_id)
2542
2543         def report_information_extraction(self, video_id):
2544                 """Report attempt to extract video information."""
2545                 self._reporter(u'%s: Extracting video information' % video_id)
2546
2547         def _parse_page(self, video_webpage):
2548                 """Extract video information from page"""
2549                 # General data
2550                 data = {'title': r'class="video_title datawrap">(.*?)</',
2551                         'description': r'<div class="datawrap">(.*?)</div>',
2552                         'owner': r'\("video_owner_name", "(.*?)"\)',
2553                         'upload_date': r'data-date="(.*?)"',
2554                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2555                         }
2556                 video_info = {}
2557                 for piece in data.keys():
2558                         mobj = re.search(data[piece], video_webpage)
2559                         if mobj is not None:
2560                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2561
2562                 # Video urls
2563                 video_urls = {}
2564                 for fmt in self._available_formats:
2565                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2566                         if mobj is not None:
2567                                 # URL is in a Javascript segment inside an escaped Unicode format within
2568                                 # the generally utf-8 page
2569                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2570                 video_info['video_urls'] = video_urls
2571
2572                 return video_info
2573
2574         def _real_initialize(self):
2575                 if self._downloader is None:
2576                         return
2577
2578                 useremail = None
2579                 password = None
2580                 downloader_params = self._downloader.params
2581
2582                 # Attempt to use provided username and password or .netrc data
2583                 if downloader_params.get('username', None) is not None:
2584                         useremail = downloader_params['username']
2585                         password = downloader_params['password']
2586                 elif downloader_params.get('usenetrc', False):
2587                         try:
2588                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2589                                 if info is not None:
2590                                         useremail = info[0]
2591                                         password = info[2]
2592                                 else:
2593                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2594                         except (IOError, netrc.NetrcParseError), err:
2595                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2596                                 return
2597
2598                 if useremail is None:
2599                         return
2600
2601                 # Log in
2602                 login_form = {
2603                         'email': useremail,
2604                         'pass': password,
2605                         'login': 'Log+In'
2606                         }
2607                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2608                 try:
2609                         self.report_login()
2610                         login_results = urllib2.urlopen(request).read()
2611                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2612                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2613                                 return
2614                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2615                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2616                         return
2617
2618         def _real_extract(self, url):
2619                 mobj = re.match(self._VALID_URL, url)
2620                 if mobj is None:
2621                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2622                         return
2623                 video_id = mobj.group('ID')
2624
2625                 # Get video webpage
2626                 self.report_video_webpage_download(video_id)
2627                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2628                 try:
2629                         page = urllib2.urlopen(request)
2630                         video_webpage = page.read()
2631                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2632                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2633                         return
2634
2635                 # Start extracting information
2636                 self.report_information_extraction(video_id)
2637
2638                 # Extract information
2639                 video_info = self._parse_page(video_webpage)
2640
2641                 # uploader
2642                 if 'owner' not in video_info:
2643                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2644                         return
2645                 video_uploader = video_info['owner']
2646
2647                 # title
2648                 if 'title' not in video_info:
2649                         self._downloader.trouble(u'ERROR: unable to extract video title')
2650                         return
2651                 video_title = video_info['title']
2652                 video_title = video_title.decode('utf-8')
2653                 video_title = sanitize_title(video_title)
2654
2655                 # simplified title
2656                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2657                 simple_title = simple_title.strip(ur'_')
2658
2659                 # thumbnail image
2660                 if 'thumbnail' not in video_info:
2661                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2662                         video_thumbnail = ''
2663                 else:
2664                         video_thumbnail = video_info['thumbnail']
2665
2666                 # upload date
2667                 upload_date = u'NA'
2668                 if 'upload_date' in video_info:
2669                         upload_time = video_info['upload_date']
2670                         timetuple = email.utils.parsedate_tz(upload_time)
2671                         if timetuple is not None:
2672                                 try:
2673                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2674                                 except:
2675                                         pass
2676
2677                 # description
2678                 video_description = video_info.get('description', 'No description available.')
2679
2680                 url_map = video_info['video_urls']
2681                 if len(url_map.keys()) > 0:
2682                         # Decide which formats to download
2683                         req_format = self._downloader.params.get('format', None)
2684                         format_limit = self._downloader.params.get('format_limit', None)
2685
2686                         if format_limit is not None and format_limit in self._available_formats:
2687                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2688                         else:
2689                                 format_list = self._available_formats
2690                         existing_formats = [x for x in format_list if x in url_map]
2691                         if len(existing_formats) == 0:
2692                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2693                                 return
2694                         if req_format is None:
2695                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2696                         elif req_format == '-1':
2697                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2698                         else:
2699                                 # Specific format
2700                                 if req_format not in url_map:
2701                                         self._downloader.trouble(u'ERROR: requested format not available')
2702                                         return
2703                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2704
2705                 for format_param, video_real_url in video_url_list:
2706
2707                         # At this point we have a new video
2708                         self._downloader.increment_downloads()
2709
2710                         # Extension
2711                         video_extension = self._video_extensions.get(format_param, 'mp4')
2712
2713                         # Find the video URL in fmt_url_map or conn paramters
2714                         try:
2715                                 # Process video information
2716                                 self._downloader.process_info({
2717                                         'id':           video_id.decode('utf-8'),
2718                                         'url':          video_real_url.decode('utf-8'),
2719                                         'uploader':     video_uploader.decode('utf-8'),
2720                                         'upload_date':  upload_date,
2721                                         'title':        video_title,
2722                                         'stitle':       simple_title,
2723                                         'ext':          video_extension.decode('utf-8'),
2724                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2725                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2726                                         'description':  video_description.decode('utf-8'),
2727                                         'player_url':   None,
2728                                 })
2729                         except UnavailableVideoError, err:
2730                                 self._downloader.trouble(u'\nERROR: unable to download video')
2731
2732 class BlipTVIE(InfoExtractor):
2733         """Information extractor for blip.tv"""
2734
2735         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip.tv(/.+)$'
2736         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2737
2738         @staticmethod
2739         def suitable(url):
2740                 return (re.match(BlipTVIE._VALID_URL, url) is not None)
2741
2742         def report_extraction(self, file_id):
2743                 """Report information extraction."""
2744                 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2745
2746         def _simplify_title(self, title):
2747                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2748                 res = res.strip(ur'_')
2749                 return res
2750
2751         def _real_extract(self, url):
2752                 mobj = re.match(self._VALID_URL, url)
2753                 if mobj is None:
2754                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2755                         return
2756
2757                 json_url = url + ('&' if '?' in url else '?') + 'skin=json&version=2&no_wrap=1'
2758                 request = urllib2.Request(json_url)
2759                 self.report_extraction(mobj.group(1))
2760                 try:
2761                         json_code = urllib2.urlopen(request).read()
2762                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2763                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2764                         return
2765                 try:
2766                         json_data = json.loads(json_code)
2767                         data = json_data['Post'] if 'Post' in json_data else json_data
2768
2769                         upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2770                         video_url = data['media']['url']
2771                         umobj = re.match(self._URL_EXT, video_url)
2772                         if umobj is None:
2773                                 raise ValueError('Can not determine filename extension')
2774                         ext = umobj.group(1)
2775
2776                         self._downloader.increment_downloads()
2777
2778                         info = {
2779                                 'id': data['item_id'],
2780                                 'url': video_url,
2781                                 'uploader': data['display_name'],
2782                                 'upload_date': upload_date,
2783                                 'title': data['title'],
2784                                 'stitle': self._simplify_title(data['title']),
2785                                 'ext': ext,
2786                                 'format': data['media']['mimeType'],
2787                                 'thumbnail': data['thumbnailUrl'],
2788                                 'description': data['description'],
2789                                 'player_url': data['embedUrl']
2790                         }
2791                 except (ValueError,KeyError), err:
2792                         self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2793                         return
2794
2795                 try:
2796                         self._downloader.process_info(info)
2797                 except UnavailableVideoError, err:
2798                         self._downloader.trouble(u'\nERROR: unable to download video')
2799
2800
class PostProcessor(object):
	"""Base class for all post processors.

	Instances are attached to a downloader through its
	add_post_processor() method. After every successful download the
	downloader walks its chain of post processors, calling run() on each
	one: the first call receives an initial information dictionary and
	every later call receives whatever the previous run() returned.
	A run() returning None stops the rest of the chain.

	Like InfoExtractor objects, post processors follow a "mutual
	registration" scheme with their downloader.
	"""

	# Downloader this post processor is registered with.
	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach this post processor to the given downloader."""
		self._downloader = downloader

	def run(self, information):
		"""Process a finished download.

		The "information" argument is a dictionary shaped like the ones
		composed by InfoExtractors, extended with a "filepath" key that
		points to the downloaded file.

		Returning None stops the postprocessing chain; returning an
		information dictionary (possibly with some fields changed)
		passes it on to the next post processor. Implementations may
		also raise a PostProcessingError, which the calling downloader
		takes into account.
		"""
		# Default behaviour: identity transform; subclasses override.
		return information
2846
class FFmpegExtractAudioPP(PostProcessor):
	"""Post processor that extracts the audio track of a downloaded video.

	Uses ffprobe to detect the source audio codec and ffmpeg to copy
	(losslessly when possible) or transcode the audio stream, then
	removes the original video file.
	"""

	def __init__(self, downloader=None, preferredcodec=None):
		PostProcessor.__init__(self, downloader)
		# 'best' means: keep the source codec if it is aac/mp3,
		# otherwise transcode to mp3.
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec

	@staticmethod
	def get_audio_codec(path):
		"""Return the audio codec name of the file at path, or None on failure."""
		try:
			cmd = ['ffprobe', '-show_streams', '--', path]
			# FIX: use open() instead of the Python2-only file() builtin and
			# close the devnull handle deterministically (it used to leak).
			with open(os.path.devnull, 'w') as devnull:
				handle = subprocess.Popen(cmd, stderr=devnull, stdout=subprocess.PIPE)
				output = handle.communicate()[0]
			if handle.wait() != 0:
				return None
		except (IOError, OSError):
			return None
		# Scan ffprobe's stream dump: remember the last codec_name seen and
		# report it once the matching codec_type=audio line appears.
		audio_codec = None
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Run ffmpeg to write the audio of path into out_path; True on success."""
		try:
			cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
			# FIX: close the devnull handle deterministically (it used to leak).
			with open(os.path.devnull, 'w') as devnull:
				ret = subprocess.call(cmd, stdout=devnull, stderr=subprocess.STDOUT)
			return (ret == 0)
		except (IOError, OSError):
			return False

	def run(self, information):
		"""Convert the downloaded file and return updated info, or None on error."""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
			if filecodec == 'aac' or filecodec == 'mp3':
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					# Raw AAC needs an ADTS container to be playable.
					more_opts = ['-f', 'adts']
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = ['-ab', '128k']
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = ['-ab', '128k']
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']

		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		status = self.run_ffmpeg(path, new_path, acodec, more_opts)

		if not status:
			self._downloader.to_stderr(u'WARNING: error running ffmpeg')
			return None

		# Remove the original file only after a successful conversion.
		try:
			os.remove(path)
		except (IOError, OSError):
			self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
			return None

		information['filepath'] = new_path
		return information
2928
2929 ### MAIN PROGRAM ###
2930 if __name__ == '__main__':
2931         try:
2932                 # Modules needed only when running the main program
2933                 import getpass
2934                 import optparse
2935
2936                 # Function to update the program file with the latest version from the repository.
2937                 def update_self(downloader, filename):
2938                         # Note: downloader only used for options
2939                         if not os.access(filename, os.W_OK):
2940                                 sys.exit('ERROR: no write permissions on %s' % filename)
2941
2942                         downloader.to_screen('Updating to latest stable version...')
2943                         try:
2944                                 latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2945                                 latest_version = urllib.urlopen(latest_url).read().strip()
2946                                 prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2947                                 newcontent = urllib.urlopen(prog_url).read()
2948                         except (IOError, OSError), err:
2949                                 sys.exit('ERROR: unable to download latest version')
2950                         try:
2951                                 stream = open(filename, 'w')
2952                                 stream.write(newcontent)
2953                                 stream.close()
2954                         except (IOError, OSError), err:
2955                                 sys.exit('ERROR: unable to overwrite current version')
2956                         downloader.to_screen('Updated to version %s' % latest_version)
2957
2958                 # Parse command line
2959                 parser = optparse.OptionParser(
2960                         usage='Usage: %prog [options] url...',
2961                         version='2011.07.09-phihag',
2962                         conflict_handler='resolve',
2963                 )
2964
2965                 parser.add_option('-h', '--help',
2966                                 action='help', help='print this help text and exit')
2967                 parser.add_option('-v', '--version',
2968                                 action='version', help='print program version and exit')
2969                 parser.add_option('-U', '--update',
2970                                 action='store_true', dest='update_self', help='update this program to latest stable version')
2971                 parser.add_option('-i', '--ignore-errors',
2972                                 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2973                 parser.add_option('-r', '--rate-limit',
2974                                 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2975                 parser.add_option('-R', '--retries',
2976                                 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2977                 parser.add_option('--playlist-start',
2978                                 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2979                 parser.add_option('--playlist-end',
2980                                 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
2981                 parser.add_option('--dump-user-agent',
2982                                 action='store_true', dest='dump_user_agent',
2983                                 help='display the current browser identification', default=False)
2984
2985                 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2986                 authentication.add_option('-u', '--username',
2987                                 dest='username', metavar='USERNAME', help='account username')
2988                 authentication.add_option('-p', '--password',
2989                                 dest='password', metavar='PASSWORD', help='account password')
2990                 authentication.add_option('-n', '--netrc',
2991                                 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2992                 parser.add_option_group(authentication)
2993
2994                 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2995                 video_format.add_option('-f', '--format',
2996                                 action='store', dest='format', metavar='FORMAT', help='video format code')
2997                 video_format.add_option('--all-formats',
2998                                 action='store_const', dest='format', help='download all available video formats', const='-1')
2999                 video_format.add_option('--max-quality',
3000                                 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
3001                 parser.add_option_group(video_format)
3002
3003                 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
3004                 verbosity.add_option('-q', '--quiet',
3005                                 action='store_true', dest='quiet', help='activates quiet mode', default=False)
3006                 verbosity.add_option('-s', '--simulate',
3007                                 action='store_true', dest='simulate', help='do not download video', default=False)
3008                 verbosity.add_option('-g', '--get-url',
3009                                 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
3010                 verbosity.add_option('-e', '--get-title',
3011                                 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
3012                 verbosity.add_option('--get-thumbnail',
3013                                 action='store_true', dest='getthumbnail',
3014                                 help='simulate, quiet but print thumbnail URL', default=False)
3015                 verbosity.add_option('--get-description',
3016                                 action='store_true', dest='getdescription',
3017                                 help='simulate, quiet but print video description', default=False)
3018                 verbosity.add_option('--get-filename',
3019                                 action='store_true', dest='getfilename',
3020                                 help='simulate, quiet but print output filename', default=False)
3021                 verbosity.add_option('--no-progress',
3022                                 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
3023                 verbosity.add_option('--console-title',
3024                                 action='store_true', dest='consoletitle',
3025                                 help='display progress in console titlebar', default=False)
3026                 parser.add_option_group(verbosity)
3027
3028                 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
3029                 filesystem.add_option('-t', '--title',
3030                                 action='store_true', dest='usetitle', help='use title in file name', default=False)
3031                 filesystem.add_option('-l', '--literal',
3032                                 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
3033                 filesystem.add_option('-A', '--auto-number',
3034                                 action='store_true', dest='autonumber',
3035                                 help='number downloaded files starting from 00000', default=False)
3036                 filesystem.add_option('-o', '--output',
3037                                 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
3038                 filesystem.add_option('-a', '--batch-file',
3039                                 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
3040                 filesystem.add_option('-w', '--no-overwrites',
3041                                 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
3042                 filesystem.add_option('-c', '--continue',
3043                                 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
3044                 filesystem.add_option('--cookies',
3045                                 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
3046                 filesystem.add_option('--no-part',
3047                                 action='store_true', dest='nopart', help='do not use .part files', default=False)
3048                 filesystem.add_option('--no-mtime',
3049                                 action='store_false', dest='updatetime',
3050                                 help='do not use the Last-modified header to set the file modification time', default=True)
3051                 filesystem.add_option('--write-description',
3052                                 action='store_true', dest='writedescription',
3053                                 help='write video description to a .description file', default=False)
3054                 filesystem.add_option('--write-info-json',
3055                                 action='store_true', dest='writeinfojson',
3056                                 help='write video metadata to a .info.json file', default=False)
3057                 parser.add_option_group(filesystem)
3058
3059                 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
3060                 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
3061                                 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
3062                 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
3063                                 help='"best", "aac" or "mp3"; best by default')
3064                 parser.add_option_group(postproc)
3065
3066                 (opts, args) = parser.parse_args()
3067
3068                 # Open appropriate CookieJar
3069                 if opts.cookiefile is None:
3070                         jar = cookielib.CookieJar()
3071                 else:
3072                         try:
3073                                 jar = cookielib.MozillaCookieJar(opts.cookiefile)
3074                                 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3075                                         jar.load()
3076                         except (IOError, OSError), err:
3077                                 sys.exit(u'ERROR: unable to open cookie file')
3078
3079                 # Dump user agent
3080                 if opts.dump_user_agent:
3081                         print std_headers['User-Agent']
3082                         sys.exit(0)
3083
3084                 # General configuration
3085                 cookie_processor = urllib2.HTTPCookieProcessor(jar)
3086                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
3087                 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3088
3089                 # Batch file verification
3090                 batchurls = []
3091                 if opts.batchfile is not None:
3092                         try:
3093                                 if opts.batchfile == '-':
3094                                         batchfd = sys.stdin
3095                                 else:
3096                                         batchfd = open(opts.batchfile, 'r')
3097                                 batchurls = batchfd.readlines()
3098                                 batchurls = [x.strip() for x in batchurls]
3099                                 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3100                         except IOError:
3101                                 sys.exit(u'ERROR: batch file could not be read')
3102                 all_urls = batchurls + args
3103
3104                 # Conflicting, missing and erroneous options
3105                 if opts.usenetrc and (opts.username is not None or opts.password is not None):
3106                         parser.error(u'using .netrc conflicts with giving username/password')
3107                 if opts.password is not None and opts.username is None:
3108                         parser.error(u'account username missing')
3109                 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3110                         parser.error(u'using output template conflicts with using title, literal title or auto number')
3111                 if opts.usetitle and opts.useliteral:
3112                         parser.error(u'using title conflicts with using literal title')
3113                 if opts.username is not None and opts.password is None:
3114                         opts.password = getpass.getpass(u'Type account password and press return:')
3115                 if opts.ratelimit is not None:
3116                         numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3117                         if numeric_limit is None:
3118                                 parser.error(u'invalid rate limit specified')
3119                         opts.ratelimit = numeric_limit
3120                 if opts.retries is not None:
3121                         try:
3122                                 opts.retries = long(opts.retries)
3123                         except (TypeError, ValueError), err:
3124                                 parser.error(u'invalid retry count specified')
3125                 try:
3126                         opts.playliststart = long(opts.playliststart)
3127                         if opts.playliststart <= 0:
3128                                 raise ValueError
3129                 except (TypeError, ValueError), err:
3130                         parser.error(u'invalid playlist start number specified')
3131                 try:
3132                         opts.playlistend = long(opts.playlistend)
3133                         if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3134                                 raise ValueError
3135                 except (TypeError, ValueError), err:
3136                         parser.error(u'invalid playlist end number specified')
3137                 if opts.extractaudio:
3138                         if opts.audioformat not in ['best', 'aac', 'mp3']:
3139                                 parser.error(u'invalid audio format specified')
3140
3141                 # Information extractors
3142                 youtube_ie = YoutubeIE()
3143                 metacafe_ie = MetacafeIE(youtube_ie)
3144                 dailymotion_ie = DailymotionIE()
3145                 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
3146                 youtube_user_ie = YoutubeUserIE(youtube_ie)
3147                 youtube_search_ie = YoutubeSearchIE(youtube_ie)
3148                 google_ie = GoogleIE()
3149                 google_search_ie = GoogleSearchIE(google_ie)
3150                 photobucket_ie = PhotobucketIE()
3151                 yahoo_ie = YahooIE()
3152                 yahoo_search_ie = YahooSearchIE(yahoo_ie)
3153                 deposit_files_ie = DepositFilesIE()
3154                 facebook_ie = FacebookIE()
3155                 bliptv_ie = BlipTVIE()
3156                 generic_ie = GenericIE()
3157
3158                 # File downloader
3159                 fd = FileDownloader({
3160                         'usenetrc': opts.usenetrc,
3161                         'username': opts.username,
3162                         'password': opts.password,
3163                         'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3164                         'forceurl': opts.geturl,
3165                         'forcetitle': opts.gettitle,
3166                         'forcethumbnail': opts.getthumbnail,
3167                         'forcedescription': opts.getdescription,
3168                         'forcefilename': opts.getfilename,
3169                         'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3170                         'format': opts.format,
3171                         'format_limit': opts.format_limit,
3172                         'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3173                                 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3174                                 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3175                                 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3176                                 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3177                                 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3178                                 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3179                                 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3180                                 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3181                                 or u'%(id)s.%(ext)s'),
3182                         'ignoreerrors': opts.ignoreerrors,
3183                         'ratelimit': opts.ratelimit,
3184                         'nooverwrites': opts.nooverwrites,
3185                         'retries': opts.retries,
3186                         'continuedl': opts.continue_dl,
3187                         'noprogress': opts.noprogress,
3188                         'playliststart': opts.playliststart,
3189                         'playlistend': opts.playlistend,
3190                         'logtostderr': opts.outtmpl == '-',
3191                         'consoletitle': opts.consoletitle,
3192                         'nopart': opts.nopart,
3193                         'updatetime': opts.updatetime,
3194                         'writedescription': opts.writedescription,
3195                         'writeinfojson': opts.writeinfojson,
3196                         })
3197                 fd.add_info_extractor(youtube_search_ie)
3198                 fd.add_info_extractor(youtube_pl_ie)
3199                 fd.add_info_extractor(youtube_user_ie)
3200                 fd.add_info_extractor(metacafe_ie)
3201                 fd.add_info_extractor(dailymotion_ie)
3202                 fd.add_info_extractor(youtube_ie)
3203                 fd.add_info_extractor(google_ie)
3204                 fd.add_info_extractor(google_search_ie)
3205                 fd.add_info_extractor(photobucket_ie)
3206                 fd.add_info_extractor(yahoo_ie)
3207                 fd.add_info_extractor(yahoo_search_ie)
3208                 fd.add_info_extractor(deposit_files_ie)
3209                 fd.add_info_extractor(facebook_ie)
3210                 fd.add_info_extractor(bliptv_ie)
3211
3212                 # This must come last since it's the
3213                 # fallback if none of the others work
3214                 fd.add_info_extractor(generic_ie)
3215
3216                 # PostProcessors
3217                 if opts.extractaudio:
3218                         fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
3219
3220                 # Update version
3221                 if opts.update_self:
3222                         update_self(fd, sys.argv[0])
3223
3224                 # Maybe do nothing
3225                 if len(all_urls) < 1:
3226                         if not opts.update_self:
3227                                 parser.error(u'you must provide at least one URL')
3228                         else:
3229                                 sys.exit()
3230                 retcode = fd.download(all_urls)
3231
3232                 # Dump cookie jar if requested
3233                 if opts.cookiefile is not None:
3234                         try:
3235                                 jar.save()
3236                         except (IOError, OSError), err:
3237                                 sys.exit(u'ERROR: unable to save cookie jar')
3238
3239                 sys.exit(retcode)
3240
3241         except DownloadError:
3242                 sys.exit(1)
3243         except SameFileError:
3244                 sys.exit(u'ERROR: fixed output name but more than one file to download')
3245         except KeyboardInterrupt:
3246                 sys.exit(u'\nERROR: Interrupted by user')