3d43355c7538c67e29c4ee72344f14a96e2e5458
[youtube-dl.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
# Contributors, roughly in order of first contribution.
__author__  = (
	'Ricardo Garcia Gonzalez',
	'Danny Colligan',
	'Benjamin Johnson',
	'Vasyl\' Vavrychuk',
	'Witold Baryluk',
	'Paweł Paprota',  # was mojibake ('PaweÅ‚') from a UTF-8 double-encoding
	'Gergely Imreh',
	)

__license__ = 'Public Domain'
__version__ = '2011.08.24-phihag'
16
17 import cookielib
18 import datetime
19 import gzip
20 import htmlentitydefs
21 import httplib
22 import locale
23 import math
24 import netrc
25 import os
26 import os.path
27 import re
28 import socket
29 import string
30 import subprocess
31 import sys
32 import time
33 import urllib
34 import urllib2
35 import warnings
36 import zlib
37
38 if os.name == 'nt':
39         import ctypes
40
41 try:
42         import email.utils
43 except ImportError: # Python 2.4
44         import email.Utils
45 try:
46         import cStringIO as StringIO
47 except ImportError:
48         import StringIO
49
50 # parse_qs was moved from the cgi module to the urlparse module recently.
51 try:
52         from urlparse import parse_qs
53 except ImportError:
54         from cgi import parse_qs
55
56 try:
57         import lxml.etree
58 except ImportError:
59         pass # Handled below
60
# Headers sent with every HTTP request, mimicking a desktop Firefox so sites
# serve the same pages they would serve to a regular browser.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	'Accept-Encoding': 'gzip, deflate',
	'Accept-Language': 'en-us,en;q=0.5',
}

# Characters allowed in "simple" titles: ASCII letters and digits, as unicode
# (.decode('ascii') turns the Python 2 byte strings into unicode strings).
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
70
# The stdlib 'json' module only exists from Python 2.6 on; older interpreters
# fall back to trivialjson, a tiny pure-Python parser (loads() only).
try:
	import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
	import re
	class json(object):
		"""Minimal drop-in replacement for the stdlib json module (loads only)."""
		@staticmethod
		def loads(s):
			# Decode to unicode up front so every index below is a
			# character offset, not a byte offset.
			s = s.decode('UTF-8')
			def raiseError(msg, i):
				raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
			def skipSpace(i, expectMore=True):
				# Advance i past whitespace; optionally fail on premature end.
				while i < len(s) and s[i] in ' \t\r\n':
					i += 1
				if expectMore:
					if i >= len(s):
						raiseError('Premature end', i)
				return i
			def decodeEscape(match):
				# Map the text after a backslash to the character it denotes.
				esc = match.group(1)
				_STATIC = {
					'"': '"',
					'\\': '\\',
					'/': '/',
					'b': unichr(0x8),
					'f': unichr(0xc),
					'n': '\n',
					'r': '\r',
					't': '\t',
				}
				if esc in _STATIC:
					return _STATIC[esc]
				if esc[0] == 'u':
					if len(esc) == 1+4:
						# Plain \uXXXX escape.
						return unichr(int(esc[1:5], 16))
					if len(esc) == 5+6 and esc[5:7] == '\\u':
						# UTF-16 surrogate pair -> single code point.
						hi = int(esc[1:5], 16)
						low = int(esc[7:11], 16)
						return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
				raise ValueError('Unknown escape ' + str(esc))
			def parseString(i):
				i += 1
				e = i
				# Find the closing quote, skipping quotes preceded by an
				# odd number of backslashes (i.e. escaped quotes).
				while True:
					e = s.index('"', e)
					bslashes = 0
					while s[e-bslashes-1] == '\\':
						bslashes += 1
					if bslashes % 2 == 1:
						e += 1
						continue
					break
				# Alternatives: surrogate pair, plain \uXXXX, single char.
				rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
				stri = rexp.sub(decodeEscape, s[i:e])
				return (e+1,stri)
			def parseObj(i):
				i += 1
				res = {}
				i = skipSpace(i)
				if s[i] == '}': # Empty dictionary
					return (i+1,res)
				while True:
					if s[i] != '"':
						raiseError('Expected a string object key', i)
					i,key = parseString(i)
					i = skipSpace(i)
					if i >= len(s) or s[i] != ':':
						raiseError('Expected a colon', i)
					i,val = parse(i+1)
					res[key] = val
					i = skipSpace(i)
					if s[i] == '}':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected comma or closing curly brace', i)
					i = skipSpace(i+1)
			def parseArray(i):
				res = []
				i = skipSpace(i+1)
				if s[i] == ']': # Empty array
					return (i+1,res)
				while True:
					i,val = parse(i)
					res.append(val)
					i = skipSpace(i) # Raise exception if premature end
					if s[i] == ']':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected a comma or closing bracket', i)
					i = skipSpace(i+1)
			def parseDiscrete(i):
				# The three bare literals: true / false / null.
				for k,v in {'true': True, 'false': False, 'null': None}.items():
					if s.startswith(k, i):
						return (i+len(k), v)
				raiseError('Not a boolean (or null)', i)
			def parseNumber(i):
				mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
				if mobj is None:
					raiseError('Not a number', i)
				nums = mobj.group(1)
				# A fraction or exponent makes the value a float.
				if '.' in nums or 'e' in nums or 'E' in nums:
					return (i+len(nums), float(nums))
				return (i+len(nums), int(nums))
			CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
			def parse(i):
				# Dispatch on the first character; numbers are the fallback
				# since they have no single fixed leading character.
				i = skipSpace(i)
				i,res = CHARMAP.get(s[i], parseNumber)(i)
				i = skipSpace(i, False)
				return (i,res)
			i,res = parse(0)
			if i < len(s):
				raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
			return res
183
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	# The original wrapped this in a one-shot generator plus .next(), which
	# only obscured a simple try/except; the result is the same constant.
	try:
		pref = locale.getpreferredencoding()
		# Verify the codec actually exists and can encode text.
		u'TEST'.encode(pref)
	except Exception:
		# Unknown or broken locale: fall back to UTF-8.
		pref = 'UTF-8'
	return pref
199
def htmlentity_transform(matchobj):
	"""Transforms an HTML entity to a Unicode character.

	This function receives a match object and is intended to be used with
	the re.sub() function.
	"""
	entity = matchobj.group(1)

	# Known non-numeric HTML entity
	if entity in htmlentitydefs.name2codepoint:
		return unichr(htmlentitydefs.name2codepoint[entity])

	# Numeric character reference. Hexadecimal references (e.g. &#xE9;) may
	# contain the letters a-f, which the old pattern '#(x?\d+)' failed to
	# match; decimal references remain digits-only.
	mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
	if mobj is not None:
		numstr = mobj.group(1)
		if numstr.startswith(u'x'):
			base = 16
			numstr = u'0%s' % numstr  # '0x...' form for long(s, 16)
		else:
			base = 10
		return unichr(long(numstr, base))

	# Unknown entity in name, return its literal representation
	return (u'&%s;' % entity)
225
def sanitize_title(utitle):
	"""Sanitizes a video title so it could be used as part of a filename."""
	# Decode HTML entities first, then neutralize path separators.
	utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
	return utitle.replace(unicode(os.sep), u'%')
230
def sanitize_open(filename, open_mode):
	"""Try to open the given filename, and slightly tweak it if this fails.

	Attempts to open the given filename. If this fails, it tries to change
	the filename slightly, step by step, until it's either able to open it
	or it fails and raises a final exception, like the standard open()
	function.

	It returns the tuple (stream, definitive_file_name).
	"""
	try:
		if filename == u'-':
			if sys.platform == 'win32':
				import msvcrt
				# Binary mode keeps newline bytes from being mangled
				# when streaming video data to stdout on Windows.
				msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
			return (sys.stdout, filename)
		stream = open(filename, open_mode)
		return (stream, filename)
	except (IOError, OSError), err:
		# In case of error, try to remove win32 forbidden chars
		filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

		# An exception here should be caught in the caller
		stream = open(filename, open_mode)
		return (stream, filename)
256
def timeconvert(timestr):
    """Convert an RFC 2822 date string into a Unix timestamp (None if unparsable)."""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
264
class DownloadError(Exception):
	"""Download Error exception.

	Raised by FileDownloader objects that are not configured to continue
	on errors; carries the appropriate error message.
	"""
273
class SameFileError(Exception):
	"""Same File exception.

	Raised by FileDownloader objects when they detect that several of the
	requested downloads would end up in the very same file on disk.
	"""
281
class PostProcessingError(Exception):
	"""Post Processing exception.

	Raised from a PostProcessor's .run() method to signal that the
	postprocessing task failed.
	"""
289
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	Raised when a video was requested in a format that is not offered for
	that particular video.
	"""
297
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	Raised when the amount of downloaded data is smaller than the size the
	server announced, which usually means the connection was interrupted.
	"""
	# Byte counts describing the mismatch.
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		self.downloaded = downloaded
		self.expected = expected
312
class YoutubeDLHandler(urllib2.HTTPHandler):
	"""Handler for HTTP requests and responses.

	This class, when installed with an OpenerDirector, automatically adds
	the standard headers to every HTTP request and handles gzipped and
	deflated responses from web servers. If compression is to be avoided in
	a particular request, the original request in the program code only has
	to include the HTTP header "Youtubedl-No-Compression", which will be
	removed before making the real request.

	Part of this code was copied from:

	  http://techknack.net/python-urllib2-handlers/

	Andrew Rowls, the author of that code, agreed to release it to the
	public domain.
	"""

	@staticmethod
	def deflate(data):
		# Try raw deflate first (no zlib header), then standard zlib --
		# servers disagree on what "deflate" means on the wire.
		try:
			return zlib.decompress(data, -zlib.MAX_WBITS)
		except zlib.error:
			return zlib.decompress(data)

	@staticmethod
	def addinfourl_wrapper(stream, headers, url, code):
		# Newer Pythons accept 'code' as a constructor argument (detected
		# via the getcode attribute); older ones need it set by hand.
		if hasattr(urllib2.addinfourl, 'getcode'):
			return urllib2.addinfourl(stream, headers, url, code)
		ret = urllib2.addinfourl(stream, headers, url)
		ret.code = code
		return ret

	def http_request(self, req):
		# Install the standard headers, replacing any existing values.
		for h in std_headers:
			if h in req.headers:
				del req.headers[h]
			req.add_header(h, std_headers[h])
		# NOTE(review): the odd casing below presumably matches how urllib2
		# stores header names internally -- confirm before changing it.
		if 'Youtubedl-no-compression' in req.headers:
			if 'Accept-encoding' in req.headers:
				del req.headers['Accept-encoding']
			del req.headers['Youtubedl-no-compression']
		return req

	def http_response(self, req, resp):
		# Transparently decompress the body, preserving url/code/msg of
		# the original response object.
		old_resp = resp
		# gzip
		if resp.headers.get('Content-encoding', '') == 'gzip':
			gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		# deflate
		if resp.headers.get('Content-encoding', '') == 'deflate':
			gz = StringIO.StringIO(self.deflate(resp.read()))
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		return resp
370
class FileDownloader(object):
	"""File Downloader class.

	File downloader objects are the ones responsible of downloading the
	actual video file and writing it to disk if the user has requested
	it, among some other tasks. In most cases there should be one per
	program. As, given a video URL, the downloader doesn't know how to
	extract all the needed information, task that InfoExtractors do, it
	has to pass the URL to one of them.

	For this, file downloader objects have a method that allows
	InfoExtractors to be registered in a given order. When it is passed
	a URL, the file downloader handles it to the first InfoExtractor it
	finds that reports being able to handle it. The InfoExtractor extracts
	all the information about the video or videos the URL refers to, and
	asks the FileDownloader to process the video information, possibly
	downloading the video.

	File downloaders accept a lot of parameters. In order not to saturate
	the object constructor with arguments, it receives a dictionary of
	options instead. These options are available through the params
	attribute for the InfoExtractors to use. The FileDownloader also
	registers itself as the downloader in charge for the InfoExtractors
	that are added to it, so this is a "mutual registration".

	Available options:

	username:         Username for authentication purposes.
	password:         Password for authentication purposes.
	usenetrc:         Use netrc for authentication instead.
	quiet:            Do not print messages to stdout.
	forceurl:         Force printing final URL.
	forcetitle:       Force printing title.
	forcethumbnail:   Force printing thumbnail URL.
	forcedescription: Force printing description.
	forcefilename:    Force printing final filename.
	simulate:         Do not download the video files.
	format:           Video format code.
	format_limit:     Highest quality format to try.
	outtmpl:          Template for output names.
	ignoreerrors:     Do not stop on download errors.
	ratelimit:        Download speed limit, in bytes/sec.
	nooverwrites:     Prevent overwriting files.
	retries:          Number of times to retry for HTTP error 5xx
	continuedl:       Try to continue downloads if possible.
	noprogress:       Do not print the progress bar.
	playliststart:    Playlist item to start at.
	playlistend:      Playlist item to end at.
	logtostderr:      Log messages to stderr instead of stdout.
	consoletitle:     Display progress in console window's titlebar.
	nopart:           Do not use temporary .part files.
	updatetime:       Use the Last-modified header to set output file timestamps.
	writedescription: Write the video description to a .description file
	writeinfojson:    Write the video description to a .info.json file
	"""

	# Class-level defaults; all of these are replaced with per-instance
	# values in __init__.
	params = None             # option dictionary (see docstring above)
	_ies = []                 # registered InfoExtractors, in order
	_pps = []                 # registered PostProcessors, in chain order
	_download_retcode = None  # process return code (1 after an ignored error)
	_num_downloads = None     # ordinal of the current download ('autonumber')
	_screen_file = None       # stream for status messages (stdout or stderr)
433
434         def __init__(self, params):
435                 """Create a FileDownloader object with the given options."""
436                 self._ies = []
437                 self._pps = []
438                 self._download_retcode = 0
439                 self._num_downloads = 0
440                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
441                 self.params = params
442
443         @staticmethod
444         def pmkdir(filename):
445                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
446                 components = filename.split(os.sep)
447                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
448                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
449                 for dir in aggregate:
450                         if not os.path.exists(dir):
451                                 os.mkdir(dir)
452
453         @staticmethod
454         def format_bytes(bytes):
455                 if bytes is None:
456                         return 'N/A'
457                 if type(bytes) is str:
458                         bytes = float(bytes)
459                 if bytes == 0.0:
460                         exponent = 0
461                 else:
462                         exponent = long(math.log(bytes, 1024.0))
463                 suffix = 'bkMGTPEZY'[exponent]
464                 converted = float(bytes) / float(1024**exponent)
465                 return '%.2f%s' % (converted, suffix)
466
467         @staticmethod
468         def calc_percent(byte_counter, data_len):
469                 if data_len is None:
470                         return '---.-%'
471                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
472
473         @staticmethod
474         def calc_eta(start, now, total, current):
475                 if total is None:
476                         return '--:--'
477                 dif = now - start
478                 if current == 0 or dif < 0.001: # One millisecond
479                         return '--:--'
480                 rate = float(current) / dif
481                 eta = long((float(total) - float(current)) / rate)
482                 (eta_mins, eta_secs) = divmod(eta, 60)
483                 if eta_mins > 99:
484                         return '--:--'
485                 return '%02d:%02d' % (eta_mins, eta_secs)
486
487         @staticmethod
488         def calc_speed(start, now, bytes):
489                 dif = now - start
490                 if bytes == 0 or dif < 0.001: # One millisecond
491                         return '%10s' % '---b/s'
492                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
493
494         @staticmethod
495         def best_block_size(elapsed_time, bytes):
496                 new_min = max(bytes / 2.0, 1.0)
497                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
498                 if elapsed_time < 0.001:
499                         return long(new_max)
500                 rate = bytes / elapsed_time
501                 if rate > new_max:
502                         return long(new_max)
503                 if rate < new_min:
504                         return long(new_min)
505                 return long(rate)
506
507         @staticmethod
508         def parse_bytes(bytestr):
509                 """Parse a string indicating a byte quantity into a long integer."""
510                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
511                 if matchobj is None:
512                         return None
513                 number = float(matchobj.group(1))
514                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
515                 return long(round(number * multiplier))
516
517         def add_info_extractor(self, ie):
518                 """Add an InfoExtractor object to the end of the list."""
519                 self._ies.append(ie)
520                 ie.set_downloader(self)
521
522         def add_post_processor(self, pp):
523                 """Add a PostProcessor object to the end of the chain."""
524                 self._pps.append(pp)
525                 pp.set_downloader(self)
526
	def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode.

		skip_eol -- suppress the trailing newline (used by progress lines)
		ignore_encoding_errors -- swallow UnicodeEncodeError instead of raising
		"""
		try:
			if not self.params.get('quiet', False):
				terminator = [u'\n', u''][skip_eol]
				# The trailing comma suppresses print's own newline; the
				# terminator chosen above controls line endings instead.
				print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
			self._screen_file.flush()
		except (UnicodeEncodeError), err:
			if not ignore_encoding_errors:
				raise
537
	def to_stderr(self, message):
		"""Print message to stderr (not subject to the 'quiet' option)."""
		print >>sys.stderr, message.encode(preferredencoding())
541
542         def to_cons_title(self, message):
543                 """Set console/terminal window title to message."""
544                 if not self.params.get('consoletitle', False):
545                         return
546                 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
547                         # c_wchar_p() might not be necessary if `message` is
548                         # already of type unicode()
549                         ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
550                 elif 'TERM' in os.environ:
551                         sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
552
	def fixed_template(self):
		"""Return True if the output template contains no %(...)s fields."""
		return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
556
557         def trouble(self, message=None):
558                 """Determine action to take when a download problem appears.
559
560                 Depending on if the downloader has been configured to ignore
561                 download errors or not, this method may throw an exception or
562                 not when errors are found, after printing the message.
563                 """
564                 if message is not None:
565                         self.to_stderr(message)
566                 if not self.params.get('ignoreerrors', False):
567                         raise DownloadError(message)
568                 self._download_retcode = 1
569
570         def slow_down(self, start_time, byte_counter):
571                 """Sleep if the download speed is over the rate limit."""
572                 rate_limit = self.params.get('ratelimit', None)
573                 if rate_limit is None or byte_counter == 0:
574                         return
575                 now = time.time()
576                 elapsed = now - start_time
577                 if elapsed <= 0.0:
578                         return
579                 speed = float(byte_counter) / elapsed
580                 if speed > rate_limit:
581                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
582
583         def temp_name(self, filename):
584                 """Returns a temporary filename for the given filename."""
585                 if self.params.get('nopart', False) or filename == u'-' or \
586                                 (os.path.exists(filename) and not os.path.isfile(filename)):
587                         return filename
588                 return filename + u'.part'
589
590         def undo_temp_name(self, filename):
591                 if filename.endswith(u'.part'):
592                         return filename[:-len(u'.part')]
593                 return filename
594
	def try_rename(self, old_filename, new_filename):
		"""Best-effort rename of the temporary file to its final name."""
		try:
			if old_filename == new_filename:
				return
			os.rename(old_filename, new_filename)
		except (IOError, OSError), err:
			# Report through trouble() so 'ignoreerrors' is honored.
			self.trouble(u'ERROR: unable to rename file')
602         
603         def try_utime(self, filename, last_modified_hdr):
604                 """Try to set the last-modified time of the given file."""
605                 if last_modified_hdr is None:
606                         return
607                 if not os.path.isfile(filename):
608                         return
609                 timestr = last_modified_hdr
610                 if timestr is None:
611                         return
612                 filetime = timeconvert(timestr)
613                 if filetime is None:
614                         return
615                 try:
616                         os.utime(filename,(time.time(), filetime))
617                 except:
618                         pass
619
620         def report_writedescription(self, descfn):
621                 """ Report that the description file is being written """
622                 self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
623
624         def report_writeinfojson(self, infofn):
625                 """ Report that the metadata file has been written """
626                 self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
627
628         def report_destination(self, filename):
629                 """Report destination filename."""
630                 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
631
632         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
633                 """Report download progress."""
634                 if self.params.get('noprogress', False):
635                         return
636                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
637                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
638                 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
639                                 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
640
641         def report_resuming_byte(self, resume_len):
642                 """Report attempt to resume at given byte."""
643                 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
644
645         def report_retry(self, count, retries):
646                 """Report retry in case of HTTP error 5xx"""
647                 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
648
	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		try:
			self.to_screen(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			# Fall back to a message without the (unencodable) filename.
			self.to_screen(u'[download] The file has already been downloaded')
655
656         def report_unable_to_resume(self):
657                 """Report it was impossible to resume download."""
658                 self.to_screen(u'[download] Unable to resume')
659
660         def report_finish(self):
661                 """Report download finished."""
662                 if self.params.get('noprogress', False):
663                         self.to_screen(u'[download] Download completed')
664                 else:
665                         self.to_screen(u'')
666
667         def increment_downloads(self):
668                 """Increment the ordinal that assigns a number to each file."""
669                 self._num_downloads += 1
670
671         def prepare_filename(self, info_dict):
672                 """Generate the output filename."""
673                 try:
674                         template_dict = dict(info_dict)
675                         template_dict['epoch'] = unicode(long(time.time()))
676                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
677                         filename = self.params['outtmpl'] % template_dict
678                         return filename
679                 except (ValueError, KeyError), err:
680                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
681                         return None
682
	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor.

		Depending on self.params this prints forced fields (simulate
		mode), honours --no-overwrites, writes optional .description
		and .info.json sidecar files, downloads the video data and
		finally runs the postprocessing chain. Errors are reported via
		self.trouble() and abort the remaining steps.
		"""
		filename = self.prepare_filename(info_dict)
		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			# Forced printings
			if self.params.get('forcetitle', False):
				print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forceurl', False):
				print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
				print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcedescription', False) and 'description' in info_dict:
				print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcefilename', False) and filename is not None:
				print filename.encode(preferredencoding(), 'xmlcharrefreplace')

			return

		if filename is None:
			return
		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists and will be skipped')
			return

		# Create any missing directories in the output path.
		try:
			self.pmkdir(filename)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to create directories: %s' % str(err))
			return

		if self.params.get('writedescription', False):
			try:
				descfn = filename + '.description'
				self.report_writedescription(descfn)
				descfile = open(descfn, 'wb')
				try:
					descfile.write(info_dict['description'].encode('utf-8'))
				finally:
					descfile.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write description file: %s' % str(descfn))
				return

		if self.params.get('writeinfojson', False):
			infofn = filename + '.info.json'
			self.report_writeinfojson(infofn)
			# Probe for a usable encoder: 'json' may be the trivialjson
			# fallback (see top of file), which has no dump() function.
			try:
				json.dump
			except (NameError,AttributeError):
				self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
				return
			try:
				infof = open(infofn, 'wb')
				try:
					json.dump(info_dict, infof)
				finally:
					infof.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write metadata to JSON file: %s' % str(infofn))
				return

		try:
			success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
		except (OSError, IOError), err:
			raise UnavailableVideoError
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self.trouble(u'ERROR: unable to download video data: %s' % str(err))
			return
		except (ContentTooShortError, ), err:
			self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
			return

		# Only run postprocessors after a successful download.
		if success:
			try:
				self.post_process(filename, info_dict)
			except (PostProcessingError), err:
				self.trouble(u'ERROR: postprocessing: %s' % str(err))
				return
762
763         def download(self, url_list):
764                 """Download a given list of URLs."""
765                 if len(url_list) > 1 and self.fixed_template():
766                         raise SameFileError(self.params['outtmpl'])
767
768                 for url in url_list:
769                         suitable_found = False
770                         for ie in self._ies:
771                                 # Go to next InfoExtractor if not suitable
772                                 if not ie.suitable(url):
773                                         continue
774
775                                 # Suitable InfoExtractor found
776                                 suitable_found = True
777
778                                 # Extract information from URL and process it
779                                 ie.extract(url)
780
781                                 # Suitable InfoExtractor had been found; go to next URL
782                                 break
783
784                         if not suitable_found:
785                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
786
787                 return self._download_retcode
788
789         def post_process(self, filename, ie_info):
790                 """Run the postprocessing chain on the given file."""
791                 info = dict(ie_info)
792                 info['filepath'] = filename
793                 for pp in self._pps:
794                         info = pp.run(info)
795                         if info is None:
796                                 break
797
	def _download_with_rtmpdump(self, filename, url, player_url):
		"""Download an RTMP stream by shelling out to rtmpdump.

		Writes to a temporary file which is renamed into place when
		rtmpdump reports success (exit code 0). Returns True on
		success, False otherwise.
		"""
		self.report_destination(filename)
		tmpfilename = self.temp_name(filename)

		# Check for rtmpdump first
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrupted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		# The [[], x][bool] idiom appends x only when the condition holds.
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(tmpfilename)
			self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			# Resume (-e); after a code-1 failure also add '-k 1'.
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(tmpfilename)
			if prevsize == cursize and retval == 1:
				# No progress since the last attempt; give up.
				break
		if retval == 0:
			self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
			self.try_rename(tmpfilename, filename)
			return True
		else:
			self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
			return False
829
	def _do_download(self, filename, url, player_url):
		"""Download url to filename over HTTP (or delegate to rtmpdump).

		Handles resuming a partial .part file, retrying on HTTP 5xx up
		to the 'retries' parameter, adaptive block sizing, progress
		reporting and rate limiting. Returns True on success, False on
		a reported error; raises ContentTooShortError when fewer bytes
		than Content-Length were received.
		"""
		# Check file already present
		if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
			self.report_file_already_downloaded(filename)
			return True

		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url, player_url)

		tmpfilename = self.temp_name(filename)
		stream = None
		open_mode = 'wb'

		# Do not include the Accept-Encoding header
		headers = {'Youtubedl-no-compression': 'True'}
		# basic_request stays without a Range header, for the 416 probe below.
		basic_request = urllib2.Request(url, None, headers)
		request = urllib2.Request(url, None, headers)

		# Establish possible resume length
		if os.path.isfile(tmpfilename):
			resume_len = os.path.getsize(tmpfilename)
		else:
			resume_len = 0

		# Request parameters in case of being able to resume
		if self.params.get('continuedl', False) and resume_len != 0:
			self.report_resuming_byte(resume_len)
			request.add_header('Range','bytes=%d-' % resume_len)
			open_mode = 'ab'

		count = 0
		retries = self.params.get('retries', 0)
		while count <= retries:
			# Establish connection
			try:
				data = urllib2.urlopen(request)
				break
			except (urllib2.HTTPError, ), err:
				if (err.code < 500 or err.code >= 600) and err.code != 416:
					# Unexpected HTTP error
					raise
				elif err.code == 416:
					# Unable to resume (requested range not satisfiable)
					try:
						# Open the connection again without the range header
						data = urllib2.urlopen(basic_request)
						content_length = data.info()['Content-Length']
					except (urllib2.HTTPError, ), err:
						if err.code < 500 or err.code >= 600:
							raise
					else:
						# Examine the reported length
						if (content_length is not None and
							(resume_len - 100 < long(content_length) < resume_len + 100)):
							# The file had already been fully downloaded.
							# Explanation to the above condition: in issue #175 it was revealed that
							# YouTube sometimes adds or removes a few bytes from the end of the file,
							# changing the file size slightly and causing problems for some users. So
							# I decided to implement a suggested change and consider the file
							# completely downloaded if the file size differs less than 100 bytes from
							# the one in the hard drive.
							self.report_file_already_downloaded(filename)
							self.try_rename(tmpfilename, filename)
							return True
						else:
							# The length does not match, we start the download over
							self.report_unable_to_resume()
							open_mode = 'wb'
							break
			# Retry
			count += 1
			if count <= retries:
				self.report_retry(count, retries)

		if count > retries:
			self.trouble(u'ERROR: giving up after %s retries' % retries)
			return False

		data_len = data.info().get('Content-length', None)
		if data_len is not None:
			# Total expected size includes the bytes already on disk.
			data_len = long(data_len) + resume_len
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0 + resume_len
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			if len(data_block) == 0:
				break
			byte_counter += len(data_block)

			# Open file just in time
			if stream is None:
				try:
					(stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
					filename = self.undo_temp_name(tmpfilename)
					self.report_destination(filename)
				except (OSError, IOError), err:
					self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
					return False
			try:
				stream.write(data_block)
			except (IOError, OSError), err:
				self.trouble(u'\nERROR: unable to write data: %s' % str(err))
				return False
			# Adapt the block size to the measured throughput.
			block_size = self.best_block_size(after - before, len(data_block))

			# Progress message
			percent_str = self.calc_percent(byte_counter, data_len)
			eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
			speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
			self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter - resume_len)

		# NOTE(review): if the server sent zero bytes, stream is still
		# None here and close() would raise AttributeError — confirm.
		stream.close()
		self.report_finish()
		if data_len is not None and byte_counter != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		self.try_rename(tmpfilename, filename)

		# Update file modification time
		if self.params.get('updatetime', True):
			self.try_utime(filename, data.info().get('last-modified', None))

		return True
961
class InfoExtractor(object):
	"""Base class for information extractors.

	An information extractor takes a URL and produces, for each video
	the URL refers to, a dictionary of extracted information which is
	handed to the FileDownloader (which may then download the video,
	among other outcomes). Each dictionary must contain:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	Optional fields, used only by the forced-printing functions (e.g.
	when youtube-dl backs a video search such as youtube2mp3):

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Subclasses should override _real_initialize(), _real_extract() and
	the suitable() static method, and typically get instantiated and
	registered with the main downloader.
	"""

	# Whether one-time initialization has already run.
	_ready = False
	# The FileDownloader this extractor reports to (may be None).
	_downloader = None

	def __init__(self, downloader=None):
		"""Create the extractor, optionally attaching a downloader."""
		self._ready = False
		self.set_downloader(downloader)

	@staticmethod
	def suitable(url):
		"""Return True if this IE can handle the given URL."""
		return False

	def initialize(self):
		"""Run one-time initialization (authentication, etc.) lazily."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Initialize if needed, then extract information for url."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Attach the downloader this IE reports results and errors to."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
1032
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Matches youtu.be short links, /v|embed|e/ paths and watch URLs;
	# group 2 captures the video id.
	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
	# URL used to switch the site language to English (hl=en).
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	# Machine name looked up in the user's .netrc for credentials.
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
	# Maps format codes to file extensions.
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'45': 'webm',
	}
1053
1054         @staticmethod
1055         def suitable(url):
1056                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
1057
1058         def report_lang(self):
1059                 """Report attempt to set language."""
1060                 self._downloader.to_screen(u'[youtube] Setting language')
1061
1062         def report_login(self):
1063                 """Report attempt to log in."""
1064                 self._downloader.to_screen(u'[youtube] Logging in')
1065
1066         def report_age_confirmation(self):
1067                 """Report attempt to confirm age."""
1068                 self._downloader.to_screen(u'[youtube] Confirming age')
1069
1070         def report_video_webpage_download(self, video_id):
1071                 """Report attempt to download video webpage."""
1072                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1073
1074         def report_video_info_webpage_download(self, video_id):
1075                 """Report attempt to download video info webpage."""
1076                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1077
1078         def report_information_extraction(self, video_id):
1079                 """Report attempt to extract video information."""
1080                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1081
	def report_unavailable_format(self, video_id, format):
		"""Report that the requested format is not available for this video."""
		self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1085
1086         def report_rtmp_download(self):
1087                 """Indicate the download will use the RTMP protocol."""
1088                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1089
	def _real_initialize(self):
		"""Set the site language and optionally log in and confirm age.

		Credentials come from the 'username'/'password' downloader
		parameters or, with 'usenetrc', from the 'youtube' machine
		entry in ~/.netrc. Failures are reported as warnings (or an
		error for age confirmation) and abort initialization early.
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':         '/',
				'action_login': 'Log In',
				'username':     username,
				'password':     password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the login form is still present, authentication failed.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':             '/',
				'action_confirm':       'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return
1158
1159         def _real_extract(self, url):
1160                 # Extract video id from URL
1161                 mobj = re.match(self._VALID_URL, url)
1162                 if mobj is None:
1163                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1164                         return
1165                 video_id = mobj.group(2)
1166
1167                 # Get video webpage
1168                 self.report_video_webpage_download(video_id)
1169                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id)
1170                 try:
1171                         video_webpage = urllib2.urlopen(request).read()
1172                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1173                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1174                         return
1175
1176                 # Attempt to extract SWF player URL
1177                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1178                 if mobj is not None:
1179                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1180                 else:
1181                         player_url = None
1182
1183                 # Get video info
1184                 self.report_video_info_webpage_download(video_id)
1185                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1186                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1187                                            % (video_id, el_type))
1188                         request = urllib2.Request(video_info_url)
1189                         try:
1190                                 video_info_webpage = urllib2.urlopen(request).read()
1191                                 video_info = parse_qs(video_info_webpage)
1192                                 if 'token' in video_info:
1193                                         break
1194                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1195                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1196                                 return
1197                 if 'token' not in video_info:
1198                         if 'reason' in video_info:
1199                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1200                         else:
1201                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1202                         return
1203
1204                 # Start extracting information
1205                 self.report_information_extraction(video_id)
1206
1207                 # uploader
1208                 if 'author' not in video_info:
1209                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1210                         return
1211                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1212
1213                 # title
1214                 if 'title' not in video_info:
1215                         self._downloader.trouble(u'ERROR: unable to extract video title')
1216                         return
1217                 video_title = urllib.unquote_plus(video_info['title'][0])
1218                 video_title = video_title.decode('utf-8')
1219                 video_title = sanitize_title(video_title)
1220
1221                 # simplified title
1222                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1223                 simple_title = simple_title.strip(ur'_')
1224
1225                 # thumbnail image
1226                 if 'thumbnail_url' not in video_info:
1227                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1228                         video_thumbnail = ''
1229                 else:   # don't panic if we can't find it
1230                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1231
1232                 # upload date
1233                 upload_date = u'NA'
1234                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1235                 if mobj is not None:
1236                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1237                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1238                         for expression in format_expressions:
1239                                 try:
1240                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1241                                 except:
1242                                         pass
1243
1244                 # description
1245                 try:
1246                         lxml.etree
1247                 except NameError:
1248                         video_description = u'No description available.'
1249                         if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1250                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1251                                 if mobj is not None:
1252                                         video_description = mobj.group(1).decode('utf-8')
1253                 else:
1254                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1255                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1256                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1257                         # TODO use another parser
1258
1259                 # token
1260                 video_token = urllib.unquote_plus(video_info['token'][0])
1261
1262                 # Decide which formats to download
1263                 req_format = self._downloader.params.get('format', None)
1264
1265                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1266                         self.report_rtmp_download()
1267                         video_url_list = [(None, video_info['conn'][0])]
1268                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1269                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1270                         url_data = [parse_qs(uds) for uds in url_data_strs]
1271                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1272                         url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1273
1274                         format_limit = self._downloader.params.get('format_limit', None)
1275                         if format_limit is not None and format_limit in self._available_formats:
1276                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1277                         else:
1278                                 format_list = self._available_formats
1279                         existing_formats = [x for x in format_list if x in url_map]
1280                         if len(existing_formats) == 0:
1281                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1282                                 return
1283                         if req_format is None:
1284                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1285                         elif req_format == '-1':
1286                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1287                         else:
1288                                 # Specific format
1289                                 if req_format not in url_map:
1290                                         self._downloader.trouble(u'ERROR: requested format not available')
1291                                         return
1292                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
1293                 else:
1294                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1295                         return
1296
1297                 for format_param, video_real_url in video_url_list:
1298                         # At this point we have a new video
1299                         self._downloader.increment_downloads()
1300
1301                         # Extension
1302                         video_extension = self._video_extensions.get(format_param, 'flv')
1303
1304                         try:
1305                                 # Process video information
1306                                 self._downloader.process_info({
1307                                         'id':           video_id.decode('utf-8'),
1308                                         'url':          video_real_url.decode('utf-8'),
1309                                         'uploader':     video_uploader.decode('utf-8'),
1310                                         'upload_date':  upload_date,
1311                                         'title':        video_title,
1312                                         'stitle':       simple_title,
1313                                         'ext':          video_extension.decode('utf-8'),
1314                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1315                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1316                                         'description':  video_description,
1317                                         'player_url':   player_url,
1318                                 })
1319                         except UnavailableVideoError, err:
1320                                 self._downloader.trouble(u'\nERROR: unable to download video')
1321
1322
1323 class MetacafeIE(InfoExtractor):
1324         """Information Extractor for metacafe.com."""
1325
1326         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1327         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1328         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1329         _youtube_ie = None
1330
1331         def __init__(self, youtube_ie, downloader=None):
1332                 InfoExtractor.__init__(self, downloader)
1333                 self._youtube_ie = youtube_ie
1334
1335         @staticmethod
1336         def suitable(url):
1337                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1338
1339         def report_disclaimer(self):
1340                 """Report disclaimer retrieval."""
1341                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1342
1343         def report_age_confirmation(self):
1344                 """Report attempt to confirm age."""
1345                 self._downloader.to_screen(u'[metacafe] Confirming age')
1346
1347         def report_download_webpage(self, video_id):
1348                 """Report webpage download."""
1349                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1350
1351         def report_extraction(self, video_id):
1352                 """Report information extraction."""
1353                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1354
1355         def _real_initialize(self):
1356                 # Retrieve disclaimer
1357                 request = urllib2.Request(self._DISCLAIMER)
1358                 try:
1359                         self.report_disclaimer()
1360                         disclaimer = urllib2.urlopen(request).read()
1361                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1362                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1363                         return
1364
1365                 # Confirm age
1366                 disclaimer_form = {
1367                         'filters': '0',
1368                         'submit': "Continue - I'm over 18",
1369                         }
1370                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1371                 try:
1372                         self.report_age_confirmation()
1373                         disclaimer = urllib2.urlopen(request).read()
1374                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1375                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1376                         return
1377
1378         def _real_extract(self, url):
1379                 # Extract id and simplified title from URL
1380                 mobj = re.match(self._VALID_URL, url)
1381                 if mobj is None:
1382                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1383                         return
1384
1385                 video_id = mobj.group(1)
1386
1387                 # Check if video comes from YouTube
1388                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1389                 if mobj2 is not None:
1390                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1391                         return
1392
1393                 # At this point we have a new video
1394                 self._downloader.increment_downloads()
1395
1396                 simple_title = mobj.group(2).decode('utf-8')
1397
1398                 # Retrieve video webpage to extract further information
1399                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1400                 try:
1401                         self.report_download_webpage(video_id)
1402                         webpage = urllib2.urlopen(request).read()
1403                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1404                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1405                         return
1406
1407                 # Extract URL, uploader and title from webpage
1408                 self.report_extraction(video_id)
1409                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1410                 if mobj is not None:
1411                         mediaURL = urllib.unquote(mobj.group(1))
1412                         video_extension = mediaURL[-3:]
1413
1414                         # Extract gdaKey if available
1415                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1416                         if mobj is None:
1417                                 video_url = mediaURL
1418                         else:
1419                                 gdaKey = mobj.group(1)
1420                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1421                 else:
1422                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1423                         if mobj is None:
1424                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1425                                 return
1426                         vardict = parse_qs(mobj.group(1))
1427                         if 'mediaData' not in vardict:
1428                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1429                                 return
1430                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1431                         if mobj is None:
1432                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1433                                 return
1434                         mediaURL = mobj.group(1).replace('\\/', '/')
1435                         video_extension = mediaURL[-3:]
1436                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1437
1438                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1439                 if mobj is None:
1440                         self._downloader.trouble(u'ERROR: unable to extract title')
1441                         return
1442                 video_title = mobj.group(1).decode('utf-8')
1443                 video_title = sanitize_title(video_title)
1444
1445                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1446                 if mobj is None:
1447                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1448                         return
1449                 video_uploader = mobj.group(1)
1450
1451                 try:
1452                         # Process video information
1453                         self._downloader.process_info({
1454                                 'id':           video_id.decode('utf-8'),
1455                                 'url':          video_url.decode('utf-8'),
1456                                 'uploader':     video_uploader.decode('utf-8'),
1457                                 'upload_date':  u'NA',
1458                                 'title':        video_title,
1459                                 'stitle':       simple_title,
1460                                 'ext':          video_extension.decode('utf-8'),
1461                                 'format':       u'NA',
1462                                 'player_url':   None,
1463                         })
1464                 except UnavailableVideoError:
1465                         self._downloader.trouble(u'\nERROR: unable to download video')
1466
1467
1468 class DailymotionIE(InfoExtractor):
1469         """Information Extractor for Dailymotion"""
1470
1471         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1472
1473         def __init__(self, downloader=None):
1474                 InfoExtractor.__init__(self, downloader)
1475
1476         @staticmethod
1477         def suitable(url):
1478                 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1479
1480         def report_download_webpage(self, video_id):
1481                 """Report webpage download."""
1482                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1483
1484         def report_extraction(self, video_id):
1485                 """Report information extraction."""
1486                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1487
1488         def _real_initialize(self):
1489                 return
1490
1491         def _real_extract(self, url):
1492                 # Extract id and simplified title from URL
1493                 mobj = re.match(self._VALID_URL, url)
1494                 if mobj is None:
1495                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1496                         return
1497
1498                 # At this point we have a new video
1499                 self._downloader.increment_downloads()
1500                 video_id = mobj.group(1)
1501
1502                 simple_title = mobj.group(2).decode('utf-8')
1503                 video_extension = 'flv'
1504
1505                 # Retrieve video webpage to extract further information
1506                 request = urllib2.Request(url)
1507                 try:
1508                         self.report_download_webpage(video_id)
1509                         webpage = urllib2.urlopen(request).read()
1510                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1511                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1512                         return
1513
1514                 # Extract URL, uploader and title from webpage
1515                 self.report_extraction(video_id)
1516                 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1517                 if mobj is None:
1518                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1519                         return
1520                 mediaURL = urllib.unquote(mobj.group(1))
1521
1522                 # if needed add http://www.dailymotion.com/ if relative URL
1523
1524                 video_url = mediaURL
1525
1526                 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1527                 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1528                 if mobj is None:
1529                         self._downloader.trouble(u'ERROR: unable to extract title')
1530                         return
1531                 video_title = mobj.group(1).decode('utf-8')
1532                 video_title = sanitize_title(video_title)
1533
1534                 mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
1535                 if mobj is None:
1536                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1537                         return
1538                 video_uploader = mobj.group(1)
1539
1540                 try:
1541                         # Process video information
1542                         self._downloader.process_info({
1543                                 'id':           video_id.decode('utf-8'),
1544                                 'url':          video_url.decode('utf-8'),
1545                                 'uploader':     video_uploader.decode('utf-8'),
1546                                 'upload_date':  u'NA',
1547                                 'title':        video_title,
1548                                 'stitle':       simple_title,
1549                                 'ext':          video_extension.decode('utf-8'),
1550                                 'format':       u'NA',
1551                                 'player_url':   None,
1552                         })
1553                 except UnavailableVideoError:
1554                         self._downloader.trouble(u'\nERROR: unable to download video')
1555
1556 class GoogleIE(InfoExtractor):
1557         """Information extractor for video.google.com."""
1558
1559         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1560
1561         def __init__(self, downloader=None):
1562                 InfoExtractor.__init__(self, downloader)
1563
1564         @staticmethod
1565         def suitable(url):
1566                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1567
1568         def report_download_webpage(self, video_id):
1569                 """Report webpage download."""
1570                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1571
1572         def report_extraction(self, video_id):
1573                 """Report information extraction."""
1574                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1575
1576         def _real_initialize(self):
1577                 return
1578
1579         def _real_extract(self, url):
1580                 # Extract id from URL
1581                 mobj = re.match(self._VALID_URL, url)
1582                 if mobj is None:
1583                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1584                         return
1585
1586                 # At this point we have a new video
1587                 self._downloader.increment_downloads()
1588                 video_id = mobj.group(1)
1589
1590                 video_extension = 'mp4'
1591
1592                 # Retrieve video webpage to extract further information
1593                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1594                 try:
1595                         self.report_download_webpage(video_id)
1596                         webpage = urllib2.urlopen(request).read()
1597                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1598                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1599                         return
1600
1601                 # Extract URL, uploader, and title from webpage
1602                 self.report_extraction(video_id)
1603                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1604                 if mobj is None:
1605                         video_extension = 'flv'
1606                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1607                 if mobj is None:
1608                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1609                         return
1610                 mediaURL = urllib.unquote(mobj.group(1))
1611                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1612                 mediaURL = mediaURL.replace('\\x26', '\x26')
1613
1614                 video_url = mediaURL
1615
1616                 mobj = re.search(r'<title>(.*)</title>', webpage)
1617                 if mobj is None:
1618                         self._downloader.trouble(u'ERROR: unable to extract title')
1619                         return
1620                 video_title = mobj.group(1).decode('utf-8')
1621                 video_title = sanitize_title(video_title)
1622                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1623
1624                 # Extract video description
1625                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1626                 if mobj is None:
1627                         self._downloader.trouble(u'ERROR: unable to extract video description')
1628                         return
1629                 video_description = mobj.group(1).decode('utf-8')
1630                 if not video_description:
1631                         video_description = 'No description available.'
1632
1633                 # Extract video thumbnail
1634                 if self._downloader.params.get('forcethumbnail', False):
1635                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1636                         try:
1637                                 webpage = urllib2.urlopen(request).read()
1638                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1639                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1640                                 return
1641                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1642                         if mobj is None:
1643                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1644                                 return
1645                         video_thumbnail = mobj.group(1)
1646                 else:   # we need something to pass to process_info
1647                         video_thumbnail = ''
1648
1649
1650                 try:
1651                         # Process video information
1652                         self._downloader.process_info({
1653                                 'id':           video_id.decode('utf-8'),
1654                                 'url':          video_url.decode('utf-8'),
1655                                 'uploader':     u'NA',
1656                                 'upload_date':  u'NA',
1657                                 'title':        video_title,
1658                                 'stitle':       simple_title,
1659                                 'ext':          video_extension.decode('utf-8'),
1660                                 'format':       u'NA',
1661                                 'player_url':   None,
1662                         })
1663                 except UnavailableVideoError:
1664                         self._downloader.trouble(u'\nERROR: unable to download video')
1665
1666
1667 class PhotobucketIE(InfoExtractor):
1668         """Information extractor for photobucket.com."""
1669
1670         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1671
1672         def __init__(self, downloader=None):
1673                 InfoExtractor.__init__(self, downloader)
1674
1675         @staticmethod
1676         def suitable(url):
1677                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1678
1679         def report_download_webpage(self, video_id):
1680                 """Report webpage download."""
1681                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1682
1683         def report_extraction(self, video_id):
1684                 """Report information extraction."""
1685                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1686
1687         def _real_initialize(self):
1688                 return
1689
1690         def _real_extract(self, url):
1691                 # Extract id from URL
1692                 mobj = re.match(self._VALID_URL, url)
1693                 if mobj is None:
1694                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1695                         return
1696
1697                 # At this point we have a new video
1698                 self._downloader.increment_downloads()
1699                 video_id = mobj.group(1)
1700
1701                 video_extension = 'flv'
1702
1703                 # Retrieve video webpage to extract further information
1704                 request = urllib2.Request(url)
1705                 try:
1706                         self.report_download_webpage(video_id)
1707                         webpage = urllib2.urlopen(request).read()
1708                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1709                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1710                         return
1711
1712                 # Extract URL, uploader, and title from webpage
1713                 self.report_extraction(video_id)
1714                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1715                 if mobj is None:
1716                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1717                         return
1718                 mediaURL = urllib.unquote(mobj.group(1))
1719
1720                 video_url = mediaURL
1721
1722                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1723                 if mobj is None:
1724                         self._downloader.trouble(u'ERROR: unable to extract title')
1725                         return
1726                 video_title = mobj.group(1).decode('utf-8')
1727                 video_title = sanitize_title(video_title)
1728                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1729
1730                 video_uploader = mobj.group(2).decode('utf-8')
1731
1732                 try:
1733                         # Process video information
1734                         self._downloader.process_info({
1735                                 'id':           video_id.decode('utf-8'),
1736                                 'url':          video_url.decode('utf-8'),
1737                                 'uploader':     video_uploader,
1738                                 'upload_date':  u'NA',
1739                                 'title':        video_title,
1740                                 'stitle':       simple_title,
1741                                 'ext':          video_extension.decode('utf-8'),
1742                                 'format':       u'NA',
1743                                 'player_url':   None,
1744                         })
1745                 except UnavailableVideoError:
1746                         self._downloader.trouble(u'\nERROR: unable to download video')
1747
1748
1749 class YahooIE(InfoExtractor):
1750         """Information extractor for video.yahoo.com."""
1751
1752         # _VALID_URL matches all Yahoo! Video URLs
1753         # _VPAGE_URL matches only the extractable '/watch/' URLs
1754         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1755         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1756
	def __init__(self, downloader=None):
		"""Initialize the extractor with an optional downloader object."""
		InfoExtractor.__init__(self, downloader)
1759
	@staticmethod
	def suitable(url):
		"""Return True if url matches _VALID_URL (any Yahoo! Video URL)."""
		return (re.match(YahooIE._VALID_URL, url) is not None)
1763
	def report_download_webpage(self, video_id):
		"""Report that the webpage for video_id is being downloaded."""
		self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1767
	def report_extraction(self, video_id):
		"""Report that information extraction for video_id has started."""
		self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1771
	def _real_initialize(self):
		# Yahoo! Video requires no per-session initialization.
		return
1774
1775         def _real_extract(self, url, new_video=True):
1776                 # Extract ID from URL
1777                 mobj = re.match(self._VALID_URL, url)
1778                 if mobj is None:
1779                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1780                         return
1781
1782                 # At this point we have a new video
1783                 self._downloader.increment_downloads()
1784                 video_id = mobj.group(2)
1785                 video_extension = 'flv'
1786
1787                 # Rewrite valid but non-extractable URLs as
1788                 # extractable English language /watch/ URLs
1789                 if re.match(self._VPAGE_URL, url) is None:
1790                         request = urllib2.Request(url)
1791                         try:
1792                                 webpage = urllib2.urlopen(request).read()
1793                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1794                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1795                                 return
1796
1797                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1798                         if mobj is None:
1799                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1800                                 return
1801                         yahoo_id = mobj.group(1)
1802
1803                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1804                         if mobj is None:
1805                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1806                                 return
1807                         yahoo_vid = mobj.group(1)
1808
1809                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1810                         return self._real_extract(url, new_video=False)
1811
1812                 # Retrieve video webpage to extract further information
1813                 request = urllib2.Request(url)
1814                 try:
1815                         self.report_download_webpage(video_id)
1816                         webpage = urllib2.urlopen(request).read()
1817                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1818                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1819                         return
1820
1821                 # Extract uploader and title from webpage
1822                 self.report_extraction(video_id)
1823                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1824                 if mobj is None:
1825                         self._downloader.trouble(u'ERROR: unable to extract video title')
1826                         return
1827                 video_title = mobj.group(1).decode('utf-8')
1828                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1829
1830                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1831                 if mobj is None:
1832                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1833                         return
1834                 video_uploader = mobj.group(1).decode('utf-8')
1835
1836                 # Extract video thumbnail
1837                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1838                 if mobj is None:
1839                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1840                         return
1841                 video_thumbnail = mobj.group(1).decode('utf-8')
1842
1843                 # Extract video description
1844                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1845                 if mobj is None:
1846                         self._downloader.trouble(u'ERROR: unable to extract video description')
1847                         return
1848                 video_description = mobj.group(1).decode('utf-8')
1849                 if not video_description: video_description = 'No description available.'
1850
1851                 # Extract video height and width
1852                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1853                 if mobj is None:
1854                         self._downloader.trouble(u'ERROR: unable to extract video height')
1855                         return
1856                 yv_video_height = mobj.group(1)
1857
1858                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1859                 if mobj is None:
1860                         self._downloader.trouble(u'ERROR: unable to extract video width')
1861                         return
1862                 yv_video_width = mobj.group(1)
1863
1864                 # Retrieve video playlist to extract media URL
1865                 # I'm not completely sure what all these options are, but we
1866                 # seem to need most of them, otherwise the server sends a 401.
1867                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1868                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1869                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1870                                                                   '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1871                                                                   '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1872                 try:
1873                         self.report_download_webpage(video_id)
1874                         webpage = urllib2.urlopen(request).read()
1875                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1876                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1877                         return
1878
1879                 # Extract media URL from playlist XML
1880                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1881                 if mobj is None:
1882                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1883                         return
1884                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1885                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1886
1887                 try:
1888                         # Process video information
1889                         self._downloader.process_info({
1890                                 'id':           video_id.decode('utf-8'),
1891                                 'url':          video_url,
1892                                 'uploader':     video_uploader,
1893                                 'upload_date':  u'NA',
1894                                 'title':        video_title,
1895                                 'stitle':       simple_title,
1896                                 'ext':          video_extension.decode('utf-8'),
1897                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1898                                 'description':  video_description,
1899                                 'thumbnail':    video_thumbnail,
1900                                 'description':  video_description,
1901                                 'player_url':   None,
1902                         })
1903                 except UnavailableVideoError:
1904                         self._downloader.trouble(u'\nERROR: unable to download video')
1905
1906
1907 class GenericIE(InfoExtractor):
1908         """Generic last-resort information extractor."""
1909
1910         def __init__(self, downloader=None):
1911                 InfoExtractor.__init__(self, downloader)
1912
1913         @staticmethod
1914         def suitable(url):
1915                 return True
1916
1917         def report_download_webpage(self, video_id):
1918                 """Report webpage download."""
1919                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1920                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1921
1922         def report_extraction(self, video_id):
1923                 """Report information extraction."""
1924                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1925
1926         def _real_initialize(self):
1927                 return
1928
1929         def _real_extract(self, url):
1930                 # At this point we have a new video
1931                 self._downloader.increment_downloads()
1932
1933                 video_id = url.split('/')[-1]
1934                 request = urllib2.Request(url)
1935                 try:
1936                         self.report_download_webpage(video_id)
1937                         webpage = urllib2.urlopen(request).read()
1938                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1939                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1940                         return
1941                 except ValueError, err:
1942                         # since this is the last-resort InfoExtractor, if
1943                         # this error is thrown, it'll be thrown here
1944                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1945                         return
1946
1947                 self.report_extraction(video_id)
1948                 # Start with something easy: JW Player in SWFObject
1949                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1950                 if mobj is None:
1951                         # Broaden the search a little bit
1952                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1953                 if mobj is None:
1954                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1955                         return
1956
1957                 # It's possible that one of the regexes
1958                 # matched, but returned an empty group:
1959                 if mobj.group(1) is None:
1960                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1961                         return
1962
1963                 video_url = urllib.unquote(mobj.group(1))
1964                 video_id  = os.path.basename(video_url)
1965
1966                 # here's a fun little line of code for you:
1967                 video_extension = os.path.splitext(video_id)[1][1:]
1968                 video_id        = os.path.splitext(video_id)[0]
1969
1970                 # it's tempting to parse this further, but you would
1971                 # have to take into account all the variations like
1972                 #   Video Title - Site Name
1973                 #   Site Name | Video Title
1974                 #   Video Title - Tagline | Site Name
1975                 # and so on and so forth; it's just not practical
1976                 mobj = re.search(r'<title>(.*)</title>', webpage)
1977                 if mobj is None:
1978                         self._downloader.trouble(u'ERROR: unable to extract title')
1979                         return
1980                 video_title = mobj.group(1).decode('utf-8')
1981                 video_title = sanitize_title(video_title)
1982                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1983
1984                 # video uploader is domain name
1985                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1986                 if mobj is None:
1987                         self._downloader.trouble(u'ERROR: unable to extract title')
1988                         return
1989                 video_uploader = mobj.group(1).decode('utf-8')
1990
1991                 try:
1992                         # Process video information
1993                         self._downloader.process_info({
1994                                 'id':           video_id.decode('utf-8'),
1995                                 'url':          video_url.decode('utf-8'),
1996                                 'uploader':     video_uploader,
1997                                 'upload_date':  u'NA',
1998                                 'title':        video_title,
1999                                 'stitle':       simple_title,
2000                                 'ext':          video_extension.decode('utf-8'),
2001                                 'format':       u'NA',
2002                                 'player_url':   None,
2003                         })
2004                 except UnavailableVideoError, err:
2005                         self._downloader.trouble(u'\nERROR: unable to download video')
2006
2007
2008 class YoutubeSearchIE(InfoExtractor):
2009         """Information Extractor for YouTube search queries."""
2010         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
2011         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2012         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2013         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2014         _youtube_ie = None
2015         _max_youtube_results = 1000
2016
2017         def __init__(self, youtube_ie, downloader=None):
2018                 InfoExtractor.__init__(self, downloader)
2019                 self._youtube_ie = youtube_ie
2020
2021         @staticmethod
2022         def suitable(url):
2023                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
2024
2025         def report_download_page(self, query, pagenum):
2026                 """Report attempt to download playlist page with given number."""
2027                 query = query.decode(preferredencoding())
2028                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2029
2030         def _real_initialize(self):
2031                 self._youtube_ie.initialize()
2032
2033         def _real_extract(self, query):
2034                 mobj = re.match(self._VALID_QUERY, query)
2035                 if mobj is None:
2036                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2037                         return
2038
2039                 prefix, query = query.split(':')
2040                 prefix = prefix[8:]
2041                 query  = query.encode('utf-8')
2042                 if prefix == '':
2043                         self._download_n_results(query, 1)
2044                         return
2045                 elif prefix == 'all':
2046                         self._download_n_results(query, self._max_youtube_results)
2047                         return
2048                 else:
2049                         try:
2050                                 n = long(prefix)
2051                                 if n <= 0:
2052                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2053                                         return
2054                                 elif n > self._max_youtube_results:
2055                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
2056                                         n = self._max_youtube_results
2057                                 self._download_n_results(query, n)
2058                                 return
2059                         except ValueError: # parsing prefix as integer fails
2060                                 self._download_n_results(query, 1)
2061                                 return
2062
2063         def _download_n_results(self, query, n):
2064                 """Downloads a specified number of results for a query"""
2065
2066                 video_ids = []
2067                 already_seen = set()
2068                 pagenum = 1
2069
2070                 while True:
2071                         self.report_download_page(query, pagenum)
2072                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2073                         request = urllib2.Request(result_url)
2074                         try:
2075                                 page = urllib2.urlopen(request).read()
2076                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2077                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2078                                 return
2079
2080                         # Extract video identifiers
2081                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2082                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2083                                 if video_id not in already_seen:
2084                                         video_ids.append(video_id)
2085                                         already_seen.add(video_id)
2086                                         if len(video_ids) == n:
2087                                                 # Specified n videos reached
2088                                                 for id in video_ids:
2089                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2090                                                 return
2091
2092                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2093                                 for id in video_ids:
2094                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2095                                 return
2096
2097                         pagenum = pagenum + 1
2098
2099 class GoogleSearchIE(InfoExtractor):
2100         """Information Extractor for Google Video search queries."""
2101         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
2102         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2103         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2104         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2105         _google_ie = None
2106         _max_google_results = 1000
2107
2108         def __init__(self, google_ie, downloader=None):
2109                 InfoExtractor.__init__(self, downloader)
2110                 self._google_ie = google_ie
2111
2112         @staticmethod
2113         def suitable(url):
2114                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
2115
2116         def report_download_page(self, query, pagenum):
2117                 """Report attempt to download playlist page with given number."""
2118                 query = query.decode(preferredencoding())
2119                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2120
2121         def _real_initialize(self):
2122                 self._google_ie.initialize()
2123
2124         def _real_extract(self, query):
2125                 mobj = re.match(self._VALID_QUERY, query)
2126                 if mobj is None:
2127                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2128                         return
2129
2130                 prefix, query = query.split(':')
2131                 prefix = prefix[8:]
2132                 query  = query.encode('utf-8')
2133                 if prefix == '':
2134                         self._download_n_results(query, 1)
2135                         return
2136                 elif prefix == 'all':
2137                         self._download_n_results(query, self._max_google_results)
2138                         return
2139                 else:
2140                         try:
2141                                 n = long(prefix)
2142                                 if n <= 0:
2143                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2144                                         return
2145                                 elif n > self._max_google_results:
2146                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
2147                                         n = self._max_google_results
2148                                 self._download_n_results(query, n)
2149                                 return
2150                         except ValueError: # parsing prefix as integer fails
2151                                 self._download_n_results(query, 1)
2152                                 return
2153
2154         def _download_n_results(self, query, n):
2155                 """Downloads a specified number of results for a query"""
2156
2157                 video_ids = []
2158                 already_seen = set()
2159                 pagenum = 1
2160
2161                 while True:
2162                         self.report_download_page(query, pagenum)
2163                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2164                         request = urllib2.Request(result_url)
2165                         try:
2166                                 page = urllib2.urlopen(request).read()
2167                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2168                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2169                                 return
2170
2171                         # Extract video identifiers
2172                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2173                                 video_id = mobj.group(1)
2174                                 if video_id not in already_seen:
2175                                         video_ids.append(video_id)
2176                                         already_seen.add(video_id)
2177                                         if len(video_ids) == n:
2178                                                 # Specified n videos reached
2179                                                 for id in video_ids:
2180                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2181                                                 return
2182
2183                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2184                                 for id in video_ids:
2185                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2186                                 return
2187
2188                         pagenum = pagenum + 1
2189
2190 class YahooSearchIE(InfoExtractor):
2191         """Information Extractor for Yahoo! Video search queries."""
2192         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
2193         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2194         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2195         _MORE_PAGES_INDICATOR = r'\s*Next'
2196         _yahoo_ie = None
2197         _max_yahoo_results = 1000
2198
2199         def __init__(self, yahoo_ie, downloader=None):
2200                 InfoExtractor.__init__(self, downloader)
2201                 self._yahoo_ie = yahoo_ie
2202
2203         @staticmethod
2204         def suitable(url):
2205                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2206
2207         def report_download_page(self, query, pagenum):
2208                 """Report attempt to download playlist page with given number."""
2209                 query = query.decode(preferredencoding())
2210                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2211
2212         def _real_initialize(self):
2213                 self._yahoo_ie.initialize()
2214
2215         def _real_extract(self, query):
2216                 mobj = re.match(self._VALID_QUERY, query)
2217                 if mobj is None:
2218                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2219                         return
2220
2221                 prefix, query = query.split(':')
2222                 prefix = prefix[8:]
2223                 query  = query.encode('utf-8')
2224                 if prefix == '':
2225                         self._download_n_results(query, 1)
2226                         return
2227                 elif prefix == 'all':
2228                         self._download_n_results(query, self._max_yahoo_results)
2229                         return
2230                 else:
2231                         try:
2232                                 n = long(prefix)
2233                                 if n <= 0:
2234                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2235                                         return
2236                                 elif n > self._max_yahoo_results:
2237                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
2238                                         n = self._max_yahoo_results
2239                                 self._download_n_results(query, n)
2240                                 return
2241                         except ValueError: # parsing prefix as integer fails
2242                                 self._download_n_results(query, 1)
2243                                 return
2244
2245         def _download_n_results(self, query, n):
2246                 """Downloads a specified number of results for a query"""
2247
2248                 video_ids = []
2249                 already_seen = set()
2250                 pagenum = 1
2251
2252                 while True:
2253                         self.report_download_page(query, pagenum)
2254                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2255                         request = urllib2.Request(result_url)
2256                         try:
2257                                 page = urllib2.urlopen(request).read()
2258                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2259                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2260                                 return
2261
2262                         # Extract video identifiers
2263                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2264                                 video_id = mobj.group(1)
2265                                 if video_id not in already_seen:
2266                                         video_ids.append(video_id)
2267                                         already_seen.add(video_id)
2268                                         if len(video_ids) == n:
2269                                                 # Specified n videos reached
2270                                                 for id in video_ids:
2271                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2272                                                 return
2273
2274                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2275                                 for id in video_ids:
2276                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2277                                 return
2278
2279                         pagenum = pagenum + 1
2280
2281 class YoutubePlaylistIE(InfoExtractor):
2282         """Information Extractor for YouTube playlists."""
2283
2284         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist)\?.*?(p|a)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2285         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2286         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2287         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2288         _youtube_ie = None
2289
2290         def __init__(self, youtube_ie, downloader=None):
2291                 InfoExtractor.__init__(self, downloader)
2292                 self._youtube_ie = youtube_ie
2293
2294         @staticmethod
2295         def suitable(url):
2296                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2297
2298         def report_download_page(self, playlist_id, pagenum):
2299                 """Report attempt to download playlist page with given number."""
2300                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2301
2302         def _real_initialize(self):
2303                 self._youtube_ie.initialize()
2304
2305         def _real_extract(self, url):
2306                 # Extract playlist id
2307                 mobj = re.match(self._VALID_URL, url)
2308                 if mobj is None:
2309                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2310                         return
2311
2312                 # Single video case
2313                 if mobj.group(3) is not None:
2314                         self._youtube_ie.extract(mobj.group(3))
2315                         return
2316
2317                 # Download playlist pages
2318                 # prefix is 'p' as default for playlists but there are other types that need extra care
2319                 playlist_prefix = mobj.group(1)
2320                 if playlist_prefix == 'a':
2321                         playlist_access = 'artist'
2322                 else:
2323                         playlist_prefix = 'p'
2324                         playlist_access = 'view_play_list'
2325                 playlist_id = mobj.group(2)
2326                 video_ids = []
2327                 pagenum = 1
2328
2329                 while True:
2330                         self.report_download_page(playlist_id, pagenum)
2331                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2332                         try:
2333                                 page = urllib2.urlopen(request).read()
2334                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2335                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2336                                 return
2337
2338                         # Extract video identifiers
2339                         ids_in_page = []
2340                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2341                                 if mobj.group(1) not in ids_in_page:
2342                                         ids_in_page.append(mobj.group(1))
2343                         video_ids.extend(ids_in_page)
2344
2345                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2346                                 break
2347                         pagenum = pagenum + 1
2348
2349                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2350                 playlistend = self._downloader.params.get('playlistend', -1)
2351                 video_ids = video_ids[playliststart:playlistend]
2352
2353                 for id in video_ids:
2354                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2355                 return
2356
2357 class YoutubeUserIE(InfoExtractor):
2358         """Information Extractor for YouTube users."""
2359
2360         _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2361         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2362         _GDATA_PAGE_SIZE = 50
2363         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2364         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2365         _youtube_ie = None
2366
2367         def __init__(self, youtube_ie, downloader=None):
2368                 InfoExtractor.__init__(self, downloader)
2369                 self._youtube_ie = youtube_ie
2370
2371         @staticmethod
2372         def suitable(url):
2373                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2374
2375         def report_download_page(self, username, start_index):
2376                 """Report attempt to download user page."""
2377                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2378                                            (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2379
	def _real_initialize(self):
		# Delegate initialization (e.g. login) to the wrapped YoutubeIE.
		self._youtube_ie.initialize()
2382
2383         def _real_extract(self, url):
2384                 # Extract username
2385                 mobj = re.match(self._VALID_URL, url)
2386                 if mobj is None:
2387                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2388                         return
2389
2390                 username = mobj.group(1)
2391
2392                 # Download video ids using YouTube Data API. Result size per
2393                 # query is limited (currently to 50 videos) so we need to query
2394                 # page by page until there are no video ids - it means we got
2395                 # all of them.
2396
2397                 video_ids = []
2398                 pagenum = 0
2399
2400                 while True:
2401                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2402                         self.report_download_page(username, start_index)
2403
2404                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2405
2406                         try:
2407                                 page = urllib2.urlopen(request).read()
2408                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2409                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2410                                 return
2411
2412                         # Extract video identifiers
2413                         ids_in_page = []
2414
2415                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2416                                 if mobj.group(1) not in ids_in_page:
2417                                         ids_in_page.append(mobj.group(1))
2418
2419                         video_ids.extend(ids_in_page)
2420
2421                         # A little optimization - if current page is not
2422                         # "full", ie. does not contain PAGE_SIZE video ids then
2423                         # we can assume that this page is the last one - there
2424                         # are no more ids on further pages - no need to query
2425                         # again.
2426
2427                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2428                                 break
2429
2430                         pagenum += 1
2431
2432                 all_ids_count = len(video_ids)
2433                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2434                 playlistend = self._downloader.params.get('playlistend', -1)
2435
2436                 if playlistend == -1:
2437                         video_ids = video_ids[playliststart:]
2438                 else:
2439                         video_ids = video_ids[playliststart:playlistend]
2440
2441                 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2442                                                                   (username, all_ids_count, len(video_ids)))
2443
2444                 for video_id in video_ids:
2445                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2446
2447
2448 class DepositFilesIE(InfoExtractor):
2449         """Information extractor for depositfiles.com"""
2450
2451         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2452
2453         def __init__(self, downloader=None):
2454                 InfoExtractor.__init__(self, downloader)
2455
2456         @staticmethod
2457         def suitable(url):
2458                 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2459
2460         def report_download_webpage(self, file_id):
2461                 """Report webpage download."""
2462                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2463
2464         def report_extraction(self, file_id):
2465                 """Report information extraction."""
2466                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2467
2468         def _real_initialize(self):
2469                 return
2470
2471         def _real_extract(self, url):
2472                 # At this point we have a new file
2473                 self._downloader.increment_downloads()
2474
2475                 file_id = url.split('/')[-1]
2476                 # Rebuild url in english locale
2477                 url = 'http://depositfiles.com/en/files/' + file_id
2478
2479                 # Retrieve file webpage with 'Free download' button pressed
2480                 free_download_indication = { 'gateway_result' : '1' }
2481                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2482                 try:
2483                         self.report_download_webpage(file_id)
2484                         webpage = urllib2.urlopen(request).read()
2485                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2486                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2487                         return
2488
2489                 # Search for the real file URL
2490                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2491                 if (mobj is None) or (mobj.group(1) is None):
2492                         # Try to figure out reason of the error.
2493                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2494                         if (mobj is not None) and (mobj.group(1) is not None):
2495                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2496                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2497                         else:
2498                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2499                         return
2500
2501                 file_url = mobj.group(1)
2502                 file_extension = os.path.splitext(file_url)[1][1:]
2503
2504                 # Search for file title
2505                 mobj = re.search(r'<b title="(.*?)">', webpage)
2506                 if mobj is None:
2507                         self._downloader.trouble(u'ERROR: unable to extract title')
2508                         return
2509                 file_title = mobj.group(1).decode('utf-8')
2510
2511                 try:
2512                         # Process file information
2513                         self._downloader.process_info({
2514                                 'id':           file_id.decode('utf-8'),
2515                                 'url':          file_url.decode('utf-8'),
2516                                 'uploader':     u'NA',
2517                                 'upload_date':  u'NA',
2518                                 'title':        file_title,
2519                                 'stitle':       file_title,
2520                                 'ext':          file_extension.decode('utf-8'),
2521                                 'format':       u'NA',
2522                                 'player_url':   None,
2523                         })
2524                 except UnavailableVideoError, err:
2525                         self._downloader.trouble(u'ERROR: unable to download file')
2526
2527 class FacebookIE(InfoExtractor):
2528         """Information Extractor for Facebook"""
2529
2530         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2531         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2532         _NETRC_MACHINE = 'facebook'
2533         _available_formats = ['highqual', 'lowqual']
2534         _video_extensions = {
2535                 'highqual': 'mp4',
2536                 'lowqual': 'mp4',
2537         }
2538
2539         def __init__(self, downloader=None):
2540                 InfoExtractor.__init__(self, downloader)
2541
2542         @staticmethod
2543         def suitable(url):
2544                 return (re.match(FacebookIE._VALID_URL, url) is not None)
2545
2546         def _reporter(self, message):
2547                 """Add header and report message."""
2548                 self._downloader.to_screen(u'[facebook] %s' % message)
2549
2550         def report_login(self):
2551                 """Report attempt to log in."""
2552                 self._reporter(u'Logging in')
2553
2554         def report_video_webpage_download(self, video_id):
2555                 """Report attempt to download video webpage."""
2556                 self._reporter(u'%s: Downloading video webpage' % video_id)
2557
2558         def report_information_extraction(self, video_id):
2559                 """Report attempt to extract video information."""
2560                 self._reporter(u'%s: Extracting video information' % video_id)
2561
2562         def _parse_page(self, video_webpage):
2563                 """Extract video information from page"""
2564                 # General data
2565                 data = {'title': r'class="video_title datawrap">(.*?)</',
2566                         'description': r'<div class="datawrap">(.*?)</div>',
2567                         'owner': r'\("video_owner_name", "(.*?)"\)',
2568                         'upload_date': r'data-date="(.*?)"',
2569                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2570                         }
2571                 video_info = {}
2572                 for piece in data.keys():
2573                         mobj = re.search(data[piece], video_webpage)
2574                         if mobj is not None:
2575                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2576
2577                 # Video urls
2578                 video_urls = {}
2579                 for fmt in self._available_formats:
2580                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2581                         if mobj is not None:
2582                                 # URL is in a Javascript segment inside an escaped Unicode format within
2583                                 # the generally utf-8 page
2584                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2585                 video_info['video_urls'] = video_urls
2586
2587                 return video_info
2588
2589         def _real_initialize(self):
2590                 if self._downloader is None:
2591                         return
2592
2593                 useremail = None
2594                 password = None
2595                 downloader_params = self._downloader.params
2596
2597                 # Attempt to use provided username and password or .netrc data
2598                 if downloader_params.get('username', None) is not None:
2599                         useremail = downloader_params['username']
2600                         password = downloader_params['password']
2601                 elif downloader_params.get('usenetrc', False):
2602                         try:
2603                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2604                                 if info is not None:
2605                                         useremail = info[0]
2606                                         password = info[2]
2607                                 else:
2608                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2609                         except (IOError, netrc.NetrcParseError), err:
2610                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2611                                 return
2612
2613                 if useremail is None:
2614                         return
2615
2616                 # Log in
2617                 login_form = {
2618                         'email': useremail,
2619                         'pass': password,
2620                         'login': 'Log+In'
2621                         }
2622                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2623                 try:
2624                         self.report_login()
2625                         login_results = urllib2.urlopen(request).read()
2626                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2627                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2628                                 return
2629                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2630                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2631                         return
2632
2633         def _real_extract(self, url):
2634                 mobj = re.match(self._VALID_URL, url)
2635                 if mobj is None:
2636                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2637                         return
2638                 video_id = mobj.group('ID')
2639
2640                 # Get video webpage
2641                 self.report_video_webpage_download(video_id)
2642                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2643                 try:
2644                         page = urllib2.urlopen(request)
2645                         video_webpage = page.read()
2646                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2647                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2648                         return
2649
2650                 # Start extracting information
2651                 self.report_information_extraction(video_id)
2652
2653                 # Extract information
2654                 video_info = self._parse_page(video_webpage)
2655
2656                 # uploader
2657                 if 'owner' not in video_info:
2658                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2659                         return
2660                 video_uploader = video_info['owner']
2661
2662                 # title
2663                 if 'title' not in video_info:
2664                         self._downloader.trouble(u'ERROR: unable to extract video title')
2665                         return
2666                 video_title = video_info['title']
2667                 video_title = video_title.decode('utf-8')
2668                 video_title = sanitize_title(video_title)
2669
2670                 # simplified title
2671                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2672                 simple_title = simple_title.strip(ur'_')
2673
2674                 # thumbnail image
2675                 if 'thumbnail' not in video_info:
2676                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2677                         video_thumbnail = ''
2678                 else:
2679                         video_thumbnail = video_info['thumbnail']
2680
2681                 # upload date
2682                 upload_date = u'NA'
2683                 if 'upload_date' in video_info:
2684                         upload_time = video_info['upload_date']
2685                         timetuple = email.utils.parsedate_tz(upload_time)
2686                         if timetuple is not None:
2687                                 try:
2688                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2689                                 except:
2690                                         pass
2691
2692                 # description
2693                 video_description = video_info.get('description', 'No description available.')
2694
2695                 url_map = video_info['video_urls']
2696                 if len(url_map.keys()) > 0:
2697                         # Decide which formats to download
2698                         req_format = self._downloader.params.get('format', None)
2699                         format_limit = self._downloader.params.get('format_limit', None)
2700
2701                         if format_limit is not None and format_limit in self._available_formats:
2702                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2703                         else:
2704                                 format_list = self._available_formats
2705                         existing_formats = [x for x in format_list if x in url_map]
2706                         if len(existing_formats) == 0:
2707                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2708                                 return
2709                         if req_format is None:
2710                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2711                         elif req_format == '-1':
2712                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2713                         else:
2714                                 # Specific format
2715                                 if req_format not in url_map:
2716                                         self._downloader.trouble(u'ERROR: requested format not available')
2717                                         return
2718                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2719
2720                 for format_param, video_real_url in video_url_list:
2721
2722                         # At this point we have a new video
2723                         self._downloader.increment_downloads()
2724
2725                         # Extension
2726                         video_extension = self._video_extensions.get(format_param, 'mp4')
2727
2728                         try:
2729                                 # Process video information
2730                                 self._downloader.process_info({
2731                                         'id':           video_id.decode('utf-8'),
2732                                         'url':          video_real_url.decode('utf-8'),
2733                                         'uploader':     video_uploader.decode('utf-8'),
2734                                         'upload_date':  upload_date,
2735                                         'title':        video_title,
2736                                         'stitle':       simple_title,
2737                                         'ext':          video_extension.decode('utf-8'),
2738                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2739                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2740                                         'description':  video_description.decode('utf-8'),
2741                                         'player_url':   None,
2742                                 })
2743                         except UnavailableVideoError, err:
2744                                 self._downloader.trouble(u'\nERROR: unable to download video')
2745
2746 class BlipTVIE(InfoExtractor):
2747         """Information extractor for blip.tv"""
2748
2749         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2750         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2751
2752         @staticmethod
2753         def suitable(url):
2754                 return (re.match(BlipTVIE._VALID_URL, url) is not None)
2755
2756         def report_extraction(self, file_id):
2757                 """Report information extraction."""
2758                 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2759
2760         def _simplify_title(self, title):
2761                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2762                 res = res.strip(ur'_')
2763                 return res
2764
2765         def _real_extract(self, url):
2766                 mobj = re.match(self._VALID_URL, url)
2767                 if mobj is None:
2768                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2769                         return
2770
2771                 if '?' in url:
2772                         cchar = '&'
2773                 else:
2774                         cchar = '?'
2775                 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2776                 request = urllib2.Request(json_url)
2777                 self.report_extraction(mobj.group(1))
2778                 try:
2779                         json_code = urllib2.urlopen(request).read()
2780                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2781                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2782                         return
2783                 try:
2784                         json_data = json.loads(json_code)
2785                         if 'Post' in json_data:
2786                                 data = json_data['Post']
2787                         else:
2788                                 data = json_data
2789
2790                         upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2791                         video_url = data['media']['url']
2792                         umobj = re.match(self._URL_EXT, video_url)
2793                         if umobj is None:
2794                                 raise ValueError('Can not determine filename extension')
2795                         ext = umobj.group(1)
2796
2797                         self._downloader.increment_downloads()
2798
2799                         info = {
2800                                 'id': data['item_id'],
2801                                 'url': video_url,
2802                                 'uploader': data['display_name'],
2803                                 'upload_date': upload_date,
2804                                 'title': data['title'],
2805                                 'stitle': self._simplify_title(data['title']),
2806                                 'ext': ext,
2807                                 'format': data['media']['mimeType'],
2808                                 'thumbnail': data['thumbnailUrl'],
2809                                 'description': data['description'],
2810                                 'player_url': data['embedUrl']
2811                         }
2812                 except (ValueError,KeyError), err:
2813                         self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2814                         return
2815
2816                 try:
2817                         self._downloader.process_info(info)
2818                 except UnavailableVideoError, err:
2819                         self._downloader.trouble(u'\nERROR: unable to download video')
2820
2821
class PostProcessor(object):
	"""Base class for post processors.

	A PostProcessor is attached to a downloader through its
	add_post_processor() method. After each successful download the
	downloader walks its chain of PostProcessors, calling run() on
	each one — first with the initial information dictionary, then
	with whatever the previous PostProcessor returned.

	The chain stops as soon as a run() call returns None, or once the
	last processor has been run.

	Like InfoExtractor objects, PostProcessors take part in a "mutual
	registration" scheme with their downloader.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		# Delegate so construction and late binding share one code path.
		self.set_downloader(downloader)

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor.

		"information" is an InfoExtractor-style dictionary with one
		extra key, "filepath", naming the downloaded file.

		Returning None stops the postprocessing chain; returning an
		information dictionary (possibly the received one with some
		fields changed) passes it on to the next processor in the
		chain. Implementations may also raise PostProcessingError,
		which the calling downloader takes into account.
		"""
		# Default behaviour: pass the information through unchanged.
		return information
2867
class FFmpegExtractAudioPP(PostProcessor):
	"""Post processor that extracts the audio track of a downloaded
	video into a standalone audio file using ffprobe/ffmpeg, then
	deletes the original video file."""

	def __init__(self, downloader=None, preferredcodec=None):
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		# One of 'best', 'aac' or 'mp3'.
		self._preferredcodec = preferredcodec

	@staticmethod
	def get_audio_codec(path):
		"""Return the audio codec name of the file at path (via
		ffprobe), or None if it cannot be determined."""
		try:
			cmd = ['ffprobe', '-show_streams', '--', path]
			# open() instead of the deprecated file() builtin.
			handle = subprocess.Popen(cmd, stderr=open(os.path.devnull, 'w'), stdout=subprocess.PIPE)
			output = handle.communicate()[0]
			if handle.wait() != 0:
				return None
		except (IOError, OSError):
			# ffprobe missing or not runnable.
			return None
		# ffprobe lists stream attributes sequentially: remember the last
		# codec_name seen and return it when its codec_type is 'audio'.
		audio_codec = None
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Transcode path into out_path with the given audio codec and
		extra ffmpeg options; return True on success."""
		try:
			cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
			# open() instead of the deprecated file() builtin.
			ret = subprocess.call(cmd, stdout=open(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
			return (ret == 0)
		except (IOError, OSError):
			return False

	def run(self, information):
		"""Extract audio from information['filepath']; on success update
		'filepath' to the new audio file and return the information
		dict, otherwise return None to stop the chain."""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
			if filecodec == 'aac' or filecodec == 'mp3':
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					# Raw AAC needs an ADTS container.
					more_opts = ['-f', 'adts']
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = ['-ab', '128k']
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = ['-ab', '128k']
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']

		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		status = self.run_ffmpeg(path, new_path, acodec, more_opts)

		if not status:
			self._downloader.to_stderr(u'WARNING: error running ffmpeg')
			return None

		try:
			# The audio file replaces the original download.
			os.remove(path)
		except (IOError, OSError):
			self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
			return None

		information['filepath'] = new_path
		return information
2949
2950
2951 def updateSelf(downloader, filename):
2952         ''' Update the program file with the latest version from the repository '''
2953         # Note: downloader only used for options
2954         if not os.access(filename, os.W_OK):
2955                 sys.exit('ERROR: no write permissions on %s' % filename)
2956
2957         downloader.to_screen('Updating to latest stable version...')
2958
2959         try:
2960                 latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2961                 latest_version = urllib.urlopen(latest_url).read().strip()
2962                 prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2963                 newcontent = urllib.urlopen(prog_url).read()
2964         except (IOError, OSError), err:
2965                 sys.exit('ERROR: unable to download latest version')
2966
2967         try:
2968                 stream = open(filename, 'w')
2969                 stream.write(newcontent)
2970                 stream.close()
2971         except (IOError, OSError), err:
2972                 sys.exit('ERROR: unable to overwrite current version')
2973
2974         downloader.to_screen('Updated to version %s' % latest_version)
2975
def parseOpts():
	"""Build the optparse command-line parser and parse sys.argv.

	Returns a tuple (parser, opts, args): the parser itself (so callers
	can report errors via parser.error), the parsed option values, and
	the positional arguments (URLs).
	"""
	# Deferred imports: only needed when running as a command-line tool.
	# NOTE(review): getpass is not used in this function; main() expects
	# it (kept here for historical reasons).
	import getpass
	import optparse

	def _format_option_string(option):
		''' ('-o', '--option') -> -o, --option METAVAR'''

		opts = []

		if option._short_opts: opts.append(option._short_opts[0])
		if option._long_opts: opts.append(option._long_opts[0])
		if len(opts) > 1: opts.insert(1, ', ')

		if option.takes_value(): opts.append(' %s' % option.metavar)

		return "".join(opts)

	def _find_term_columns():
		# Determine the terminal width so help text can use the full
		# console; returns None when it cannot be determined.
		columns = os.environ.get('COLUMNS', None)
		if columns:
			try:
				return int(columns)
			except ValueError:
				# $COLUMNS was set to something non-numeric; fall through
				# to asking the terminal directly.
				pass

		try:
			sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			out,err = sp.communicate()
			return int(out.split()[1])
		except (EnvironmentError, ValueError, IndexError):
			# stty missing, not a terminal, or unparsable output.
			pass
		return None

	max_width = 80
	max_help_position = 80

	# No need to wrap help messages if we're on a wide console
	columns = _find_term_columns()
	if columns: max_width = columns

	fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
	fmt.format_option_strings = _format_option_string

	kw = {
		'version'   : __version__,
		'formatter' : fmt,
		'usage' : '%prog [options] url...',
		'conflict_handler' : 'resolve',
	}

	parser = optparse.OptionParser(**kw)

	# option groups
	general        = optparse.OptionGroup(parser, 'General Options')
	authentication = optparse.OptionGroup(parser, 'Authentication Options')
	video_format   = optparse.OptionGroup(parser, 'Video Format Options')
	postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
	filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
	verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

	general.add_option('-h', '--help',
			action='help', help='print this help text and exit')
	general.add_option('-v', '--version',
			action='version', help='print program version and exit')
	general.add_option('-U', '--update',
			action='store_true', dest='update_self', help='update this program to latest stable version')
	general.add_option('-i', '--ignore-errors',
			action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
	general.add_option('-r', '--rate-limit',
			dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
	general.add_option('-R', '--retries',
			dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
	general.add_option('--playlist-start',
			dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
	general.add_option('--playlist-end',
			dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
	general.add_option('--dump-user-agent',
			action='store_true', dest='dump_user_agent',
			help='display the current browser identification', default=False)

	authentication.add_option('-u', '--username',
			dest='username', metavar='USERNAME', help='account username')
	authentication.add_option('-p', '--password',
			dest='password', metavar='PASSWORD', help='account password')
	authentication.add_option('-n', '--netrc',
			action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)


	video_format.add_option('-f', '--format',
			action='store', dest='format', metavar='FORMAT', help='video format code')
	video_format.add_option('--all-formats',
			action='store_const', dest='format', help='download all available video formats', const='-1')
	video_format.add_option('--max-quality',
			action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')


	verbosity.add_option('-q', '--quiet',
			action='store_true', dest='quiet', help='activates quiet mode', default=False)
	verbosity.add_option('-s', '--simulate',
			action='store_true', dest='simulate', help='do not download video', default=False)
	verbosity.add_option('-g', '--get-url',
			action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
	verbosity.add_option('-e', '--get-title',
			action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
	verbosity.add_option('--get-thumbnail',
			action='store_true', dest='getthumbnail',
			help='simulate, quiet but print thumbnail URL', default=False)
	verbosity.add_option('--get-description',
			action='store_true', dest='getdescription',
			help='simulate, quiet but print video description', default=False)
	verbosity.add_option('--get-filename',
			action='store_true', dest='getfilename',
			help='simulate, quiet but print output filename', default=False)
	verbosity.add_option('--no-progress',
			action='store_true', dest='noprogress', help='do not print progress bar', default=False)
	verbosity.add_option('--console-title',
			action='store_true', dest='consoletitle',
			help='display progress in console titlebar', default=False)


	filesystem.add_option('-t', '--title',
			action='store_true', dest='usetitle', help='use title in file name', default=False)
	filesystem.add_option('-l', '--literal',
			action='store_true', dest='useliteral', help='use literal title in file name', default=False)
	filesystem.add_option('-A', '--auto-number',
			action='store_true', dest='autonumber',
			help='number downloaded files starting from 00000', default=False)
	filesystem.add_option('-o', '--output',
			dest='outtmpl', metavar='TEMPLATE', help='output filename template')
	filesystem.add_option('-a', '--batch-file',
			dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
	filesystem.add_option('-w', '--no-overwrites',
			action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
	filesystem.add_option('-c', '--continue',
			action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
	filesystem.add_option('--cookies',
			dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
	filesystem.add_option('--no-part',
			action='store_true', dest='nopart', help='do not use .part files', default=False)
	filesystem.add_option('--no-mtime',
			action='store_false', dest='updatetime',
			help='do not use the Last-modified header to set the file modification time', default=True)
	filesystem.add_option('--write-description',
			action='store_true', dest='writedescription',
			help='write video description to a .description file', default=False)
	filesystem.add_option('--write-info-json',
			action='store_true', dest='writeinfojson',
			help='write video metadata to a .info.json file', default=False)


	postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
			help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
	postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
			help='"best", "aac" or "mp3"; best by default')


	parser.add_option_group(general)
	parser.add_option_group(filesystem)
	parser.add_option_group(verbosity)
	parser.add_option_group(video_format)
	parser.add_option_group(authentication)
	parser.add_option_group(postproc)

	opts, args = parser.parse_args()

	return parser, opts, args
3140
3141 def main():
3142         parser, opts, args = parseOpts()
3143
3144         # Open appropriate CookieJar
3145         if opts.cookiefile is None:
3146                 jar = cookielib.CookieJar()
3147         else:
3148                 try:
3149                         jar = cookielib.MozillaCookieJar(opts.cookiefile)
3150                         if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3151                                 jar.load()
3152                 except (IOError, OSError), err:
3153                         sys.exit(u'ERROR: unable to open cookie file')
3154
3155         # Dump user agent
3156         if opts.dump_user_agent:
3157                 print std_headers['User-Agent']
3158                 sys.exit(0)
3159
3160         # General configuration
3161         cookie_processor = urllib2.HTTPCookieProcessor(jar)
3162         urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
3163         socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3164
3165         # Batch file verification
3166         batchurls = []
3167         if opts.batchfile is not None:
3168                 try:
3169                         if opts.batchfile == '-':
3170                                 batchfd = sys.stdin
3171                         else:
3172                                 batchfd = open(opts.batchfile, 'r')
3173                         batchurls = batchfd.readlines()
3174                         batchurls = [x.strip() for x in batchurls]
3175                         batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3176                 except IOError:
3177                         sys.exit(u'ERROR: batch file could not be read')
3178         all_urls = batchurls + args
3179
3180         # Conflicting, missing and erroneous options
3181         if opts.usenetrc and (opts.username is not None or opts.password is not None):
3182                 parser.error(u'using .netrc conflicts with giving username/password')
3183         if opts.password is not None and opts.username is None:
3184                 parser.error(u'account username missing')
3185         if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3186                 parser.error(u'using output template conflicts with using title, literal title or auto number')
3187         if opts.usetitle and opts.useliteral:
3188                 parser.error(u'using title conflicts with using literal title')
3189         if opts.username is not None and opts.password is None:
3190                 opts.password = getpass.getpass(u'Type account password and press return:')
3191         if opts.ratelimit is not None:
3192                 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3193                 if numeric_limit is None:
3194                         parser.error(u'invalid rate limit specified')
3195                 opts.ratelimit = numeric_limit
3196         if opts.retries is not None:
3197                 try:
3198                         opts.retries = long(opts.retries)
3199                 except (TypeError, ValueError), err:
3200                         parser.error(u'invalid retry count specified')
3201         try:
3202                 opts.playliststart = int(opts.playliststart)
3203                 if opts.playliststart <= 0:
3204                         raise ValueError(u'Playlist start must be positive')
3205         except (TypeError, ValueError), err:
3206                 parser.error(u'invalid playlist start number specified')
3207         try:
3208                 opts.playlistend = int(opts.playlistend)
3209                 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3210                         raise ValueError(u'Playlist end must be greater than playlist start')
3211         except (TypeError, ValueError), err:
3212                 parser.error(u'invalid playlist end number specified')
3213         if opts.extractaudio:
3214                 if opts.audioformat not in ['best', 'aac', 'mp3']:
3215                         parser.error(u'invalid audio format specified')
3216
3217         # Information extractors
3218         youtube_ie = YoutubeIE()
3219         metacafe_ie = MetacafeIE(youtube_ie)
3220         dailymotion_ie = DailymotionIE()
3221         youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
3222         youtube_user_ie = YoutubeUserIE(youtube_ie)
3223         youtube_search_ie = YoutubeSearchIE(youtube_ie)
3224         google_ie = GoogleIE()
3225         google_search_ie = GoogleSearchIE(google_ie)
3226         photobucket_ie = PhotobucketIE()
3227         yahoo_ie = YahooIE()
3228         yahoo_search_ie = YahooSearchIE(yahoo_ie)
3229         deposit_files_ie = DepositFilesIE()
3230         facebook_ie = FacebookIE()
3231         bliptv_ie = BlipTVIE()
3232         generic_ie = GenericIE()
3233
3234         # File downloader
3235         fd = FileDownloader({
3236                 'usenetrc': opts.usenetrc,
3237                 'username': opts.username,
3238                 'password': opts.password,
3239                 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3240                 'forceurl': opts.geturl,
3241                 'forcetitle': opts.gettitle,
3242                 'forcethumbnail': opts.getthumbnail,
3243                 'forcedescription': opts.getdescription,
3244                 'forcefilename': opts.getfilename,
3245                 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3246                 'format': opts.format,
3247                 'format_limit': opts.format_limit,
3248                 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3249                         or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3250                         or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3251                         or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3252                         or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3253                         or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3254                         or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3255                         or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3256                         or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3257                         or u'%(id)s.%(ext)s'),
3258                 'ignoreerrors': opts.ignoreerrors,
3259                 'ratelimit': opts.ratelimit,
3260                 'nooverwrites': opts.nooverwrites,
3261                 'retries': opts.retries,
3262                 'continuedl': opts.continue_dl,
3263                 'noprogress': opts.noprogress,
3264                 'playliststart': opts.playliststart,
3265                 'playlistend': opts.playlistend,
3266                 'logtostderr': opts.outtmpl == '-',
3267                 'consoletitle': opts.consoletitle,
3268                 'nopart': opts.nopart,
3269                 'updatetime': opts.updatetime,
3270                 'writedescription': opts.writedescription,
3271                 'writeinfojson': opts.writeinfojson,
3272                 })
3273         fd.add_info_extractor(youtube_search_ie)
3274         fd.add_info_extractor(youtube_pl_ie)
3275         fd.add_info_extractor(youtube_user_ie)
3276         fd.add_info_extractor(metacafe_ie)
3277         fd.add_info_extractor(dailymotion_ie)
3278         fd.add_info_extractor(youtube_ie)
3279         fd.add_info_extractor(google_ie)
3280         fd.add_info_extractor(google_search_ie)
3281         fd.add_info_extractor(photobucket_ie)
3282         fd.add_info_extractor(yahoo_ie)
3283         fd.add_info_extractor(yahoo_search_ie)
3284         fd.add_info_extractor(deposit_files_ie)
3285         fd.add_info_extractor(facebook_ie)
3286         fd.add_info_extractor(bliptv_ie)
3287
3288         # This must come last since it's the
3289         # fallback if none of the others work
3290         fd.add_info_extractor(generic_ie)
3291
3292         # PostProcessors
3293         if opts.extractaudio:
3294                 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
3295
3296         # Update version
3297         if opts.update_self:
3298                 updateSelf(fd, sys.argv[0])
3299
3300         # Maybe do nothing
3301         if len(all_urls) < 1:
3302                 if not opts.update_self:
3303                         parser.error(u'you must provide at least one URL')
3304                 else:
3305                         sys.exit()
3306         retcode = fd.download(all_urls)
3307
3308         # Dump cookie jar if requested
3309         if opts.cookiefile is not None:
3310                 try:
3311                         jar.save()
3312                 except (IOError, OSError), err:
3313                         sys.exit(u'ERROR: unable to save cookie jar')
3314
3315         sys.exit(retcode)
3316
3317
if __name__ == '__main__':
	# Run the tool and map the known library exceptions onto clean
	# process exits instead of tracebacks. The exception types are
	# disjoint, so clause order does not affect behavior.
	try:
		main()
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except DownloadError:
		sys.exit(1)
3327
3328 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: