Fix Python 2.4 compatibility
[youtube-dl.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # Author: Vasyl' Vavrychuk
7 # Author: Witold Baryluk
# Author: Paweł Paprota
9 # Author: Gergely Imreh
10 # Author: Philipp Hagemeister <phihag@phihag.de>
11 # License: Public domain code
12 import cookielib
13 import datetime
14 import gzip
15 import htmlentitydefs
16 import httplib
17 import locale
18 import math
19 import netrc
20 import os
21 import os.path
22 import re
23 import socket
24 import string
25 import subprocess
26 import sys
27 import time
28 import urllib
29 import urllib2
30 import warnings
31 import zlib
32
33 if os.name == 'nt':
34         import ctypes
35
36 try:
37         import email.utils
38 except ImportError: # Python 2.4
39         import email.Utils
40 try:
41         import cStringIO as StringIO
42 except ImportError:
43         import StringIO
44
45 # parse_qs was moved from the cgi module to the urlparse module recently.
46 try:
47         from urlparse import parse_qs
48 except ImportError:
49         from cgi import parse_qs
50
51 try:
52         import lxml.etree
53 except ImportError: # Python < 2.6
54         pass # Handled below
55
# Default HTTP headers sent with every request. The User-Agent mimics a
# desktop Firefox so sites serve the same content they would to a real
# browser; YoutubeDLHandler.http_request installs these on each request.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:2.0b11) Gecko/20100101 Firefox/4.0b11',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	'Accept-Encoding': 'gzip, deflate',
	'Accept-Language': 'en-us,en;q=0.5',
}
63
64 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
65
# json only exists from Python 2.6 on; for older interpreters fall back to
# trivialjson, a minimal pure-Python parser exposing the same json.loads()
# entry point used by the rest of this program.
try:
	import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
	import re
	class json(object):
		"""Minimal stand-in for the stdlib json module (loads only)."""
		@staticmethod
		def loads(s):
			"""Parse a UTF-8 encoded JSON document and return its value."""
			s = s.decode('UTF-8')
			def raiseError(msg, i):
				# Include the position and remaining input for debugging.
				raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
			def skipSpace(i, expectMore=True):
				# Advance i past JSON whitespace; optionally require that
				# more input follows.
				while i < len(s) and s[i] in ' \t\r\n':
					i += 1
				if expectMore:
					if i >= len(s):
						raiseError('Premature end', i)
				return i
			def decodeEscape(match):
				# Translate one backslash escape (including \uXXXX and
				# UTF-16 surrogate pairs) into its character.
				esc = match.group(1)
				_STATIC = {
					'"': '"',
					'\\': '\\',
					'/': '/',
					'b': unichr(0x8),
					'f': unichr(0xc),
					'n': '\n',
					'r': '\r',
					't': '\t',
				}
				if esc in _STATIC:
					return _STATIC[esc]
				if esc[0] == 'u':
					if len(esc) == 1+4:
						return unichr(int(esc[1:5], 16))
					if len(esc) == 5+6 and esc[5:7] == '\\u':
						# Surrogate pair -> single code point.
						hi = int(esc[1:5], 16)
						low = int(esc[7:11], 16)
						return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
				raise ValueError('Unknown escape ' + str(esc))
			def parseString(i):
				# i points at the opening quote; find the matching closing
				# quote, skipping over backslash-escaped quotes.
				i += 1
				e = i
				while True:
					e = s.index('"', e)
					bslashes = 0
					while s[e-bslashes-1] == '\\':
						bslashes += 1
					if bslashes % 2 == 1:
						e += 1
						continue
					break
				rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
				stri = rexp.sub(decodeEscape, s[i:e])
				return (e+1,stri)
			def parseObj(i):
				# i points at '{'; returns (index past '}', dict).
				i += 1
				res = {}
				i = skipSpace(i)
				if s[i] == '}': # Empty dictionary
					return (i+1,res)
				while True:
					if s[i] != '"':
						raiseError('Expected a string object key', i)
					i,key = parseString(i)
					i = skipSpace(i)
					if i >= len(s) or s[i] != ':':
						raiseError('Expected a colon', i)
					i,val = parse(i+1)
					res[key] = val
					i = skipSpace(i)
					if s[i] == '}':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected comma or closing curly brace', i)
					i = skipSpace(i+1)
			def parseArray(i):
				# i points at '['; returns (index past ']', list).
				res = []
				i = skipSpace(i+1)
				if s[i] == ']': # Empty array
					return (i+1,res)
				while True:
					i,val = parse(i)
					res.append(val)
					i = skipSpace(i) # Raise exception if premature end
					if s[i] == ']':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected a comma or closing bracket', i)
					i = skipSpace(i+1)
			def parseDiscrete(i):
				# The three keyword literals: true, false, null.
				for k,v in {'true': True, 'false': False, 'null': None}.items():
					if s.startswith(k, i):
						return (i+len(k), v)
				raiseError('Not a boolean (or null)', i)
			def parseNumber(i):
				mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
				if mobj is None:
					raiseError('Not a number', i)
				nums = mobj.group(1)
				# A fractional or exponent part makes the value a float.
				if '.' in nums or 'e' in nums or 'E' in nums:
					return (i+len(nums), float(nums))
				return (i+len(nums), int(nums))
			CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
			def parse(i):
				# Dispatch on the first significant character; numbers are
				# the fallback since they have no unique lead character.
				i = skipSpace(i)
				i,res = CHARMAP.get(s[i], parseNumber)(i)
				i = skipSpace(i, False)
				return (i,res)
			i,res = parse(0)
			if i < len(s):
				raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
			return res
178
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	try:
		encoding = locale.getpreferredencoding()
		# Verify the locale's answer can actually encode text; some broken
		# locales report an encoding Python does not know about.
		u'TEST'.encode(encoding)
	except:
		encoding = 'UTF-8'
	return encoding
194
195 def htmlentity_transform(matchobj):
196         """Transforms an HTML entity to a Unicode character.
197
198         This function receives a match object and is intended to be used with
199         the re.sub() function.
200         """
201         entity = matchobj.group(1)
202
203         # Known non-numeric HTML entity
204         if entity in htmlentitydefs.name2codepoint:
205                 return unichr(htmlentitydefs.name2codepoint[entity])
206
207         # Unicode character
208         mobj = re.match(ur'(?u)#(x?\d+)', entity)
209         if mobj is not None:
210                 numstr = mobj.group(1)
211                 if numstr.startswith(u'x'):
212                         base = 16
213                         numstr = u'0%s' % numstr
214                 else:
215                         base = 10
216                 return unichr(long(numstr, base))
217
218         # Unknown entity in name, return its literal representation
219         return (u'&%s;' % entity)
220
221 def sanitize_title(utitle):
222         """Sanitizes a video title so it could be used as part of a filename."""
223         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
224         return utitle.replace(unicode(os.sep), u'%')
225
226 def sanitize_open(filename, open_mode):
227         """Try to open the given filename, and slightly tweak it if this fails.
228
229         Attempts to open the given filename. If this fails, it tries to change
230         the filename slightly, step by step, until it's either able to open it
231         or it fails and raises a final exception, like the standard open()
232         function.
233
234         It returns the tuple (stream, definitive_file_name).
235         """
236         try:
237                 if filename == u'-':
238                         if sys.platform == 'win32':
239                                 import msvcrt
240                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
241                         return (sys.stdout, filename)
242                 stream = open(filename, open_mode)
243                 return (stream, filename)
244         except (IOError, OSError), err:
245                 # In case of error, try to remove win32 forbidden chars
246                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
247
248                 # An exception here should be caught in the caller
249                 stream = open(filename, open_mode)
250                 return (stream, filename)
251
def timeconvert(timestr):
	"""Convert RFC 2822 defined time string into system timestamp.

	Returns the POSIX timestamp as a number, or None if the string could
	not be parsed. (Re-indented with tabs to match the rest of the file,
	which this function's original 4-space body did not.)
	"""
	timestamp = None
	timetuple = email.utils.parsedate_tz(timestr)
	if timetuple is not None:
		timestamp = email.utils.mktime_tz(timetuple)
	return timestamp
259
class DownloadError(Exception):
	"""Download Error exception.

	This exception may be thrown by FileDownloader objects if they are not
	configured to continue on errors. They will contain the appropriate
	error message.

	Raised by FileDownloader.trouble() when 'ignoreerrors' is not set.
	"""
	pass
268
class SameFileError(Exception):
	"""Same File exception.

	This exception will be thrown by FileDownloader objects if they detect
	multiple files would have to be downloaded to the same file on disk.
	"""
	pass
276
class PostProcessingError(Exception):
	"""Post Processing exception.

	This exception may be raised by PostProcessor's .run() method to
	indicate an error in the postprocessing task.
	"""
	pass
284
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	This exception will be thrown when a video is requested
	in a format that is not available for that video.
	"""
	pass
292
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	Raised by FileDownloader objects when a downloaded file turns out to
	be smaller than the size the server announced, which indicates the
	connection was probably interrupted.
	"""
	# Both sizes are expressed in bytes.
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		self.downloaded = downloaded
		self.expected = expected
307
class YoutubeDLHandler(urllib2.HTTPHandler):
	"""Handler for HTTP requests and responses.

	This class, when installed with an OpenerDirector, automatically adds
	the standard headers to every HTTP request and handles gzipped and
	deflated responses from web servers. If compression is to be avoided in
	a particular request, the original request in the program code only has
	to include the HTTP header "Youtubedl-No-Compression", which will be
	removed before making the real request.

	Part of this code was copied from:

	  http://techknack.net/python-urllib2-handlers/

	Andrew Rowls, the author of that code, agreed to release it to the
	public domain.
	"""

	@staticmethod
	def deflate(data):
		# Some servers send raw DEFLATE streams without the zlib header;
		# retry without expecting the header if the first attempt fails.
		try:
			return zlib.decompress(data, -zlib.MAX_WBITS)
		except zlib.error:
			return zlib.decompress(data)

	@staticmethod
	def addinfourl_wrapper(stream, headers, url, code):
		# Newer urllib2.addinfourl accepts a 'code' argument (detected via
		# the getcode attribute); on older Pythons set it by hand.
		if hasattr(urllib2.addinfourl, 'getcode'):
			return urllib2.addinfourl(stream, headers, url, code)
		ret = urllib2.addinfourl(stream, headers, url)
		ret.code = code
		return ret

	def http_request(self, req):
		# Install the standard headers, replacing caller-supplied
		# duplicates so they always win.
		for h in std_headers:
			if h in req.headers:
				del req.headers[h]
			req.add_header(h, std_headers[h])
		# Honour the marker header (stored capitalized by urllib2) by
		# stripping Accept-encoding, then remove the marker itself.
		if 'Youtubedl-no-compression' in req.headers:
			if 'Accept-encoding' in req.headers:
				del req.headers['Accept-encoding']
			del req.headers['Youtubedl-no-compression']
		return req

	def http_response(self, req, resp):
		old_resp = resp
		# gzip
		if resp.headers.get('Content-encoding', '') == 'gzip':
			gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		# deflate
		if resp.headers.get('Content-encoding', '') == 'deflate':
			gz = StringIO.StringIO(self.deflate(resp.read()))
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		return resp
365
366 class FileDownloader(object):
367         """File Downloader class.
368
369         File downloader objects are the ones responsible of downloading the
370         actual video file and writing it to disk if the user has requested
371         it, among some other tasks. In most cases there should be one per
372         program. As, given a video URL, the downloader doesn't know how to
373         extract all the needed information, task that InfoExtractors do, it
374         has to pass the URL to one of them.
375
376         For this, file downloader objects have a method that allows
377         InfoExtractors to be registered in a given order. When it is passed
378         a URL, the file downloader handles it to the first InfoExtractor it
379         finds that reports being able to handle it. The InfoExtractor extracts
380         all the information about the video or videos the URL refers to, and
381         asks the FileDownloader to process the video information, possibly
382         downloading the video.
383
384         File downloaders accept a lot of parameters. In order not to saturate
385         the object constructor with arguments, it receives a dictionary of
386         options instead. These options are available through the params
387         attribute for the InfoExtractors to use. The FileDownloader also
388         registers itself as the downloader in charge for the InfoExtractors
389         that are added to it, so this is a "mutual registration".
390
391         Available options:
392
393         username:         Username for authentication purposes.
394         password:         Password for authentication purposes.
395         usenetrc:         Use netrc for authentication instead.
396         quiet:            Do not print messages to stdout.
397         forceurl:         Force printing final URL.
398         forcetitle:       Force printing title.
399         forcethumbnail:   Force printing thumbnail URL.
400         forcedescription: Force printing description.
401         forcefilename:    Force printing final filename.
402         simulate:         Do not download the video files.
403         format:           Video format code.
404         format_limit:     Highest quality format to try.
405         outtmpl:          Template for output names.
406         ignoreerrors:     Do not stop on download errors.
407         ratelimit:        Download speed limit, in bytes/sec.
408         nooverwrites:     Prevent overwriting files.
409         retries:          Number of times to retry for HTTP error 5xx
410         continuedl:       Try to continue downloads if possible.
411         noprogress:       Do not print the progress bar.
412         playliststart:    Playlist item to start at.
413         playlistend:      Playlist item to end at.
414         logtostderr:      Log messages to stderr instead of stdout.
415         consoletitle:     Display progress in console window's titlebar.
416         nopart:           Do not use temporary .part files.
417         updatetime:       Use the Last-modified header to set output file timestamps.
418         writedescription: Write the video description to a .description file
419         writeinfojson:    Write the video description to a .info.json file
420         """
421
422         params = None
423         _ies = []
424         _pps = []
425         _download_retcode = None
426         _num_downloads = None
427         _screen_file = None
428
429         def __init__(self, params):
430                 """Create a FileDownloader object with the given options."""
431                 self._ies = []
432                 self._pps = []
433                 self._download_retcode = 0
434                 self._num_downloads = 0
435                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
436                 self.params = params
437
438         @staticmethod
439         def pmkdir(filename):
440                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
441                 components = filename.split(os.sep)
442                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
443                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
444                 for dir in aggregate:
445                         if not os.path.exists(dir):
446                                 os.mkdir(dir)
447
448         @staticmethod
449         def format_bytes(bytes):
450                 if bytes is None:
451                         return 'N/A'
452                 if type(bytes) is str:
453                         bytes = float(bytes)
454                 if bytes == 0.0:
455                         exponent = 0
456                 else:
457                         exponent = long(math.log(bytes, 1024.0))
458                 suffix = 'bkMGTPEZY'[exponent]
459                 converted = float(bytes) / float(1024**exponent)
460                 return '%.2f%s' % (converted, suffix)
461
462         @staticmethod
463         def calc_percent(byte_counter, data_len):
464                 if data_len is None:
465                         return '---.-%'
466                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
467
468         @staticmethod
469         def calc_eta(start, now, total, current):
470                 if total is None:
471                         return '--:--'
472                 dif = now - start
473                 if current == 0 or dif < 0.001: # One millisecond
474                         return '--:--'
475                 rate = float(current) / dif
476                 eta = long((float(total) - float(current)) / rate)
477                 (eta_mins, eta_secs) = divmod(eta, 60)
478                 if eta_mins > 99:
479                         return '--:--'
480                 return '%02d:%02d' % (eta_mins, eta_secs)
481
482         @staticmethod
483         def calc_speed(start, now, bytes):
484                 dif = now - start
485                 if bytes == 0 or dif < 0.001: # One millisecond
486                         return '%10s' % '---b/s'
487                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
488
489         @staticmethod
490         def best_block_size(elapsed_time, bytes):
491                 new_min = max(bytes / 2.0, 1.0)
492                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
493                 if elapsed_time < 0.001:
494                         return long(new_max)
495                 rate = bytes / elapsed_time
496                 if rate > new_max:
497                         return long(new_max)
498                 if rate < new_min:
499                         return long(new_min)
500                 return long(rate)
501
502         @staticmethod
503         def parse_bytes(bytestr):
504                 """Parse a string indicating a byte quantity into a long integer."""
505                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
506                 if matchobj is None:
507                         return None
508                 number = float(matchobj.group(1))
509                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
510                 return long(round(number * multiplier))
511
	def add_info_extractor(self, ie):
		"""Add an InfoExtractor object to the end of the list.

		Also registers this downloader with the extractor ("mutual
		registration").
		"""
		self._ies.append(ie)
		ie.set_downloader(self)
516
	def add_post_processor(self, pp):
		"""Add a PostProcessor object to the end of the chain.

		Also registers this downloader with the post processor.
		"""
		self._pps.append(pp)
		pp.set_downloader(self)
521
	def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode.

		skip_eol suppresses the trailing newline (used by the progress
		line); encoding errors are re-raised unless
		ignore_encoding_errors is set.
		"""
		try:
			if not self.params.get('quiet', False):
				# The trailing comma on the print statement suppresses its
				# own newline; the terminator makes it explicit instead.
				terminator = [u'\n', u''][skip_eol]
				print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
			# Flushed even in quiet mode so earlier output is not held back.
			self._screen_file.flush()
		except (UnicodeEncodeError), err:
			if not ignore_encoding_errors:
				raise
532
	def to_stderr(self, message):
		"""Print message to stderr, encoded with the locale's preference."""
		print >>sys.stderr, message.encode(preferredencoding())
536
	def to_cons_title(self, message):
		"""Set console/terminal window title to message.

		No-op unless the 'consoletitle' option is set. Uses the Win32 API
		on Windows consoles, an xterm escape sequence elsewhere.
		"""
		if not self.params.get('consoletitle', False):
			return
		if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
			# c_wchar_p() might not be necessary if `message` is
			# already of type unicode()
			ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
		elif 'TERM' in os.environ:
			# OSC 0 escape sequence: set icon name and window title.
			sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
547
548         def fixed_template(self):
549                 """Checks if the output template is fixed."""
550                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
551
552         def trouble(self, message=None):
553                 """Determine action to take when a download problem appears.
554
555                 Depending on if the downloader has been configured to ignore
556                 download errors or not, this method may throw an exception or
557                 not when errors are found, after printing the message.
558                 """
559                 if message is not None:
560                         self.to_stderr(message)
561                 if not self.params.get('ignoreerrors', False):
562                         raise DownloadError(message)
563                 self._download_retcode = 1
564
565         def slow_down(self, start_time, byte_counter):
566                 """Sleep if the download speed is over the rate limit."""
567                 rate_limit = self.params.get('ratelimit', None)
568                 if rate_limit is None or byte_counter == 0:
569                         return
570                 now = time.time()
571                 elapsed = now - start_time
572                 if elapsed <= 0.0:
573                         return
574                 speed = float(byte_counter) / elapsed
575                 if speed > rate_limit:
576                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
577
578         def temp_name(self, filename):
579                 """Returns a temporary filename for the given filename."""
580                 if self.params.get('nopart', False) or filename == u'-' or \
581                                 (os.path.exists(filename) and not os.path.isfile(filename)):
582                         return filename
583                 return filename + u'.part'
584
585         def undo_temp_name(self, filename):
586                 if filename.endswith(u'.part'):
587                         return filename[:-len(u'.part')]
588                 return filename
589
590         def try_rename(self, old_filename, new_filename):
591                 try:
592                         if old_filename == new_filename:
593                                 return
594                         os.rename(old_filename, new_filename)
595                 except (IOError, OSError), err:
596                         self.trouble(u'ERROR: unable to rename file')
597         
598         def try_utime(self, filename, last_modified_hdr):
599                 """Try to set the last-modified time of the given file."""
600                 if last_modified_hdr is None:
601                         return
602                 if not os.path.isfile(filename):
603                         return
604                 timestr = last_modified_hdr
605                 if timestr is None:
606                         return
607                 filetime = timeconvert(timestr)
608                 if filetime is None:
609                         return
610                 try:
611                         os.utime(filename,(time.time(), filetime))
612                 except:
613                         pass
614
	def report_writedescription(self, descfn):
		""" Report that the description file is being written """
		self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
618
	def report_writeinfojson(self, infofn):
		""" Report that the metadata file has been written """
		self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
622
	def report_destination(self, filename):
		"""Report destination filename."""
		# Encoding errors are ignored: the filename is informational only.
		self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
626
	def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
		"""Report download progress.

		The leading \\r rewrites the current line in place; the console
		title gets the same data with padding stripped.
		"""
		if self.params.get('noprogress', False):
			return
		self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
				(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
		self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
				(percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
635
	def report_resuming_byte(self, resume_len):
		"""Report attempt to resume at given byte."""
		self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
639
	def report_retry(self, count, retries):
		"""Report retry in case of HTTP error 5xx"""
		self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
643
	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		try:
			self.to_screen(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			# Fall back to a message without the (unencodable) filename.
			self.to_screen(u'[download] The file has already been downloaded')
650
	def report_unable_to_resume(self):
		"""Report it was impossible to resume download."""
		self.to_screen(u'[download] Unable to resume')
654
	def report_finish(self):
		"""Report download finished."""
		if self.params.get('noprogress', False):
			self.to_screen(u'[download] Download completed')
		else:
			# The progress line was printed with skip_eol; just end it.
			self.to_screen(u'')
661
662         def increment_downloads(self):
663                 """Increment the ordinal that assigns a number to each file."""
664                 self._num_downloads += 1
665
666         def prepare_filename(self, info_dict):
667                 """Generate the output filename."""
668                 try:
669                         template_dict = dict(info_dict)
670                         template_dict['epoch'] = unicode(long(time.time()))
671                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
672                         filename = self.params['outtmpl'] % template_dict
673                         return filename
674                 except (ValueError, KeyError), err:
675                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
676                         return None
677
        def process_info(self, info_dict):
                """Process a single dictionary returned by an InfoExtractor.

                Depending on self.params this prints requested fields (simulate
                mode), writes the .description / .info.json side files, downloads
                the video itself and finally runs the postprocessor chain.
                """
                filename = self.prepare_filename(info_dict)
                # Do nothing else if in simulate mode
                if self.params.get('simulate', False):
                        # Forced printings
                        if self.params.get('forcetitle', False):
                                print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
                        if self.params.get('forceurl', False):
                                print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
                        if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
                                print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
                        if self.params.get('forcedescription', False) and 'description' in info_dict:
                                print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
                        if self.params.get('forcefilename', False) and filename is not None:
                                print filename.encode(preferredencoding(), 'xmlcharrefreplace')

                        return

                # Template expansion failed; prepare_filename already reported it.
                if filename is None:
                        return
                if self.params.get('nooverwrites', False) and os.path.exists(filename):
                        self.to_stderr(u'WARNING: file exists and will be skipped')
                        return

                # Create any missing intermediate directories of the target path.
                try:
                        self.pmkdir(filename)
                except (OSError, IOError), err:
                        self.trouble(u'ERROR: unable to create directories: %s' % str(err))
                        return

                if self.params.get('writedescription', False):
                        # Write the plain-text description next to the video file.
                        try:
                                descfn = filename + '.description'
                                self.report_writedescription(descfn)
                                descfile = open(descfn, 'wb')
                                try:
                                        descfile.write(info_dict['description'].encode('utf-8'))
                                finally:
                                        descfile.close()
                        except (OSError, IOError):
                                self.trouble(u'ERROR: Cannot write description file: %s' % str(descfn))
                                return

                if self.params.get('writeinfojson', False):
                        infofn = filename + '.info.json'
                        self.report_writeinfojson(infofn)
                        try:
                                # Probe for a real JSON encoder: on Python < 2.6 the
                                # trivialjson fallback (see file header) has no dump().
                                json.dump
                        except (NameError,AttributeError):
                                self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
                                return
                        try:
                                infof = open(infofn, 'wb')
                                try:
                                        json.dump(info_dict, infof)
                                finally:
                                        infof.close()
                        except (OSError, IOError):
                                self.trouble(u'ERROR: Cannot write metadata to JSON file: %s' % str(infofn))
                                return

                # Fetch the media itself; OS-level errors become UnavailableVideoError.
                try:
                        success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
                except (OSError, IOError), err:
                        raise UnavailableVideoError
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self.trouble(u'ERROR: unable to download video data: %s' % str(err))
                        return
                except (ContentTooShortError, ), err:
                        self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
                        return

                if success:
                        try:
                                self.post_process(filename, info_dict)
                        except (PostProcessingError), err:
                                self.trouble(u'ERROR: postprocessing: %s' % str(err))
                                return
757
758         def download(self, url_list):
759                 """Download a given list of URLs."""
760                 if len(url_list) > 1 and self.fixed_template():
761                         raise SameFileError(self.params['outtmpl'])
762
763                 for url in url_list:
764                         suitable_found = False
765                         for ie in self._ies:
766                                 # Go to next InfoExtractor if not suitable
767                                 if not ie.suitable(url):
768                                         continue
769
770                                 # Suitable InfoExtractor found
771                                 suitable_found = True
772
773                                 # Extract information from URL and process it
774                                 ie.extract(url)
775
776                                 # Suitable InfoExtractor had been found; go to next URL
777                                 break
778
779                         if not suitable_found:
780                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
781
782                 return self._download_retcode
783
784         def post_process(self, filename, ie_info):
785                 """Run the postprocessing chain on the given file."""
786                 info = dict(ie_info)
787                 info['filepath'] = filename
788                 for pp in self._pps:
789                         info = pp.run(info)
790                         if info is None:
791                                 break
792
793         def _download_with_rtmpdump(self, filename, url, player_url):
794                 self.report_destination(filename)
795                 tmpfilename = self.temp_name(filename)
796
797                 # Check for rtmpdump first
798                 try:
799                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
800                 except (OSError, IOError):
801                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
802                         return False
803
804                 # Download using rtmpdump. rtmpdump returns exit code 2 when
805                 # the connection was interrumpted and resuming appears to be
806                 # possible. This is part of rtmpdump's normal usage, AFAIK.
807                 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
808                 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
809                 while retval == 2 or retval == 1:
810                         prevsize = os.path.getsize(tmpfilename)
811                         self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
812                         time.sleep(5.0) # This seems to be needed
813                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
814                         cursize = os.path.getsize(tmpfilename)
815                         if prevsize == cursize and retval == 1:
816                                 break
817                 if retval == 0:
818                         self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
819                         self.try_rename(tmpfilename, filename)
820                         return True
821                 else:
822                         self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
823                         return False
824
825         def _do_download(self, filename, url, player_url):
826                 # Check file already present
827                 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
828                         self.report_file_already_downloaded(filename)
829                         return True
830
831                 # Attempt to download using rtmpdump
832                 if url.startswith('rtmp'):
833                         return self._download_with_rtmpdump(filename, url, player_url)
834
835                 tmpfilename = self.temp_name(filename)
836                 stream = None
837                 open_mode = 'wb'
838
839                 # Do not include the Accept-Encoding header
840                 headers = {'Youtubedl-no-compression': 'True'}
841                 basic_request = urllib2.Request(url, None, headers)
842                 request = urllib2.Request(url, None, headers)
843
844                 # Establish possible resume length
845                 if os.path.isfile(tmpfilename):
846                         resume_len = os.path.getsize(tmpfilename)
847                 else:
848                         resume_len = 0
849
850                 # Request parameters in case of being able to resume
851                 if self.params.get('continuedl', False) and resume_len != 0:
852                         self.report_resuming_byte(resume_len)
853                         request.add_header('Range','bytes=%d-' % resume_len)
854                         open_mode = 'ab'
855
856                 count = 0
857                 retries = self.params.get('retries', 0)
858                 while count <= retries:
859                         # Establish connection
860                         try:
861                                 data = urllib2.urlopen(request)
862                                 break
863                         except (urllib2.HTTPError, ), err:
864                                 if (err.code < 500 or err.code >= 600) and err.code != 416:
865                                         # Unexpected HTTP error
866                                         raise
867                                 elif err.code == 416:
868                                         # Unable to resume (requested range not satisfiable)
869                                         try:
870                                                 # Open the connection again without the range header
871                                                 data = urllib2.urlopen(basic_request)
872                                                 content_length = data.info()['Content-Length']
873                                         except (urllib2.HTTPError, ), err:
874                                                 if err.code < 500 or err.code >= 600:
875                                                         raise
876                                         else:
877                                                 # Examine the reported length
878                                                 if (content_length is not None and
879                                                     (resume_len - 100 < long(content_length) < resume_len + 100)):
880                                                         # The file had already been fully downloaded.
881                                                         # Explanation to the above condition: in issue #175 it was revealed that
882                                                         # YouTube sometimes adds or removes a few bytes from the end of the file,
883                                                         # changing the file size slightly and causing problems for some users. So
884                                                         # I decided to implement a suggested change and consider the file
885                                                         # completely downloaded if the file size differs less than 100 bytes from
886                                                         # the one in the hard drive.
887                                                         self.report_file_already_downloaded(filename)
888                                                         self.try_rename(tmpfilename, filename)
889                                                         return True
890                                                 else:
891                                                         # The length does not match, we start the download over
892                                                         self.report_unable_to_resume()
893                                                         open_mode = 'wb'
894                                                         break
895                         # Retry
896                         count += 1
897                         if count <= retries:
898                                 self.report_retry(count, retries)
899
900                 if count > retries:
901                         self.trouble(u'ERROR: giving up after %s retries' % retries)
902                         return False
903
904                 data_len = data.info().get('Content-length', None)
905                 if data_len is not None:
906                         data_len = long(data_len) + resume_len
907                 data_len_str = self.format_bytes(data_len)
908                 byte_counter = 0 + resume_len
909                 block_size = 1024
910                 start = time.time()
911                 while True:
912                         # Download and write
913                         before = time.time()
914                         data_block = data.read(block_size)
915                         after = time.time()
916                         if len(data_block) == 0:
917                                 break
918                         byte_counter += len(data_block)
919
920                         # Open file just in time
921                         if stream is None:
922                                 try:
923                                         (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
924                                         filename = self.undo_temp_name(tmpfilename)
925                                         self.report_destination(filename)
926                                 except (OSError, IOError), err:
927                                         self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
928                                         return False
929                         try:
930                                 stream.write(data_block)
931                         except (IOError, OSError), err:
932                                 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
933                                 return False
934                         block_size = self.best_block_size(after - before, len(data_block))
935
936                         # Progress message
937                         percent_str = self.calc_percent(byte_counter, data_len)
938                         eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
939                         speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
940                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
941
942                         # Apply rate limit
943                         self.slow_down(start, byte_counter - resume_len)
944
945                 stream.close()
946                 self.report_finish()
947                 if data_len is not None and byte_counter != data_len:
948                         raise ContentTooShortError(byte_counter, long(data_len))
949                 self.try_rename(tmpfilename, filename)
950
951                 # Update file modification time
952                 if self.params.get('updatetime', True):
953                         self.try_utime(filename, data.info().get('last-modified', None))
954
955                 return True
956
class InfoExtractor(object):
        """Base class for site-specific information extractors.

        An information extractor takes a URL and produces dictionaries
        describing the video(s) it points to: the real media URL, the title
        and so on. Those dictionaries are handed to the FileDownloader,
        which may download the video, print information about it, or both.
        Each dictionary must include the following fields:

        id:             Video identifier.
        url:            Final video URL.
        uploader:       Nickname of the video uploader.
        title:          Literal title.
        stitle:         Simplified title.
        ext:            Video filename extension.
        format:         Video format.
        player_url:     SWF Player URL (may be None).

        Optional fields, consumed only by the forced-printing options (their
        primary purpose is letting youtube-dl act as the backend of a video
        search tool such as youtube2mp3):

        thumbnail:      Full URL to a video thumbnail image.
        description:    One-line video description.

        Subclasses should override _real_initialize(), _real_extract() and
        the suitable() static method, and normally get instantiated and
        registered with the main downloader.
        """

        # Whether _real_initialize() has run for this instance.
        _ready = False
        # The FileDownloader this extractor reports to (may stay None).
        _downloader = None

        def __init__(self, downloader=None):
                """Create the extractor, optionally attaching a downloader."""
                self._ready = False
                self.set_downloader(downloader)

        @staticmethod
        def suitable(url):
                """Return True if this extractor can handle the given URL."""
                return False

        def initialize(self):
                """Run one-time initialization (authentication, etc) at most once."""
                if self._ready:
                        return
                self._real_initialize()
                self._ready = True

        def extract(self, url):
                """Extract URL information and return it as a list of dicts."""
                self.initialize()
                return self._real_extract(url)

        def set_downloader(self, downloader):
                """Attach the FileDownloader this extractor reports to."""
                self._downloader = downloader

        def _real_initialize(self):
                """Real initialization process. Redefine in subclasses."""
                pass

        def _real_extract(self, url):
                """Real extraction process. Redefine in subclasses."""
                pass
1027
1028 class YoutubeIE(InfoExtractor):
1029         """Information extractor for youtube.com."""
1030
1031         _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
1032         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1033         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1034         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1035         _NETRC_MACHINE = 'youtube'
1036         # Listed in order of quality
1037         _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
1038         _video_extensions = {
1039                 '13': '3gp',
1040                 '17': 'mp4',
1041                 '18': 'mp4',
1042                 '22': 'mp4',
1043                 '37': 'mp4',
1044                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1045                 '43': 'webm',
1046                 '45': 'webm',
1047         }
1048
1049         @staticmethod
1050         def suitable(url):
1051                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
1052
1053         def report_lang(self):
1054                 """Report attempt to set language."""
1055                 self._downloader.to_screen(u'[youtube] Setting language')
1056
1057         def report_login(self):
1058                 """Report attempt to log in."""
1059                 self._downloader.to_screen(u'[youtube] Logging in')
1060
1061         def report_age_confirmation(self):
1062                 """Report attempt to confirm age."""
1063                 self._downloader.to_screen(u'[youtube] Confirming age')
1064
1065         def report_video_webpage_download(self, video_id):
1066                 """Report attempt to download video webpage."""
1067                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1068
1069         def report_video_info_webpage_download(self, video_id):
1070                 """Report attempt to download video info webpage."""
1071                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1072
1073         def report_information_extraction(self, video_id):
1074                 """Report attempt to extract video information."""
1075                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1076
1077         def report_unavailable_format(self, video_id, format):
1078                 """Report extracted video URL."""
1079                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1080
1081         def report_rtmp_download(self):
1082                 """Indicate the download will use the RTMP protocol."""
1083                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1084
        def _real_initialize(self):
                """Set the site language and, when credentials exist, log in and confirm age.

                Credentials come from --username/--password or, with --netrc,
                from the 'youtube' machine entry in ~/.netrc. All failures here
                are soft: the method warns (or records an error for the age
                step) and returns, so extraction proceeds unauthenticated.
                """
                if self._downloader is None:
                        return

                username = None
                password = None
                downloader_params = self._downloader.params

                # Attempt to use provided username and password or .netrc data
                if downloader_params.get('username', None) is not None:
                        username = downloader_params['username']
                        password = downloader_params['password']
                elif downloader_params.get('usenetrc', False):
                        try:
                                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                                if info is not None:
                                        username = info[0]
                                        password = info[2]
                                else:
                                        raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
                        except (IOError, netrc.NetrcParseError), err:
                                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                                return

                # Set language to English (the _LANG_URL carries hl=en) so the
                # pages scraped later are in a predictable language
                request = urllib2.Request(self._LANG_URL)
                try:
                        self.report_lang()
                        urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
                        return

                # No authentication to be performed
                if username is None:
                        return

                # Log in
                login_form = {
                                'current_form': 'loginForm',
                                'next':         '/',
                                'action_login': 'Log In',
                                'username':     username,
                                'password':     password,
                                }
                request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
                try:
                        self.report_login()
                        login_results = urllib2.urlopen(request).read()
                        # The login form reappearing in the response means the login failed.
                        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                                return
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
                        return

                # Confirm age
                age_form = {
                                'next_url':             '/',
                                'action_confirm':       'Confirm',
                                }
                request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
                try:
                        self.report_age_confirmation()
                        age_results = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
                        return
1153
1154         def _real_extract(self, url):
1155                 # Extract video id from URL
1156                 mobj = re.match(self._VALID_URL, url)
1157                 if mobj is None:
1158                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1159                         return
1160                 video_id = mobj.group(2)
1161
1162                 # Get video webpage
1163                 self.report_video_webpage_download(video_id)
1164                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id)
1165                 try:
1166                         video_webpage = urllib2.urlopen(request).read()
1167                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1168                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1169                         return
1170
1171                 # Attempt to extract SWF player URL
1172                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1173                 if mobj is not None:
1174                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1175                 else:
1176                         player_url = None
1177
1178                 # Get video info
1179                 self.report_video_info_webpage_download(video_id)
1180                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1181                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1182                                            % (video_id, el_type))
1183                         request = urllib2.Request(video_info_url)
1184                         try:
1185                                 video_info_webpage = urllib2.urlopen(request).read()
1186                                 video_info = parse_qs(video_info_webpage)
1187                                 if 'token' in video_info:
1188                                         break
1189                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1190                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1191                                 return
1192                 if 'token' not in video_info:
1193                         if 'reason' in video_info:
1194                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1195                         else:
1196                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1197                         return
1198
1199                 # Start extracting information
1200                 self.report_information_extraction(video_id)
1201
1202                 # uploader
1203                 if 'author' not in video_info:
1204                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1205                         return
1206                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1207
1208                 # title
1209                 if 'title' not in video_info:
1210                         self._downloader.trouble(u'ERROR: unable to extract video title')
1211                         return
1212                 video_title = urllib.unquote_plus(video_info['title'][0])
1213                 video_title = video_title.decode('utf-8')
1214                 video_title = sanitize_title(video_title)
1215
1216                 # simplified title
1217                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1218                 simple_title = simple_title.strip(ur'_')
1219
1220                 # thumbnail image
1221                 if 'thumbnail_url' not in video_info:
1222                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1223                         video_thumbnail = ''
1224                 else:   # don't panic if we can't find it
1225                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1226
1227                 # upload date
1228                 upload_date = u'NA'
1229                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1230                 if mobj is not None:
1231                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1232                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1233                         for expression in format_expressions:
1234                                 try:
1235                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1236                                 except:
1237                                         pass
1238
1239                 # description
1240                 try:
1241                         lxml.etree
1242                 except NameError:
1243                         video_description = u'No description available.'
1244                         if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1245                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1246                                 if mobj is not None:
1247                                         video_description = mobj.group(1).decode('utf-8')
1248                 else:
1249                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1250                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1251                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1252                         # TODO use another parser
1253
1254                 # token
1255                 video_token = urllib.unquote_plus(video_info['token'][0])
1256
1257                 # Decide which formats to download
1258                 req_format = self._downloader.params.get('format', None)
1259
1260                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1261                         self.report_rtmp_download()
1262                         video_url_list = [(None, video_info['conn'][0])]
1263                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1264                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1265                         url_data = [dict(pairStr.split('=') for pairStr in uds.split('&')) for uds in url_data_strs]
1266                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1267                         url_map = dict((ud['itag'], urllib.unquote(ud['url'])) for ud in url_data)
1268                         
1269                         format_limit = self._downloader.params.get('format_limit', None)
1270                         if format_limit is not None and format_limit in self._available_formats:
1271                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1272                         else:
1273                                 format_list = self._available_formats
1274                         existing_formats = [x for x in format_list if x in url_map]
1275                         if len(existing_formats) == 0:
1276                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1277                                 return
1278                         if req_format is None:
1279                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1280                         elif req_format == '-1':
1281                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1282                         else:
1283                                 # Specific format
1284                                 if req_format not in url_map:
1285                                         self._downloader.trouble(u'ERROR: requested format not available')
1286                                         return
1287                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
1288                 else:
1289                         self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
1290                         return
1291
1292                 for format_param, video_real_url in video_url_list:
1293                         # At this point we have a new video
1294                         self._downloader.increment_downloads()
1295
1296                         # Extension
1297                         video_extension = self._video_extensions.get(format_param, 'flv')
1298
1299                         # Find the video URL in fmt_url_map or conn paramters
1300                         try:
1301                                 # Process video information
1302                                 self._downloader.process_info({
1303                                         'id':           video_id.decode('utf-8'),
1304                                         'url':          video_real_url.decode('utf-8'),
1305                                         'uploader':     video_uploader.decode('utf-8'),
1306                                         'upload_date':  upload_date,
1307                                         'title':        video_title,
1308                                         'stitle':       simple_title,
1309                                         'ext':          video_extension.decode('utf-8'),
1310                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1311                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1312                                         'description':  video_description,
1313                                         'player_url':   player_url,
1314                                 })
1315                         except UnavailableVideoError, err:
1316                                 self._downloader.trouble(u'\nERROR: unable to download video')
1317
1318
1319 class MetacafeIE(InfoExtractor):
1320         """Information Extractor for metacafe.com."""
1321
1322         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1323         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1324         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1325         _youtube_ie = None
1326
1327         def __init__(self, youtube_ie, downloader=None):
1328                 InfoExtractor.__init__(self, downloader)
1329                 self._youtube_ie = youtube_ie
1330
1331         @staticmethod
1332         def suitable(url):
1333                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1334
1335         def report_disclaimer(self):
1336                 """Report disclaimer retrieval."""
1337                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1338
1339         def report_age_confirmation(self):
1340                 """Report attempt to confirm age."""
1341                 self._downloader.to_screen(u'[metacafe] Confirming age')
1342
1343         def report_download_webpage(self, video_id):
1344                 """Report webpage download."""
1345                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1346
1347         def report_extraction(self, video_id):
1348                 """Report information extraction."""
1349                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1350
1351         def _real_initialize(self):
1352                 # Retrieve disclaimer
1353                 request = urllib2.Request(self._DISCLAIMER)
1354                 try:
1355                         self.report_disclaimer()
1356                         disclaimer = urllib2.urlopen(request).read()
1357                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1358                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1359                         return
1360
1361                 # Confirm age
1362                 disclaimer_form = {
1363                         'filters': '0',
1364                         'submit': "Continue - I'm over 18",
1365                         }
1366                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1367                 try:
1368                         self.report_age_confirmation()
1369                         disclaimer = urllib2.urlopen(request).read()
1370                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1371                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1372                         return
1373
1374         def _real_extract(self, url):
1375                 # Extract id and simplified title from URL
1376                 mobj = re.match(self._VALID_URL, url)
1377                 if mobj is None:
1378                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1379                         return
1380
1381                 video_id = mobj.group(1)
1382
1383                 # Check if video comes from YouTube
1384                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1385                 if mobj2 is not None:
1386                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1387                         return
1388
1389                 # At this point we have a new video
1390                 self._downloader.increment_downloads()
1391
1392                 simple_title = mobj.group(2).decode('utf-8')
1393
1394                 # Retrieve video webpage to extract further information
1395                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1396                 try:
1397                         self.report_download_webpage(video_id)
1398                         webpage = urllib2.urlopen(request).read()
1399                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1400                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1401                         return
1402
1403                 # Extract URL, uploader and title from webpage
1404                 self.report_extraction(video_id)
1405                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1406                 if mobj is not None:
1407                         mediaURL = urllib.unquote(mobj.group(1))
1408                         video_extension = mediaURL[-3:]
1409
1410                         # Extract gdaKey if available
1411                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1412                         if mobj is None:
1413                                 video_url = mediaURL
1414                         else:
1415                                 gdaKey = mobj.group(1)
1416                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1417                 else:
1418                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1419                         if mobj is None:
1420                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1421                                 return
1422                         vardict = parse_qs(mobj.group(1))
1423                         if 'mediaData' not in vardict:
1424                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1425                                 return
1426                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1427                         if mobj is None:
1428                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1429                                 return
1430                         mediaURL = mobj.group(1).replace('\\/', '/')
1431                         video_extension = mediaURL[-3:]
1432                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1433
1434                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1435                 if mobj is None:
1436                         self._downloader.trouble(u'ERROR: unable to extract title')
1437                         return
1438                 video_title = mobj.group(1).decode('utf-8')
1439                 video_title = sanitize_title(video_title)
1440
1441                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1442                 if mobj is None:
1443                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1444                         return
1445                 video_uploader = mobj.group(1)
1446
1447                 try:
1448                         # Process video information
1449                         self._downloader.process_info({
1450                                 'id':           video_id.decode('utf-8'),
1451                                 'url':          video_url.decode('utf-8'),
1452                                 'uploader':     video_uploader.decode('utf-8'),
1453                                 'upload_date':  u'NA',
1454                                 'title':        video_title,
1455                                 'stitle':       simple_title,
1456                                 'ext':          video_extension.decode('utf-8'),
1457                                 'format':       u'NA',
1458                                 'player_url':   None,
1459                         })
1460                 except UnavailableVideoError:
1461                         self._downloader.trouble(u'\nERROR: unable to download video')
1462
1463
1464 class DailymotionIE(InfoExtractor):
1465         """Information Extractor for Dailymotion"""
1466
1467         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1468
1469         def __init__(self, downloader=None):
1470                 InfoExtractor.__init__(self, downloader)
1471
1472         @staticmethod
1473         def suitable(url):
1474                 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1475
1476         def report_download_webpage(self, video_id):
1477                 """Report webpage download."""
1478                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1479
1480         def report_extraction(self, video_id):
1481                 """Report information extraction."""
1482                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1483
1484         def _real_initialize(self):
1485                 return
1486
1487         def _real_extract(self, url):
1488                 # Extract id and simplified title from URL
1489                 mobj = re.match(self._VALID_URL, url)
1490                 if mobj is None:
1491                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1492                         return
1493
1494                 # At this point we have a new video
1495                 self._downloader.increment_downloads()
1496                 video_id = mobj.group(1)
1497
1498                 simple_title = mobj.group(2).decode('utf-8')
1499                 video_extension = 'flv'
1500
1501                 # Retrieve video webpage to extract further information
1502                 request = urllib2.Request(url)
1503                 try:
1504                         self.report_download_webpage(video_id)
1505                         webpage = urllib2.urlopen(request).read()
1506                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1507                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1508                         return
1509
1510                 # Extract URL, uploader and title from webpage
1511                 self.report_extraction(video_id)
1512                 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1513                 if mobj is None:
1514                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1515                         return
1516                 mediaURL = urllib.unquote(mobj.group(1))
1517
1518                 # if needed add http://www.dailymotion.com/ if relative URL
1519
1520                 video_url = mediaURL
1521
1522                 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1523                 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1524                 if mobj is None:
1525                         self._downloader.trouble(u'ERROR: unable to extract title')
1526                         return
1527                 video_title = mobj.group(1).decode('utf-8')
1528                 video_title = sanitize_title(video_title)
1529
1530                 mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
1531                 if mobj is None:
1532                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1533                         return
1534                 video_uploader = mobj.group(1)
1535
1536                 try:
1537                         # Process video information
1538                         self._downloader.process_info({
1539                                 'id':           video_id.decode('utf-8'),
1540                                 'url':          video_url.decode('utf-8'),
1541                                 'uploader':     video_uploader.decode('utf-8'),
1542                                 'upload_date':  u'NA',
1543                                 'title':        video_title,
1544                                 'stitle':       simple_title,
1545                                 'ext':          video_extension.decode('utf-8'),
1546                                 'format':       u'NA',
1547                                 'player_url':   None,
1548                         })
1549                 except UnavailableVideoError:
1550                         self._downloader.trouble(u'\nERROR: unable to download video')
1551
class GoogleIE(InfoExtractor):
	"""Information extractor for video.google.com."""

	_VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	@staticmethod
	def suitable(url):
		"""Return True if this extractor can handle the given URL."""
		return (re.match(GoogleIE._VALID_URL, url) is not None)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

	def _real_initialize(self):
		# No login or cookie setup is needed for Google Video
		return

	def _real_extract(self, url):
		"""Extract video information from a video.google.com videoplay URL."""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		video_extension = 'mp4'

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		# Prefer the direct mp4 download URL; fall back to the escaped
		# stream URL embedded in the page's JavaScript (flv in that case).
		mobj = re.search(r"download_url:'([^']+)'", webpage)
		if mobj is None:
			video_extension = 'flv'
			mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1))
		# Undo the page's JavaScript hex escaping of '=' (\x3d) and '&' (\x26)
		mediaURL = mediaURL.replace('\\x3d', '\x3d')
		mediaURL = mediaURL.replace('\\x26', '\x26')

		video_url = mediaURL

		mobj = re.search(r'<title>(.*)</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		# Collapse runs of non-simple characters into underscores
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

		# Extract video description
		mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video description')
			return
		video_description = mobj.group(1).decode('utf-8')
		if not video_description:
			video_description = 'No description available.'

		# Extract video thumbnail; this needs an extra request to the search
		# page, so only do it when the user explicitly asked for thumbnails.
		if self._downloader.params.get('forcethumbnail', False):
			# docid is numeric and may carry a sign; the search uses its absolute value
			request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
			try:
				webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
				return
			mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
				return
			video_thumbnail = mobj.group(1)
		else:	# we need something to pass to process_info
			video_thumbnail = ''


		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	u'NA',
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1661
1662
1663 class PhotobucketIE(InfoExtractor):
1664         """Information extractor for photobucket.com."""
1665
1666         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1667
1668         def __init__(self, downloader=None):
1669                 InfoExtractor.__init__(self, downloader)
1670
1671         @staticmethod
1672         def suitable(url):
1673                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1674
1675         def report_download_webpage(self, video_id):
1676                 """Report webpage download."""
1677                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1678
1679         def report_extraction(self, video_id):
1680                 """Report information extraction."""
1681                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1682
1683         def _real_initialize(self):
1684                 return
1685
1686         def _real_extract(self, url):
1687                 # Extract id from URL
1688                 mobj = re.match(self._VALID_URL, url)
1689                 if mobj is None:
1690                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1691                         return
1692
1693                 # At this point we have a new video
1694                 self._downloader.increment_downloads()
1695                 video_id = mobj.group(1)
1696
1697                 video_extension = 'flv'
1698
1699                 # Retrieve video webpage to extract further information
1700                 request = urllib2.Request(url)
1701                 try:
1702                         self.report_download_webpage(video_id)
1703                         webpage = urllib2.urlopen(request).read()
1704                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1705                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1706                         return
1707
1708                 # Extract URL, uploader, and title from webpage
1709                 self.report_extraction(video_id)
1710                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1711                 if mobj is None:
1712                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1713                         return
1714                 mediaURL = urllib.unquote(mobj.group(1))
1715
1716                 video_url = mediaURL
1717
1718                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1719                 if mobj is None:
1720                         self._downloader.trouble(u'ERROR: unable to extract title')
1721                         return
1722                 video_title = mobj.group(1).decode('utf-8')
1723                 video_title = sanitize_title(video_title)
1724                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1725
1726                 video_uploader = mobj.group(2).decode('utf-8')
1727
1728                 try:
1729                         # Process video information
1730                         self._downloader.process_info({
1731                                 'id':           video_id.decode('utf-8'),
1732                                 'url':          video_url.decode('utf-8'),
1733                                 'uploader':     video_uploader,
1734                                 'upload_date':  u'NA',
1735                                 'title':        video_title,
1736                                 'stitle':       simple_title,
1737                                 'ext':          video_extension.decode('utf-8'),
1738                                 'format':       u'NA',
1739                                 'player_url':   None,
1740                         })
1741                 except UnavailableVideoError:
1742                         self._downloader.trouble(u'\nERROR: unable to download video')
1743
1744
1745 class YahooIE(InfoExtractor):
1746         """Information extractor for video.yahoo.com."""
1747
1748         # _VALID_URL matches all Yahoo! Video URLs
1749         # _VPAGE_URL matches only the extractable '/watch/' URLs
1750         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1751         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1752
1753         def __init__(self, downloader=None):
1754                 InfoExtractor.__init__(self, downloader)
1755
1756         @staticmethod
1757         def suitable(url):
1758                 return (re.match(YahooIE._VALID_URL, url) is not None)
1759
1760         def report_download_webpage(self, video_id):
1761                 """Report webpage download."""
1762                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1763
1764         def report_extraction(self, video_id):
1765                 """Report information extraction."""
1766                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1767
1768         def _real_initialize(self):
1769                 return
1770
1771         def _real_extract(self, url, new_video=True):
1772                 # Extract ID from URL
1773                 mobj = re.match(self._VALID_URL, url)
1774                 if mobj is None:
1775                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1776                         return
1777
1778                 # At this point we have a new video
1779                 self._downloader.increment_downloads()
1780                 video_id = mobj.group(2)
1781                 video_extension = 'flv'
1782
1783                 # Rewrite valid but non-extractable URLs as
1784                 # extractable English language /watch/ URLs
1785                 if re.match(self._VPAGE_URL, url) is None:
1786                         request = urllib2.Request(url)
1787                         try:
1788                                 webpage = urllib2.urlopen(request).read()
1789                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1790                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1791                                 return
1792
1793                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1794                         if mobj is None:
1795                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1796                                 return
1797                         yahoo_id = mobj.group(1)
1798
1799                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1800                         if mobj is None:
1801                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1802                                 return
1803                         yahoo_vid = mobj.group(1)
1804
1805                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1806                         return self._real_extract(url, new_video=False)
1807
1808                 # Retrieve video webpage to extract further information
1809                 request = urllib2.Request(url)
1810                 try:
1811                         self.report_download_webpage(video_id)
1812                         webpage = urllib2.urlopen(request).read()
1813                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1814                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1815                         return
1816
1817                 # Extract uploader and title from webpage
1818                 self.report_extraction(video_id)
1819                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1820                 if mobj is None:
1821                         self._downloader.trouble(u'ERROR: unable to extract video title')
1822                         return
1823                 video_title = mobj.group(1).decode('utf-8')
1824                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1825
1826                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1827                 if mobj is None:
1828                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1829                         return
1830                 video_uploader = mobj.group(1).decode('utf-8')
1831
1832                 # Extract video thumbnail
1833                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1834                 if mobj is None:
1835                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1836                         return
1837                 video_thumbnail = mobj.group(1).decode('utf-8')
1838
1839                 # Extract video description
1840                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1841                 if mobj is None:
1842                         self._downloader.trouble(u'ERROR: unable to extract video description')
1843                         return
1844                 video_description = mobj.group(1).decode('utf-8')
1845                 if not video_description: video_description = 'No description available.'
1846
1847                 # Extract video height and width
1848                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1849                 if mobj is None:
1850                         self._downloader.trouble(u'ERROR: unable to extract video height')
1851                         return
1852                 yv_video_height = mobj.group(1)
1853
1854                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1855                 if mobj is None:
1856                         self._downloader.trouble(u'ERROR: unable to extract video width')
1857                         return
1858                 yv_video_width = mobj.group(1)
1859
1860                 # Retrieve video playlist to extract media URL
1861                 # I'm not completely sure what all these options are, but we
1862                 # seem to need most of them, otherwise the server sends a 401.
1863                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1864                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1865                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1866                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1867                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1868                 try:
1869                         self.report_download_webpage(video_id)
1870                         webpage = urllib2.urlopen(request).read()
1871                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1872                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1873                         return
1874
1875                 # Extract media URL from playlist XML
1876                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1877                 if mobj is None:
1878                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1879                         return
1880                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1881                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1882
1883                 try:
1884                         # Process video information
1885                         self._downloader.process_info({
1886                                 'id':           video_id.decode('utf-8'),
1887                                 'url':          video_url,
1888                                 'uploader':     video_uploader,
1889                                 'upload_date':  u'NA',
1890                                 'title':        video_title,
1891                                 'stitle':       simple_title,
1892                                 'ext':          video_extension.decode('utf-8'),
1893                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1894                                 'description':  video_description,
1895                                 'thumbnail':    video_thumbnail,
1896                                 'description':  video_description,
1897                                 'player_url':   None,
1898                         })
1899                 except UnavailableVideoError:
1900                         self._downloader.trouble(u'\nERROR: unable to download video')
1901
1902
1903 class GenericIE(InfoExtractor):
1904         """Generic last-resort information extractor."""
1905
1906         def __init__(self, downloader=None):
1907                 InfoExtractor.__init__(self, downloader)
1908
1909         @staticmethod
1910         def suitable(url):
1911                 return True
1912
1913         def report_download_webpage(self, video_id):
1914                 """Report webpage download."""
1915                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1916                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1917
1918         def report_extraction(self, video_id):
1919                 """Report information extraction."""
1920                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1921
1922         def _real_initialize(self):
1923                 return
1924
1925         def _real_extract(self, url):
1926                 # At this point we have a new video
1927                 self._downloader.increment_downloads()
1928
1929                 video_id = url.split('/')[-1]
1930                 request = urllib2.Request(url)
1931                 try:
1932                         self.report_download_webpage(video_id)
1933                         webpage = urllib2.urlopen(request).read()
1934                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1935                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1936                         return
1937                 except ValueError, err:
1938                         # since this is the last-resort InfoExtractor, if
1939                         # this error is thrown, it'll be thrown here
1940                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1941                         return
1942
1943                 self.report_extraction(video_id)
1944                 # Start with something easy: JW Player in SWFObject
1945                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1946                 if mobj is None:
1947                         # Broaden the search a little bit
1948                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1949                 if mobj is None:
1950                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1951                         return
1952
1953                 # It's possible that one of the regexes
1954                 # matched, but returned an empty group:
1955                 if mobj.group(1) is None:
1956                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1957                         return
1958
1959                 video_url = urllib.unquote(mobj.group(1))
1960                 video_id  = os.path.basename(video_url)
1961
1962                 # here's a fun little line of code for you:
1963                 video_extension = os.path.splitext(video_id)[1][1:]
1964                 video_id        = os.path.splitext(video_id)[0]
1965
1966                 # it's tempting to parse this further, but you would
1967                 # have to take into account all the variations like
1968                 #   Video Title - Site Name
1969                 #   Site Name | Video Title
1970                 #   Video Title - Tagline | Site Name
1971                 # and so on and so forth; it's just not practical
1972                 mobj = re.search(r'<title>(.*)</title>', webpage)
1973                 if mobj is None:
1974                         self._downloader.trouble(u'ERROR: unable to extract title')
1975                         return
1976                 video_title = mobj.group(1).decode('utf-8')
1977                 video_title = sanitize_title(video_title)
1978                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1979
1980                 # video uploader is domain name
1981                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1982                 if mobj is None:
1983                         self._downloader.trouble(u'ERROR: unable to extract title')
1984                         return
1985                 video_uploader = mobj.group(1).decode('utf-8')
1986
1987                 try:
1988                         # Process video information
1989                         self._downloader.process_info({
1990                                 'id':           video_id.decode('utf-8'),
1991                                 'url':          video_url.decode('utf-8'),
1992                                 'uploader':     video_uploader,
1993                                 'upload_date':  u'NA',
1994                                 'title':        video_title,
1995                                 'stitle':       simple_title,
1996                                 'ext':          video_extension.decode('utf-8'),
1997                                 'format':       u'NA',
1998                                 'player_url':   None,
1999                         })
2000                 except UnavailableVideoError, err:
2001                         self._downloader.trouble(u'\nERROR: unable to download video')
2002
2003
2004 class YoutubeSearchIE(InfoExtractor):
2005         """Information Extractor for YouTube search queries."""
2006         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
2007         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2008         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2009         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2010         _youtube_ie = None
2011         _max_youtube_results = 1000
2012
2013         def __init__(self, youtube_ie, downloader=None):
2014                 InfoExtractor.__init__(self, downloader)
2015                 self._youtube_ie = youtube_ie
2016
2017         @staticmethod
2018         def suitable(url):
2019                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
2020
2021         def report_download_page(self, query, pagenum):
2022                 """Report attempt to download playlist page with given number."""
2023                 query = query.decode(preferredencoding())
2024                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2025
2026         def _real_initialize(self):
2027                 self._youtube_ie.initialize()
2028
2029         def _real_extract(self, query):
2030                 mobj = re.match(self._VALID_QUERY, query)
2031                 if mobj is None:
2032                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2033                         return
2034
2035                 prefix, query = query.split(':')
2036                 prefix = prefix[8:]
2037                 query  = query.encode('utf-8')
2038                 if prefix == '':
2039                         self._download_n_results(query, 1)
2040                         return
2041                 elif prefix == 'all':
2042                         self._download_n_results(query, self._max_youtube_results)
2043                         return
2044                 else:
2045                         try:
2046                                 n = long(prefix)
2047                                 if n <= 0:
2048                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2049                                         return
2050                                 elif n > self._max_youtube_results:
2051                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
2052                                         n = self._max_youtube_results
2053                                 self._download_n_results(query, n)
2054                                 return
2055                         except ValueError: # parsing prefix as integer fails
2056                                 self._download_n_results(query, 1)
2057                                 return
2058
2059         def _download_n_results(self, query, n):
2060                 """Downloads a specified number of results for a query"""
2061
2062                 video_ids = []
2063                 already_seen = set()
2064                 pagenum = 1
2065
2066                 while True:
2067                         self.report_download_page(query, pagenum)
2068                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2069                         request = urllib2.Request(result_url)
2070                         try:
2071                                 page = urllib2.urlopen(request).read()
2072                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2073                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2074                                 return
2075
2076                         # Extract video identifiers
2077                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2078                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2079                                 if video_id not in already_seen:
2080                                         video_ids.append(video_id)
2081                                         already_seen.add(video_id)
2082                                         if len(video_ids) == n:
2083                                                 # Specified n videos reached
2084                                                 for id in video_ids:
2085                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2086                                                 return
2087
2088                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2089                                 for id in video_ids:
2090                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2091                                 return
2092
2093                         pagenum = pagenum + 1
2094
2095 class GoogleSearchIE(InfoExtractor):
2096         """Information Extractor for Google Video search queries."""
2097         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
2098         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2099         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2100         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2101         _google_ie = None
2102         _max_google_results = 1000
2103
2104         def __init__(self, google_ie, downloader=None):
2105                 InfoExtractor.__init__(self, downloader)
2106                 self._google_ie = google_ie
2107
2108         @staticmethod
2109         def suitable(url):
2110                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
2111
2112         def report_download_page(self, query, pagenum):
2113                 """Report attempt to download playlist page with given number."""
2114                 query = query.decode(preferredencoding())
2115                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2116
2117         def _real_initialize(self):
2118                 self._google_ie.initialize()
2119
2120         def _real_extract(self, query):
2121                 mobj = re.match(self._VALID_QUERY, query)
2122                 if mobj is None:
2123                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2124                         return
2125
2126                 prefix, query = query.split(':')
2127                 prefix = prefix[8:]
2128                 query  = query.encode('utf-8')
2129                 if prefix == '':
2130                         self._download_n_results(query, 1)
2131                         return
2132                 elif prefix == 'all':
2133                         self._download_n_results(query, self._max_google_results)
2134                         return
2135                 else:
2136                         try:
2137                                 n = long(prefix)
2138                                 if n <= 0:
2139                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2140                                         return
2141                                 elif n > self._max_google_results:
2142                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
2143                                         n = self._max_google_results
2144                                 self._download_n_results(query, n)
2145                                 return
2146                         except ValueError: # parsing prefix as integer fails
2147                                 self._download_n_results(query, 1)
2148                                 return
2149
2150         def _download_n_results(self, query, n):
2151                 """Downloads a specified number of results for a query"""
2152
2153                 video_ids = []
2154                 already_seen = set()
2155                 pagenum = 1
2156
2157                 while True:
2158                         self.report_download_page(query, pagenum)
2159                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2160                         request = urllib2.Request(result_url)
2161                         try:
2162                                 page = urllib2.urlopen(request).read()
2163                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2164                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2165                                 return
2166
2167                         # Extract video identifiers
2168                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2169                                 video_id = mobj.group(1)
2170                                 if video_id not in already_seen:
2171                                         video_ids.append(video_id)
2172                                         already_seen.add(video_id)
2173                                         if len(video_ids) == n:
2174                                                 # Specified n videos reached
2175                                                 for id in video_ids:
2176                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2177                                                 return
2178
2179                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2180                                 for id in video_ids:
2181                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2182                                 return
2183
2184                         pagenum = pagenum + 1
2185
2186 class YahooSearchIE(InfoExtractor):
2187         """Information Extractor for Yahoo! Video search queries."""
2188         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
2189         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2190         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2191         _MORE_PAGES_INDICATOR = r'\s*Next'
2192         _yahoo_ie = None
2193         _max_yahoo_results = 1000
2194
2195         def __init__(self, yahoo_ie, downloader=None):
2196                 InfoExtractor.__init__(self, downloader)
2197                 self._yahoo_ie = yahoo_ie
2198
2199         @staticmethod
2200         def suitable(url):
2201                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2202
2203         def report_download_page(self, query, pagenum):
2204                 """Report attempt to download playlist page with given number."""
2205                 query = query.decode(preferredencoding())
2206                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2207
2208         def _real_initialize(self):
2209                 self._yahoo_ie.initialize()
2210
2211         def _real_extract(self, query):
2212                 mobj = re.match(self._VALID_QUERY, query)
2213                 if mobj is None:
2214                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2215                         return
2216
2217                 prefix, query = query.split(':')
2218                 prefix = prefix[8:]
2219                 query  = query.encode('utf-8')
2220                 if prefix == '':
2221                         self._download_n_results(query, 1)
2222                         return
2223                 elif prefix == 'all':
2224                         self._download_n_results(query, self._max_yahoo_results)
2225                         return
2226                 else:
2227                         try:
2228                                 n = long(prefix)
2229                                 if n <= 0:
2230                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2231                                         return
2232                                 elif n > self._max_yahoo_results:
2233                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
2234                                         n = self._max_yahoo_results
2235                                 self._download_n_results(query, n)
2236                                 return
2237                         except ValueError: # parsing prefix as integer fails
2238                                 self._download_n_results(query, 1)
2239                                 return
2240
2241         def _download_n_results(self, query, n):
2242                 """Downloads a specified number of results for a query"""
2243
2244                 video_ids = []
2245                 already_seen = set()
2246                 pagenum = 1
2247
2248                 while True:
2249                         self.report_download_page(query, pagenum)
2250                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2251                         request = urllib2.Request(result_url)
2252                         try:
2253                                 page = urllib2.urlopen(request).read()
2254                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2255                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2256                                 return
2257
2258                         # Extract video identifiers
2259                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2260                                 video_id = mobj.group(1)
2261                                 if video_id not in already_seen:
2262                                         video_ids.append(video_id)
2263                                         already_seen.add(video_id)
2264                                         if len(video_ids) == n:
2265                                                 # Specified n videos reached
2266                                                 for id in video_ids:
2267                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2268                                                 return
2269
2270                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2271                                 for id in video_ids:
2272                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2273                                 return
2274
2275                         pagenum = pagenum + 1
2276
2277 class YoutubePlaylistIE(InfoExtractor):
2278         """Information Extractor for YouTube playlists."""
2279
2280         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist)\?.*?(p|a)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2281         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2282         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2283         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2284         _youtube_ie = None
2285
2286         def __init__(self, youtube_ie, downloader=None):
2287                 InfoExtractor.__init__(self, downloader)
2288                 self._youtube_ie = youtube_ie
2289
2290         @staticmethod
2291         def suitable(url):
2292                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2293
2294         def report_download_page(self, playlist_id, pagenum):
2295                 """Report attempt to download playlist page with given number."""
2296                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2297
2298         def _real_initialize(self):
2299                 self._youtube_ie.initialize()
2300
2301         def _real_extract(self, url):
2302                 # Extract playlist id
2303                 mobj = re.match(self._VALID_URL, url)
2304                 if mobj is None:
2305                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2306                         return
2307
2308                 # Single video case
2309                 if mobj.group(3) is not None:
2310                         self._youtube_ie.extract(mobj.group(3))
2311                         return
2312
2313                 # Download playlist pages
2314                 # prefix is 'p' as default for playlists but there are other types that need extra care
2315                 playlist_prefix = mobj.group(1)
2316                 if playlist_prefix == 'a':
2317                         playlist_access = 'artist'
2318                 else:
2319                         playlist_prefix = 'p'
2320                         playlist_access = 'view_play_list'
2321                 playlist_id = mobj.group(2)
2322                 video_ids = []
2323                 pagenum = 1
2324
2325                 while True:
2326                         self.report_download_page(playlist_id, pagenum)
2327                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2328                         try:
2329                                 page = urllib2.urlopen(request).read()
2330                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2331                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2332                                 return
2333
2334                         # Extract video identifiers
2335                         ids_in_page = []
2336                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2337                                 if mobj.group(1) not in ids_in_page:
2338                                         ids_in_page.append(mobj.group(1))
2339                         video_ids.extend(ids_in_page)
2340
2341                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2342                                 break
2343                         pagenum = pagenum + 1
2344
2345                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2346                 playlistend = self._downloader.params.get('playlistend', -1)
2347                 video_ids = video_ids[playliststart:playlistend]
2348
2349                 for id in video_ids:
2350                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2351                 return
2352
2353 class YoutubeUserIE(InfoExtractor):
2354         """Information Extractor for YouTube users."""
2355
2356         _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2357         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2358         _GDATA_PAGE_SIZE = 50
2359         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2360         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2361         _youtube_ie = None
2362
	def __init__(self, youtube_ie, downloader=None):
		"""Store the YouTube IE that individual video extraction is delegated to."""
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie
2366
2367         @staticmethod
2368         def suitable(url):
2369                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2370
2371         def report_download_page(self, username, start_index):
2372                 """Report attempt to download user page."""
2373                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2374                                            (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2375
	def _real_initialize(self):
		# Initialization (e.g. login) is delegated to the wrapped YouTube IE.
		self._youtube_ie.initialize()
2378
2379         def _real_extract(self, url):
2380                 # Extract username
2381                 mobj = re.match(self._VALID_URL, url)
2382                 if mobj is None:
2383                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2384                         return
2385
2386                 username = mobj.group(1)
2387
2388                 # Download video ids using YouTube Data API. Result size per
2389                 # query is limited (currently to 50 videos) so we need to query
2390                 # page by page until there are no video ids - it means we got
2391                 # all of them.
2392
2393                 video_ids = []
2394                 pagenum = 0
2395
2396                 while True:
2397                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2398                         self.report_download_page(username, start_index)
2399
2400                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2401
2402                         try:
2403                                 page = urllib2.urlopen(request).read()
2404                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2405                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2406                                 return
2407
2408                         # Extract video identifiers
2409                         ids_in_page = []
2410
2411                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2412                                 if mobj.group(1) not in ids_in_page:
2413                                         ids_in_page.append(mobj.group(1))
2414
2415                         video_ids.extend(ids_in_page)
2416
2417                         # A little optimization - if current page is not
2418                         # "full", ie. does not contain PAGE_SIZE video ids then
2419                         # we can assume that this page is the last one - there
2420                         # are no more ids on further pages - no need to query
2421                         # again.
2422
2423                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2424                                 break
2425
2426                         pagenum += 1
2427
2428                 all_ids_count = len(video_ids)
2429                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2430                 playlistend = self._downloader.params.get('playlistend', -1)
2431
2432                 if playlistend == -1:
2433                         video_ids = video_ids[playliststart:]
2434                 else:
2435                         video_ids = video_ids[playliststart:playlistend]
2436                         
2437                 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2438                                            (username, all_ids_count, len(video_ids)))
2439
2440                 for video_id in video_ids:
2441                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2442
2443
2444 class DepositFilesIE(InfoExtractor):
2445         """Information extractor for depositfiles.com"""
2446
2447         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2448
2449         def __init__(self, downloader=None):
2450                 InfoExtractor.__init__(self, downloader)
2451
2452         @staticmethod
2453         def suitable(url):
2454                 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2455
2456         def report_download_webpage(self, file_id):
2457                 """Report webpage download."""
2458                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2459
2460         def report_extraction(self, file_id):
2461                 """Report information extraction."""
2462                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2463
2464         def _real_initialize(self):
2465                 return
2466
2467         def _real_extract(self, url):
2468                 # At this point we have a new file
2469                 self._downloader.increment_downloads()
2470
2471                 file_id = url.split('/')[-1]
2472                 # Rebuild url in english locale
2473                 url = 'http://depositfiles.com/en/files/' + file_id
2474
2475                 # Retrieve file webpage with 'Free download' button pressed
2476                 free_download_indication = { 'gateway_result' : '1' }
2477                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2478                 try:
2479                         self.report_download_webpage(file_id)
2480                         webpage = urllib2.urlopen(request).read()
2481                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2482                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2483                         return
2484
2485                 # Search for the real file URL
2486                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2487                 if (mobj is None) or (mobj.group(1) is None):
2488                         # Try to figure out reason of the error.
2489                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2490                         if (mobj is not None) and (mobj.group(1) is not None):
2491                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2492                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2493                         else:
2494                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2495                         return
2496
2497                 file_url = mobj.group(1)
2498                 file_extension = os.path.splitext(file_url)[1][1:]
2499
2500                 # Search for file title
2501                 mobj = re.search(r'<b title="(.*?)">', webpage)
2502                 if mobj is None:
2503                         self._downloader.trouble(u'ERROR: unable to extract title')
2504                         return
2505                 file_title = mobj.group(1).decode('utf-8')
2506
2507                 try:
2508                         # Process file information
2509                         self._downloader.process_info({
2510                                 'id':           file_id.decode('utf-8'),
2511                                 'url':          file_url.decode('utf-8'),
2512                                 'uploader':     u'NA',
2513                                 'upload_date':  u'NA',
2514                                 'title':        file_title,
2515                                 'stitle':       file_title,
2516                                 'ext':          file_extension.decode('utf-8'),
2517                                 'format':       u'NA',
2518                                 'player_url':   None,
2519                         })
2520                 except UnavailableVideoError, err:
2521                         self._downloader.trouble(u'ERROR: unable to download file')
2522
2523 class FacebookIE(InfoExtractor):
2524         """Information Extractor for Facebook"""
2525
2526         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2527         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2528         _NETRC_MACHINE = 'facebook'
2529         _available_formats = ['highqual', 'lowqual']
2530         _video_extensions = {
2531                 'highqual': 'mp4',
2532                 'lowqual': 'mp4',
2533         }
2534
2535         def __init__(self, downloader=None):
2536                 InfoExtractor.__init__(self, downloader)
2537
2538         @staticmethod
2539         def suitable(url):
2540                 return (re.match(FacebookIE._VALID_URL, url) is not None)
2541
2542         def _reporter(self, message):
2543                 """Add header and report message."""
2544                 self._downloader.to_screen(u'[facebook] %s' % message)
2545
2546         def report_login(self):
2547                 """Report attempt to log in."""
2548                 self._reporter(u'Logging in')
2549
2550         def report_video_webpage_download(self, video_id):
2551                 """Report attempt to download video webpage."""
2552                 self._reporter(u'%s: Downloading video webpage' % video_id)
2553
2554         def report_information_extraction(self, video_id):
2555                 """Report attempt to extract video information."""
2556                 self._reporter(u'%s: Extracting video information' % video_id)
2557
2558         def _parse_page(self, video_webpage):
2559                 """Extract video information from page"""
2560                 # General data
2561                 data = {'title': r'class="video_title datawrap">(.*?)</',
2562                         'description': r'<div class="datawrap">(.*?)</div>',
2563                         'owner': r'\("video_owner_name", "(.*?)"\)',
2564                         'upload_date': r'data-date="(.*?)"',
2565                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2566                         }
2567                 video_info = {}
2568                 for piece in data.keys():
2569                         mobj = re.search(data[piece], video_webpage)
2570                         if mobj is not None:
2571                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2572
2573                 # Video urls
2574                 video_urls = {}
2575                 for fmt in self._available_formats:
2576                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2577                         if mobj is not None:
2578                                 # URL is in a Javascript segment inside an escaped Unicode format within
2579                                 # the generally utf-8 page
2580                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2581                 video_info['video_urls'] = video_urls
2582
2583                 return video_info
2584
2585         def _real_initialize(self):
2586                 if self._downloader is None:
2587                         return
2588
2589                 useremail = None
2590                 password = None
2591                 downloader_params = self._downloader.params
2592
2593                 # Attempt to use provided username and password or .netrc data
2594                 if downloader_params.get('username', None) is not None:
2595                         useremail = downloader_params['username']
2596                         password = downloader_params['password']
2597                 elif downloader_params.get('usenetrc', False):
2598                         try:
2599                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2600                                 if info is not None:
2601                                         useremail = info[0]
2602                                         password = info[2]
2603                                 else:
2604                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2605                         except (IOError, netrc.NetrcParseError), err:
2606                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2607                                 return
2608
2609                 if useremail is None:
2610                         return
2611
2612                 # Log in
2613                 login_form = {
2614                         'email': useremail,
2615                         'pass': password,
2616                         'login': 'Log+In'
2617                         }
2618                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2619                 try:
2620                         self.report_login()
2621                         login_results = urllib2.urlopen(request).read()
2622                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2623                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2624                                 return
2625                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2626                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2627                         return
2628
2629         def _real_extract(self, url):
2630                 mobj = re.match(self._VALID_URL, url)
2631                 if mobj is None:
2632                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2633                         return
2634                 video_id = mobj.group('ID')
2635
2636                 # Get video webpage
2637                 self.report_video_webpage_download(video_id)
2638                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2639                 try:
2640                         page = urllib2.urlopen(request)
2641                         video_webpage = page.read()
2642                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2643                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2644                         return
2645
2646                 # Start extracting information
2647                 self.report_information_extraction(video_id)
2648
2649                 # Extract information
2650                 video_info = self._parse_page(video_webpage)
2651
2652                 # uploader
2653                 if 'owner' not in video_info:
2654                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2655                         return
2656                 video_uploader = video_info['owner']
2657
2658                 # title
2659                 if 'title' not in video_info:
2660                         self._downloader.trouble(u'ERROR: unable to extract video title')
2661                         return
2662                 video_title = video_info['title']
2663                 video_title = video_title.decode('utf-8')
2664                 video_title = sanitize_title(video_title)
2665
2666                 # simplified title
2667                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2668                 simple_title = simple_title.strip(ur'_')
2669
2670                 # thumbnail image
2671                 if 'thumbnail' not in video_info:
2672                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2673                         video_thumbnail = ''
2674                 else:
2675                         video_thumbnail = video_info['thumbnail']
2676
2677                 # upload date
2678                 upload_date = u'NA'
2679                 if 'upload_date' in video_info:
2680                         upload_time = video_info['upload_date']
2681                         timetuple = email.utils.parsedate_tz(upload_time)
2682                         if timetuple is not None:
2683                                 try:
2684                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2685                                 except:
2686                                         pass
2687
2688                 # description
2689                 video_description = video_info.get('description', 'No description available.')
2690
2691                 url_map = video_info['video_urls']
2692                 if len(url_map.keys()) > 0:
2693                         # Decide which formats to download
2694                         req_format = self._downloader.params.get('format', None)
2695                         format_limit = self._downloader.params.get('format_limit', None)
2696
2697                         if format_limit is not None and format_limit in self._available_formats:
2698                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2699                         else:
2700                                 format_list = self._available_formats
2701                         existing_formats = [x for x in format_list if x in url_map]
2702                         if len(existing_formats) == 0:
2703                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2704                                 return
2705                         if req_format is None:
2706                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2707                         elif req_format == '-1':
2708                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2709                         else:
2710                                 # Specific format
2711                                 if req_format not in url_map:
2712                                         self._downloader.trouble(u'ERROR: requested format not available')
2713                                         return
2714                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2715
2716                 for format_param, video_real_url in video_url_list:
2717
2718                         # At this point we have a new video
2719                         self._downloader.increment_downloads()
2720
2721                         # Extension
2722                         video_extension = self._video_extensions.get(format_param, 'mp4')
2723
2724                         # Find the video URL in fmt_url_map or conn paramters
2725                         try:
2726                                 # Process video information
2727                                 self._downloader.process_info({
2728                                         'id':           video_id.decode('utf-8'),
2729                                         'url':          video_real_url.decode('utf-8'),
2730                                         'uploader':     video_uploader.decode('utf-8'),
2731                                         'upload_date':  upload_date,
2732                                         'title':        video_title,
2733                                         'stitle':       simple_title,
2734                                         'ext':          video_extension.decode('utf-8'),
2735                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2736                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2737                                         'description':  video_description.decode('utf-8'),
2738                                         'player_url':   None,
2739                                 })
2740                         except UnavailableVideoError, err:
2741                                 self._downloader.trouble(u'\nERROR: unable to download video')
2742
2743 class BlipTVIE(InfoExtractor):
2744         """Information extractor for blip.tv"""
2745
2746         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip.tv(/.+)$'
2747         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2748
2749         @staticmethod
2750         def suitable(url):
2751                 return (re.match(BlipTVIE._VALID_URL, url) is not None)
2752
2753         def report_extraction(self, file_id):
2754                 """Report information extraction."""
2755                 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2756
2757         def _simplify_title(self, title):
2758                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2759                 res = res.strip(ur'_')
2760                 return res
2761
2762         def _real_extract(self, url):
2763                 mobj = re.match(self._VALID_URL, url)
2764                 if mobj is None:
2765                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2766                         return
2767
2768                 if '?' in url:
2769                         cchar = '&'
2770                 else:
2771                         cchar = '?'
2772                 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2773                 request = urllib2.Request(json_url)
2774                 self.report_extraction(mobj.group(1))
2775                 try:
2776                         json_code = urllib2.urlopen(request).read()
2777                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2778                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2779                         return
2780                 try:
2781                         json_data = json.loads(json_code)
2782                         if 'Post' in json_data:
2783                                 data = json_data['Post']
2784                         else:
2785                                 data = json_data
2786
2787                         upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2788                         video_url = data['media']['url']
2789                         umobj = re.match(self._URL_EXT, video_url)
2790                         if umobj is None:
2791                                 raise ValueError('Can not determine filename extension')
2792                         ext = umobj.group(1)
2793
2794                         self._downloader.increment_downloads()
2795
2796                         info = {
2797                                 'id': data['item_id'],
2798                                 'url': video_url,
2799                                 'uploader': data['display_name'],
2800                                 'upload_date': upload_date,
2801                                 'title': data['title'],
2802                                 'stitle': self._simplify_title(data['title']),
2803                                 'ext': ext,
2804                                 'format': data['media']['mimeType'],
2805                                 'thumbnail': data['thumbnailUrl'],
2806                                 'description': data['description'],
2807                                 'player_url': data['embedUrl']
2808                         }
2809                 except (ValueError,KeyError), err:
2810                         self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2811                         return
2812
2813                 try:
2814                         self._downloader.process_info(info)
2815                 except UnavailableVideoError, err:
2816                         self._downloader.trouble(u'\nERROR: unable to download video')
2817
2818
class PostProcessor(object):
	"""Base class for post processors.

	A PostProcessor is attached to a downloader through the downloader's
	add_post_processor() method. After every successful download the
	downloader walks its chain of post processors, feeding the first one
	an initial information dictionary and each subsequent one whatever
	its predecessor returned.

	The chain stops as soon as a processor returns None, or when its end
	is reached.

	Like InfoExtractor objects, PostProcessor objects take part in a
	"mutual registration" scheme with their downloader.
	"""

	# Downloader this processor is registered with (None until set).
	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach this post processor to the given downloader."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file.

		"information" is a dictionary shaped like the ones produced by
		InfoExtractors, extended with a "filepath" entry pointing at the
		downloaded file on disk.

		Returning None stops the postprocessing chain. Returning an
		information dictionary (possibly the received one with some
		fields changed) passes it on to the next processor in the chain.

		Implementations may also raise PostProcessingError, which is
		handled by the downloader that invoked the chain.
		"""
		return information # the base implementation is a no-op
2864
class FFmpegExtractAudioPP(PostProcessor):
	"""Post processor that extracts the audio track of a downloaded video.

	Uses ffprobe to detect the source audio codec and ffmpeg to remux
	(losslessly, when possible) or re-encode it into a standalone audio
	file, removing the original video file on success.
	"""

	def __init__(self, downloader=None, preferredcodec=None):
		PostProcessor.__init__(self, downloader)
		# 'best' keeps the source codec when it can be copied losslessly.
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec

	@staticmethod
	def get_audio_codec(path):
		"""Return the audio codec name reported by ffprobe, or None."""
		try:
			cmd = ['ffprobe', '-show_streams', '--', path]
			# open() instead of the Python-2-only file() builtin.
			handle = subprocess.Popen(cmd, stderr=open(os.path.devnull, 'w'), stdout=subprocess.PIPE)
			output = handle.communicate()[0]
			if handle.wait() != 0:
				return None
		except (IOError, OSError):
			# ffprobe missing or not executable.
			return None
		audio_codec = None
		for line in output.split('\n'):
			# codec_name appears before codec_type within each stream block.
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Run ffmpeg to extract the audio track; return True on success."""
		try:
			cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
			ret = subprocess.call(cmd, stdout=open(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
			return (ret == 0)
		except (IOError, OSError):
			return False

	def run(self, information):
		"""Replace the downloaded video file with its audio track."""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
			if filecodec == 'aac' or filecodec == 'mp3':
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					more_opts = ['-f', 'adts']
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = ['-ab', '128k']
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = ['-ab', '128k']
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']

		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		status = self.run_ffmpeg(path, new_path, acodec, more_opts)

		if not status:
			self._downloader.to_stderr(u'WARNING: error running ffmpeg')
			return None

		try:
			os.remove(path)
		except (IOError, OSError):
			self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
			return None

		information['filepath'] = new_path
		return information
2946
2947 ### MAIN PROGRAM ###
2948 if __name__ == '__main__':
2949         try:
2950                 # Modules needed only when running the main program
2951                 import getpass
2952                 import optparse
2953
2954                 # Function to update the program file with the latest version from the repository.
		def update_self(downloader, filename):
			"""Overwrite this program file with the latest stable version.

			Downloads LATEST_VERSION from the repository, then fetches
			the matching youtube-dl and writes it over `filename`.
			Exits the process with an error message on any failure.
			"""
			# Note: downloader only used for options
			if not os.access(filename, os.W_OK):
				sys.exit('ERROR: no write permissions on %s' % filename)

			downloader.to_screen('Updating to latest stable version...')
			try:
				# LATEST_VERSION holds the tag name of the newest release.
				latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
				latest_version = urllib.urlopen(latest_url).read().strip()
				prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
				newcontent = urllib.urlopen(prog_url).read()
			except (IOError, OSError), err:
				sys.exit('ERROR: unable to download latest version')
			try:
				stream = open(filename, 'w')
				stream.write(newcontent)
				stream.close()
			except (IOError, OSError), err:
				sys.exit('ERROR: unable to overwrite current version')
			downloader.to_screen('Updated to version %s' % latest_version)
2975
		# Parse command line.
		# optparse (not argparse) is used deliberately: it is available on
		# Python 2.4, which this script still supports.  The order of the
		# add_option()/add_option_group() calls below is user-visible — it
		# determines the layout of the --help output — so do not reorder.
		# conflict_handler='resolve' lets -h/-v be redefined below with
		# custom help strings instead of raising an error.
		parser = optparse.OptionParser(
			usage='Usage: %prog [options] url...',
			version='2011.07.09-phihag',
			conflict_handler='resolve',
		)

		# Top-level (ungrouped) options.
		parser.add_option('-h', '--help',
				action='help', help='print this help text and exit')
		parser.add_option('-v', '--version',
				action='version', help='print program version and exit')
		parser.add_option('-U', '--update',
				action='store_true', dest='update_self', help='update this program to latest stable version')
		parser.add_option('-i', '--ignore-errors',
				action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
		parser.add_option('-r', '--rate-limit',
				dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
		# retries/playliststart/playlistend are validated and converted to
		# integers further below; they arrive here as strings (or the
		# int/long defaults given).
		parser.add_option('-R', '--retries',
				dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
		parser.add_option('--playlist-start',
				dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
		parser.add_option('--playlist-end',
				dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
		parser.add_option('--dump-user-agent',
				action='store_true', dest='dump_user_agent',
				help='display the current browser identification', default=False)

		# Login credentials for sites that need them (e.g. private YouTube
		# videos); -n reads them from ~/.netrc instead.
		authentication = optparse.OptionGroup(parser, 'Authentication Options')
		authentication.add_option('-u', '--username',
				dest='username', metavar='USERNAME', help='account username')
		authentication.add_option('-p', '--password',
				dest='password', metavar='PASSWORD', help='account password')
		authentication.add_option('-n', '--netrc',
				action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
		parser.add_option_group(authentication)

		# Video format selection.  --all-formats reuses dest='format' with
		# the sentinel value '-1', which the output-template logic below
		# also checks for.
		video_format = optparse.OptionGroup(parser, 'Video Format Options')
		video_format.add_option('-f', '--format',
				action='store', dest='format', metavar='FORMAT', help='video format code')
		video_format.add_option('--all-formats',
				action='store_const', dest='format', help='download all available video formats', const='-1')
		video_format.add_option('--max-quality',
				action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
		parser.add_option_group(video_format)

		# Output verbosity and the various "simulate and only print X"
		# modes; any --get-* flag implies quiet+simulate (see the
		# FileDownloader construction below).
		verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
		verbosity.add_option('-q', '--quiet',
				action='store_true', dest='quiet', help='activates quiet mode', default=False)
		verbosity.add_option('-s', '--simulate',
				action='store_true', dest='simulate', help='do not download video', default=False)
		verbosity.add_option('-g', '--get-url',
				action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
		verbosity.add_option('-e', '--get-title',
				action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
		verbosity.add_option('--get-thumbnail',
				action='store_true', dest='getthumbnail',
				help='simulate, quiet but print thumbnail URL', default=False)
		verbosity.add_option('--get-description',
				action='store_true', dest='getdescription',
				help='simulate, quiet but print video description', default=False)
		verbosity.add_option('--get-filename',
				action='store_true', dest='getfilename',
				help='simulate, quiet but print output filename', default=False)
		verbosity.add_option('--no-progress',
				action='store_true', dest='noprogress', help='do not print progress bar', default=False)
		verbosity.add_option('--console-title',
				action='store_true', dest='consoletitle',
				help='display progress in console titlebar', default=False)
		parser.add_option_group(verbosity)

		# Where and how files are written (naming templates, resuming,
		# cookies, sidecar metadata files).
		filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
		filesystem.add_option('-t', '--title',
				action='store_true', dest='usetitle', help='use title in file name', default=False)
		filesystem.add_option('-l', '--literal',
				action='store_true', dest='useliteral', help='use literal title in file name', default=False)
		filesystem.add_option('-A', '--auto-number',
				action='store_true', dest='autonumber',
				help='number downloaded files starting from 00000', default=False)
		filesystem.add_option('-o', '--output',
				dest='outtmpl', metavar='TEMPLATE', help='output filename template')
		filesystem.add_option('-a', '--batch-file',
				dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
		filesystem.add_option('-w', '--no-overwrites',
				action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
		filesystem.add_option('-c', '--continue',
				action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
		filesystem.add_option('--cookies',
				dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
		filesystem.add_option('--no-part',
				action='store_true', dest='nopart', help='do not use .part files', default=False)
		filesystem.add_option('--no-mtime',
				action='store_false', dest='updatetime',
				help='do not use the Last-modified header to set the file modification time', default=True)
		filesystem.add_option('--write-description',
				action='store_true', dest='writedescription',
				help='write video description to a .description file', default=False)
		filesystem.add_option('--write-info-json',
				action='store_true', dest='writeinfojson',
				help='write video metadata to a .info.json file', default=False)
		parser.add_option_group(filesystem)

		# Steps run after a successful download (audio extraction via
		# ffmpeg/ffprobe); --audio-format is validated further below.
		postproc = optparse.OptionGroup(parser, 'Post-processing Options')
		postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
				help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
		postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
				help='"best", "aac" or "mp3"; best by default')
		parser.add_option_group(postproc)

		(opts, args) = parser.parse_args()
3085
3086                 # Open appropriate CookieJar
3087                 if opts.cookiefile is None:
3088                         jar = cookielib.CookieJar()
3089                 else:
3090                         try:
3091                                 jar = cookielib.MozillaCookieJar(opts.cookiefile)
3092                                 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3093                                         jar.load()
3094                         except (IOError, OSError), err:
3095                                 sys.exit(u'ERROR: unable to open cookie file')
3096
3097                 # Dump user agent
3098                 if opts.dump_user_agent:
3099                         print std_headers['User-Agent']
3100                         sys.exit(0)
3101
3102                 # General configuration
3103                 cookie_processor = urllib2.HTTPCookieProcessor(jar)
3104                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
3105                 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3106
3107                 # Batch file verification
3108                 batchurls = []
3109                 if opts.batchfile is not None:
3110                         try:
3111                                 if opts.batchfile == '-':
3112                                         batchfd = sys.stdin
3113                                 else:
3114                                         batchfd = open(opts.batchfile, 'r')
3115                                 batchurls = batchfd.readlines()
3116                                 batchurls = [x.strip() for x in batchurls]
3117                                 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3118                         except IOError:
3119                                 sys.exit(u'ERROR: batch file could not be read')
3120                 all_urls = batchurls + args
3121
		# Conflicting, missing and erroneous options.
		# parser.error() prints the message and exits the process, so each
		# check below is terminal; the order of the checks therefore
		# determines which error the user sees first.
		if opts.usenetrc and (opts.username is not None or opts.password is not None):
			parser.error(u'using .netrc conflicts with giving username/password')
		if opts.password is not None and opts.username is None:
			parser.error(u'account username missing')
		if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
			parser.error(u'using output template conflicts with using title, literal title or auto number')
		if opts.usetitle and opts.useliteral:
			parser.error(u'using title conflicts with using literal title')
		# Username without password: prompt interactively rather than
		# requiring the password on the (world-readable) command line.
		if opts.username is not None and opts.password is None:
			opts.password = getpass.getpass(u'Type account password and press return:')
		# Convert the human-readable rate limit (e.g. '50k') into bytes/s.
		if opts.ratelimit is not None:
			numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
			if numeric_limit is None:
				parser.error(u'invalid rate limit specified')
			opts.ratelimit = numeric_limit
		# long() (Python 2) accepts both the string from the command line
		# and the integer defaults set in the parser above.
		if opts.retries is not None:
			try:
				opts.retries = long(opts.retries)
			except (TypeError, ValueError), err:
				parser.error(u'invalid retry count specified')
		try:
			opts.playliststart = long(opts.playliststart)
			if opts.playliststart <= 0:
				raise ValueError
		except (TypeError, ValueError), err:
			parser.error(u'invalid playlist start number specified')
		try:
			opts.playlistend = long(opts.playlistend)
			# -1 is the sentinel for "until the end of the playlist".
			if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
				raise ValueError
		except (TypeError, ValueError), err:
			parser.error(u'invalid playlist end number specified')
		if opts.extractaudio:
			if opts.audioformat not in ['best', 'aac', 'mp3']:
				parser.error(u'invalid audio format specified')
3158
		# Information extractors.
		# Some extractors delegate to another one for the final video
		# extraction (e.g. playlist/user/search extractors resolve to
		# individual YouTube videos), so they receive that extractor as a
		# constructor argument — keep those dependencies intact.
		youtube_ie = YoutubeIE()
		metacafe_ie = MetacafeIE(youtube_ie)
		dailymotion_ie = DailymotionIE()
		youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
		youtube_user_ie = YoutubeUserIE(youtube_ie)
		youtube_search_ie = YoutubeSearchIE(youtube_ie)
		google_ie = GoogleIE()
		google_search_ie = GoogleSearchIE(google_ie)
		photobucket_ie = PhotobucketIE()
		yahoo_ie = YahooIE()
		yahoo_search_ie = YahooSearchIE(yahoo_ie)
		deposit_files_ie = DepositFilesIE()
		facebook_ie = FacebookIE()
		bliptv_ie = BlipTVIE()
		# Last-resort extractor for URLs no specific extractor claims;
		# registered last, further below.
		generic_ie = GenericIE()
3175
		# File downloader: translate the parsed command-line options into
		# the FileDownloader parameter dictionary.
		fd = FileDownloader({
			'usenetrc': opts.usenetrc,
			'username': opts.username,
			'password': opts.password,
			# Any --get-* flag implies both quiet and simulate mode.
			'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
			'forceurl': opts.geturl,
			'forcetitle': opts.gettitle,
			'forcethumbnail': opts.getthumbnail,
			'forcedescription': opts.getdescription,
			'forcefilename': opts.getfilename,
			'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
			'format': opts.format,
			'format_limit': opts.format_limit,
			# Output template selection.  The chained and/or expressions are
			# the pre-Python-2.5 substitute for conditional expressions
			# (x if c else y): the first truthy alternative wins.  An
			# explicit -o template (decoded to unicode using the locale's
			# preferred encoding) takes precedence; otherwise a template is
			# derived from --all-formats ('-1'), --title/--literal and
			# --auto-number, falling back to '%(id)s.%(ext)s'.
			'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
				or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
				or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
				or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
				or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
				or u'%(id)s.%(ext)s'),
			'ignoreerrors': opts.ignoreerrors,
			'ratelimit': opts.ratelimit,
			'nooverwrites': opts.nooverwrites,
			'retries': opts.retries,
			'continuedl': opts.continue_dl,
			'noprogress': opts.noprogress,
			'playliststart': opts.playliststart,
			'playlistend': opts.playlistend,
			# Writing the video to stdout (-o -) means progress/status must
			# go to stderr instead.
			'logtostderr': opts.outtmpl == '-',
			'consoletitle': opts.consoletitle,
			'nopart': opts.nopart,
			'updatetime': opts.updatetime,
			'writedescription': opts.writedescription,
			'writeinfojson': opts.writeinfojson,
			})
		# Register the information extractors.  Registration order is the
		# order in which URLs are matched, so the more specific extractors
		# (search/playlist/user) must come before the plain youtube_ie.
		fd.add_info_extractor(youtube_search_ie)
		fd.add_info_extractor(youtube_pl_ie)
		fd.add_info_extractor(youtube_user_ie)
		fd.add_info_extractor(metacafe_ie)
		fd.add_info_extractor(dailymotion_ie)
		fd.add_info_extractor(youtube_ie)
		fd.add_info_extractor(google_ie)
		fd.add_info_extractor(google_search_ie)
		fd.add_info_extractor(photobucket_ie)
		fd.add_info_extractor(yahoo_ie)
		fd.add_info_extractor(yahoo_search_ie)
		fd.add_info_extractor(deposit_files_ie)
		fd.add_info_extractor(facebook_ie)
		fd.add_info_extractor(bliptv_ie)

		# This must come last since it's the
		# fallback if none of the others work
		fd.add_info_extractor(generic_ie)

		# PostProcessors: optionally extract an audio track from each
		# downloaded video (requires ffmpeg and ffprobe).
		if opts.extractaudio:
			fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
3237
		# Update version: -U replaces this script in place with the latest
		# released version (sys.argv[0] is the path being run).
		if opts.update_self:
			update_self(fd, sys.argv[0])

		# Maybe do nothing: with no URLs, a bare invocation is an error,
		# but a pure '-U' run is a legitimate no-download invocation.
		if len(all_urls) < 1:
			if not opts.update_self:
				parser.error(u'you must provide at least one URL')
			else:
				sys.exit()
		# download() returns the process exit code (non-zero on failure).
		retcode = fd.download(all_urls)

		# Dump cookie jar if requested
		if opts.cookiefile is not None:
			try:
				jar.save()
			except (IOError, OSError), err:
				sys.exit(u'ERROR: unable to save cookie jar')

		sys.exit(retcode)
3258
	# Top-level handlers for the enclosing try: convert the expected
	# failure modes into clean exit codes/messages instead of tracebacks.
	except DownloadError:
		# Individual errors were already reported by the downloader.
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')