Use parse_qs instead of home-brewed parsing
[youtube-dl.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # Author: Vasyl' Vavrychuk
7 # Author: Witold Baryluk
# Author: Paweł Paprota
9 # Author: Gergely Imreh
10 # Author: Philipp Hagemeister <phihag@phihag.de>
11 # License: Public domain code
12 import cookielib
13 import datetime
14 import gzip
15 import htmlentitydefs
16 import httplib
17 import locale
18 import math
19 import netrc
20 import os
21 import os.path
22 import re
23 import socket
24 import string
25 import subprocess
26 import sys
27 import time
28 import urllib
29 import urllib2
30 import warnings
31 import zlib
32
33 if os.name == 'nt':
34         import ctypes
35
36 try:
37         import email.utils
38 except ImportError: # Python 2.4
39         import email.Utils
40 try:
41         import cStringIO as StringIO
42 except ImportError:
43         import StringIO
44
45 # parse_qs was moved from the cgi module to the urlparse module recently.
46 try:
47         from urlparse import parse_qs
48 except ImportError:
49         from cgi import parse_qs
50
51 try:
52         import lxml.etree
53 except ImportError: # Python < 2.6
54         pass # Handled below
55
# HTTP headers sent with every request; they mimic a desktop Firefox so
# that sites serve the same pages they would to a regular browser.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:2.0b11) Gecko/20100101 Firefox/4.0b11',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	'Accept-Encoding': 'gzip, deflate',
	'Accept-Language': 'en-us,en;q=0.5',
}

# Characters considered safe when building "simple" titles (as unicode).
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
65
# On Python < 2.6 the standard json module does not exist; fall back to
# trivialjson, a minimal pure-Python JSON parser exposing only loads().
try:
	import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
	import re
	class json(object):
		@staticmethod
		def loads(s):
			# Parse the UTF-8 encoded JSON document in s and return the
			# corresponding Python object; raises ValueError on bad input.
			s = s.decode('UTF-8')
			def raiseError(msg, i):
				# Every error reports the offset and the remaining input.
				raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
			def skipSpace(i, expectMore=True):
				while i < len(s) and s[i] in ' \t\r\n':
					i += 1
				if expectMore:
					if i >= len(s):
						raiseError('Premature end', i)
				return i
			def decodeEscape(match):
				# Translate one backslash escape into its character.
				esc = match.group(1)
				_STATIC = {
					'"': '"',
					'\\': '\\',
					'/': '/',
					'b': unichr(0x8),
					'f': unichr(0xc),
					'n': '\n',
					'r': '\r',
					't': '\t',
				}
				if esc in _STATIC:
					return _STATIC[esc]
				if esc[0] == 'u':
					if len(esc) == 1+4:
						return unichr(int(esc[1:5], 16))
					if len(esc) == 5+6 and esc[5:7] == '\\u':
						# UTF-16 surrogate pair written as \uXXXX\uXXXX.
						hi = int(esc[1:5], 16)
						low = int(esc[7:11], 16)
						return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
				raise ValueError('Unknown escape ' + str(esc))
			def parseString(i):
				i += 1
				e = i
				while True:
					e = s.index('"', e)
					# Count the backslashes before the quote: an odd count
					# means the quote itself is escaped, keep scanning.
					bslashes = 0
					while s[e-bslashes-1] == '\\':
						bslashes += 1
					if bslashes % 2 == 1:
						e += 1
						continue
					break
				rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
				stri = rexp.sub(decodeEscape, s[i:e])
				return (e+1,stri)
			def parseObj(i):
				i += 1
				res = {}
				i = skipSpace(i)
				if s[i] == '}': # Empty dictionary
					return (i+1,res)
				while True:
					if s[i] != '"':
						raiseError('Expected a string object key', i)
					i,key = parseString(i)
					i = skipSpace(i)
					if i >= len(s) or s[i] != ':':
						raiseError('Expected a colon', i)
					i,val = parse(i+1)
					res[key] = val
					i = skipSpace(i)
					if s[i] == '}':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected comma or closing curly brace', i)
					i = skipSpace(i+1)
			def parseArray(i):
				res = []
				i = skipSpace(i+1)
				if s[i] == ']': # Empty array
					return (i+1,res)
				while True:
					i,val = parse(i)
					res.append(val)
					i = skipSpace(i) # Raise exception if premature end
					if s[i] == ']':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected a comma or closing bracket', i)
					i = skipSpace(i+1)
			def parseDiscrete(i):
				# The three keyword literals: true, false, null.
				for k,v in {'true': True, 'false': False, 'null': None}.items():
					if s.startswith(k, i):
						return (i+len(k), v)
				raiseError('Not a boolean (or null)', i)
			def parseNumber(i):
				mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
				if mobj is None:
					raiseError('Not a number', i)
				nums = mobj.group(1)
				# A decimal point or exponent makes it a float, else int.
				if '.' in nums or 'e' in nums or 'E' in nums:
					return (i+len(nums), float(nums))
				return (i+len(nums), int(nums))
			CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
			def parse(i):
				# Dispatch on the first non-space character; anything not
				# in CHARMAP is assumed to start a number.
				i = skipSpace(i)
				i,res = CHARMAP.get(s[i], parseNumber)(i)
				i = skipSpace(i, False)
				return (i,res)
			i,res = parse(0)
			if i < len(s):
				raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
			return res
178
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	# The original wrapped this logic in a one-shot generator and used a
	# bare "except:"; a plain try/except with a narrowed handler is
	# equivalent and simpler.
	try:
		pref = locale.getpreferredencoding()
		# Validate the reported encoding before trusting it; some
		# platforms report names the codec machinery rejects.
		u'TEST'.encode(pref)
	except Exception:
		# Fall back to UTF-8 rather than crashing on a broken locale.
		pref = 'UTF-8'
	return pref
194
195 def htmlentity_transform(matchobj):
196         """Transforms an HTML entity to a Unicode character.
197
198         This function receives a match object and is intended to be used with
199         the re.sub() function.
200         """
201         entity = matchobj.group(1)
202
203         # Known non-numeric HTML entity
204         if entity in htmlentitydefs.name2codepoint:
205                 return unichr(htmlentitydefs.name2codepoint[entity])
206
207         # Unicode character
208         mobj = re.match(ur'(?u)#(x?\d+)', entity)
209         if mobj is not None:
210                 numstr = mobj.group(1)
211                 if numstr.startswith(u'x'):
212                         base = 16
213                         numstr = u'0%s' % numstr
214                 else:
215                         base = 10
216                 return unichr(long(numstr, base))
217
218         # Unknown entity in name, return its literal representation
219         return (u'&%s;' % entity)
220
221 def sanitize_title(utitle):
222         """Sanitizes a video title so it could be used as part of a filename."""
223         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
224         return utitle.replace(unicode(os.sep), u'%')
225
226 def sanitize_open(filename, open_mode):
227         """Try to open the given filename, and slightly tweak it if this fails.
228
229         Attempts to open the given filename. If this fails, it tries to change
230         the filename slightly, step by step, until it's either able to open it
231         or it fails and raises a final exception, like the standard open()
232         function.
233
234         It returns the tuple (stream, definitive_file_name).
235         """
236         try:
237                 if filename == u'-':
238                         if sys.platform == 'win32':
239                                 import msvcrt
240                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
241                         return (sys.stdout, filename)
242                 stream = open(filename, open_mode)
243                 return (stream, filename)
244         except (IOError, OSError), err:
245                 # In case of error, try to remove win32 forbidden chars
246                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
247
248                 # An exception here should be caught in the caller
249                 stream = open(filename, open_mode)
250                 return (stream, filename)
251
def timeconvert(timestr):
	"""Convert RFC 2822 defined time string into system timestamp"""
	# Returns None when the string cannot be parsed as an RFC 2822 date.
	timetuple = email.utils.parsedate_tz(timestr)
	if timetuple is None:
		return None
	return email.utils.mktime_tz(timetuple)
259
class DownloadError(Exception):
	"""Download Error exception.

	This exception may be thrown by FileDownloader objects if they are not
	configured to continue on errors. They will contain the appropriate
	error message.
	"""
	# Raised by FileDownloader.trouble() when 'ignoreerrors' is not set.
	pass
268
class SameFileError(Exception):
	"""Same File exception.

	This exception will be thrown by FileDownloader objects if they detect
	multiple files would have to be downloaded to the same file on disk.
	"""
	# Carries no payload; the message alone describes the conflict.
	pass
276
class PostProcessingError(Exception):
	"""Post Processing exception.

	This exception may be raised by PostProcessor's .run() method to
	indicate an error in the postprocessing task.
	"""
	# Carries no payload; the message alone describes the failure.
	pass
284
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	This exception will be thrown when a video is requested
	in a format that is not available for that video.
	"""
	# Carries no payload; the message alone describes the failure.
	pass
292
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	This exception may be raised by FileDownloader objects when a file they
	download is too small for what the server announced first, indicating
	the connection was probably interrupted.
	"""
	# Byte counts: how much actually arrived vs. what the server promised.
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		self.expected = expected
		self.downloaded = downloaded
307
class YoutubeDLHandler(urllib2.HTTPHandler):
	"""Handler for HTTP requests and responses.

	This class, when installed with an OpenerDirector, automatically adds
	the standard headers to every HTTP request and handles gzipped and
	deflated responses from web servers. If compression is to be avoided in
	a particular request, the original request in the program code only has
	to include the HTTP header "Youtubedl-No-Compression", which will be
	removed before making the real request.

	Part of this code was copied from:

	  http://techknack.net/python-urllib2-handlers/

	Andrew Rowls, the author of that code, agreed to release it to the
	public domain.
	"""

	@staticmethod
	def deflate(data):
		# Some servers send raw deflate streams (no zlib header); try
		# that interpretation first, then plain zlib decompression.
		try:
			return zlib.decompress(data, -zlib.MAX_WBITS)
		except zlib.error:
			return zlib.decompress(data)

	@staticmethod
	def addinfourl_wrapper(stream, headers, url, code):
		# Older Pythons' addinfourl does not take a code argument (and
		# has no getcode()); set the attribute by hand there.
		if hasattr(urllib2.addinfourl, 'getcode'):
			return urllib2.addinfourl(stream, headers, url, code)
		ret = urllib2.addinfourl(stream, headers, url)
		ret.code = code
		return ret

	def http_request(self, req):
		# Replace any caller-supplied copies of the standard headers so
		# that every request goes out with a consistent set.
		for h in std_headers:
			if h in req.headers:
				del req.headers[h]
			req.add_header(h, std_headers[h])
		# NOTE(review): the lowercase 'no-compression' spelling appears
		# to match urllib2's header-name capitalization — confirm.
		if 'Youtubedl-no-compression' in req.headers:
			if 'Accept-encoding' in req.headers:
				del req.headers['Accept-encoding']
			del req.headers['Youtubedl-no-compression']
		return req

	def http_response(self, req, resp):
		# Transparently decompress the body, preserving the original
		# response's headers, URL, status code and message.
		old_resp = resp
		# gzip
		if resp.headers.get('Content-encoding', '') == 'gzip':
			gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		# deflate
		if resp.headers.get('Content-encoding', '') == 'deflate':
			gz = StringIO.StringIO(self.deflate(resp.read()))
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		return resp
365
366 class FileDownloader(object):
367         """File Downloader class.
368
369         File downloader objects are the ones responsible of downloading the
370         actual video file and writing it to disk if the user has requested
371         it, among some other tasks. In most cases there should be one per
372         program. As, given a video URL, the downloader doesn't know how to
373         extract all the needed information, task that InfoExtractors do, it
374         has to pass the URL to one of them.
375
376         For this, file downloader objects have a method that allows
377         InfoExtractors to be registered in a given order. When it is passed
378         a URL, the file downloader handles it to the first InfoExtractor it
379         finds that reports being able to handle it. The InfoExtractor extracts
380         all the information about the video or videos the URL refers to, and
381         asks the FileDownloader to process the video information, possibly
382         downloading the video.
383
384         File downloaders accept a lot of parameters. In order not to saturate
385         the object constructor with arguments, it receives a dictionary of
386         options instead. These options are available through the params
387         attribute for the InfoExtractors to use. The FileDownloader also
388         registers itself as the downloader in charge for the InfoExtractors
389         that are added to it, so this is a "mutual registration".
390
391         Available options:
392
393         username:         Username for authentication purposes.
394         password:         Password for authentication purposes.
395         usenetrc:         Use netrc for authentication instead.
396         quiet:            Do not print messages to stdout.
397         forceurl:         Force printing final URL.
398         forcetitle:       Force printing title.
399         forcethumbnail:   Force printing thumbnail URL.
400         forcedescription: Force printing description.
401         forcefilename:    Force printing final filename.
402         simulate:         Do not download the video files.
403         format:           Video format code.
404         format_limit:     Highest quality format to try.
405         outtmpl:          Template for output names.
406         ignoreerrors:     Do not stop on download errors.
407         ratelimit:        Download speed limit, in bytes/sec.
408         nooverwrites:     Prevent overwriting files.
409         retries:          Number of times to retry for HTTP error 5xx
410         continuedl:       Try to continue downloads if possible.
411         noprogress:       Do not print the progress bar.
412         playliststart:    Playlist item to start at.
413         playlistend:      Playlist item to end at.
414         logtostderr:      Log messages to stderr instead of stdout.
415         consoletitle:     Display progress in console window's titlebar.
416         nopart:           Do not use temporary .part files.
417         updatetime:       Use the Last-modified header to set output file timestamps.
418         writedescription: Write the video description to a .description file
419         writeinfojson:    Write the video description to a .info.json file
420         """
421
	# Class-level placeholders; every one of these is reassigned per
	# instance in __init__ (the [] defaults are never shared in practice).
	params = None
	_ies = []
	_pps = []
	_download_retcode = None
	_num_downloads = None
	_screen_file = None
428
429         def __init__(self, params):
430                 """Create a FileDownloader object with the given options."""
431                 self._ies = []
432                 self._pps = []
433                 self._download_retcode = 0
434                 self._num_downloads = 0
435                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
436                 self.params = params
437
438         @staticmethod
439         def pmkdir(filename):
440                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
441                 components = filename.split(os.sep)
442                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
443                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
444                 for dir in aggregate:
445                         if not os.path.exists(dir):
446                                 os.mkdir(dir)
447
448         @staticmethod
449         def format_bytes(bytes):
450                 if bytes is None:
451                         return 'N/A'
452                 if type(bytes) is str:
453                         bytes = float(bytes)
454                 if bytes == 0.0:
455                         exponent = 0
456                 else:
457                         exponent = long(math.log(bytes, 1024.0))
458                 suffix = 'bkMGTPEZY'[exponent]
459                 converted = float(bytes) / float(1024**exponent)
460                 return '%.2f%s' % (converted, suffix)
461
462         @staticmethod
463         def calc_percent(byte_counter, data_len):
464                 if data_len is None:
465                         return '---.-%'
466                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
467
468         @staticmethod
469         def calc_eta(start, now, total, current):
470                 if total is None:
471                         return '--:--'
472                 dif = now - start
473                 if current == 0 or dif < 0.001: # One millisecond
474                         return '--:--'
475                 rate = float(current) / dif
476                 eta = long((float(total) - float(current)) / rate)
477                 (eta_mins, eta_secs) = divmod(eta, 60)
478                 if eta_mins > 99:
479                         return '--:--'
480                 return '%02d:%02d' % (eta_mins, eta_secs)
481
482         @staticmethod
483         def calc_speed(start, now, bytes):
484                 dif = now - start
485                 if bytes == 0 or dif < 0.001: # One millisecond
486                         return '%10s' % '---b/s'
487                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
488
489         @staticmethod
490         def best_block_size(elapsed_time, bytes):
491                 new_min = max(bytes / 2.0, 1.0)
492                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
493                 if elapsed_time < 0.001:
494                         return long(new_max)
495                 rate = bytes / elapsed_time
496                 if rate > new_max:
497                         return long(new_max)
498                 if rate < new_min:
499                         return long(new_min)
500                 return long(rate)
501
502         @staticmethod
503         def parse_bytes(bytestr):
504                 """Parse a string indicating a byte quantity into a long integer."""
505                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
506                 if matchobj is None:
507                         return None
508                 number = float(matchobj.group(1))
509                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
510                 return long(round(number * multiplier))
511
512         def add_info_extractor(self, ie):
513                 """Add an InfoExtractor object to the end of the list."""
514                 self._ies.append(ie)
515                 ie.set_downloader(self)
516
517         def add_post_processor(self, pp):
518                 """Add a PostProcessor object to the end of the chain."""
519                 self._pps.append(pp)
520                 pp.set_downloader(self)
521
	def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode."""
		try:
			if not self.params.get('quiet', False):
				terminator = [u'\n', u''][skip_eol]
				# The trailing comma suppresses print's own newline; the
				# terminator selected above is appended instead.
				print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
			self._screen_file.flush()
		except (UnicodeEncodeError), err:
			# Messages that cannot be encoded are silently dropped only
			# when the caller explicitly asked for that.
			if not ignore_encoding_errors:
				raise
532
	def to_stderr(self, message):
		"""Print message to stderr."""
		# Encode explicitly so unicode messages survive consoles whose
		# encoding differs from the message's characters.
		print >>sys.stderr, message.encode(preferredencoding())
536
	def to_cons_title(self, message):
		"""Set console/terminal window title to message."""
		if not self.params.get('consoletitle', False):
			return
		if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
			# c_wchar_p() might not be necessary if `message` is
			# already of type unicode()
			ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
		elif 'TERM' in os.environ:
			# xterm-compatible OSC escape sequence: ESC ] 0 ; title BEL.
			sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
547
548         def fixed_template(self):
549                 """Checks if the output template is fixed."""
550                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
551
552         def trouble(self, message=None):
553                 """Determine action to take when a download problem appears.
554
555                 Depending on if the downloader has been configured to ignore
556                 download errors or not, this method may throw an exception or
557                 not when errors are found, after printing the message.
558                 """
559                 if message is not None:
560                         self.to_stderr(message)
561                 if not self.params.get('ignoreerrors', False):
562                         raise DownloadError(message)
563                 self._download_retcode = 1
564
565         def slow_down(self, start_time, byte_counter):
566                 """Sleep if the download speed is over the rate limit."""
567                 rate_limit = self.params.get('ratelimit', None)
568                 if rate_limit is None or byte_counter == 0:
569                         return
570                 now = time.time()
571                 elapsed = now - start_time
572                 if elapsed <= 0.0:
573                         return
574                 speed = float(byte_counter) / elapsed
575                 if speed > rate_limit:
576                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
577
578         def temp_name(self, filename):
579                 """Returns a temporary filename for the given filename."""
580                 if self.params.get('nopart', False) or filename == u'-' or \
581                                 (os.path.exists(filename) and not os.path.isfile(filename)):
582                         return filename
583                 return filename + u'.part'
584
585         def undo_temp_name(self, filename):
586                 if filename.endswith(u'.part'):
587                         return filename[:-len(u'.part')]
588                 return filename
589
590         def try_rename(self, old_filename, new_filename):
591                 try:
592                         if old_filename == new_filename:
593                                 return
594                         os.rename(old_filename, new_filename)
595                 except (IOError, OSError), err:
596                         self.trouble(u'ERROR: unable to rename file')
597         
598         def try_utime(self, filename, last_modified_hdr):
599                 """Try to set the last-modified time of the given file."""
600                 if last_modified_hdr is None:
601                         return
602                 if not os.path.isfile(filename):
603                         return
604                 timestr = last_modified_hdr
605                 if timestr is None:
606                         return
607                 filetime = timeconvert(timestr)
608                 if filetime is None:
609                         return
610                 try:
611                         os.utime(filename,(time.time(), filetime))
612                 except:
613                         pass
614
615         def report_writedescription(self, descfn):
616                 """ Report that the description file is being written """
617                 self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
618
619         def report_writeinfojson(self, infofn):
620                 """ Report that the metadata file has been written """
621                 self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
622
623         def report_destination(self, filename):
624                 """Report destination filename."""
625                 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
626
627         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
628                 """Report download progress."""
629                 if self.params.get('noprogress', False):
630                         return
631                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
632                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
633                 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
634                                 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
635
636         def report_resuming_byte(self, resume_len):
637                 """Report attempt to resume at given byte."""
638                 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
639
640         def report_retry(self, count, retries):
641                 """Report retry in case of HTTP error 5xx"""
642                 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
643
	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		try:
			self.to_screen(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			# The file name may not be representable in the console
			# encoding; fall back to a generic message.
			self.to_screen(u'[download] The file has already been downloaded')
650
651         def report_unable_to_resume(self):
652                 """Report it was impossible to resume download."""
653                 self.to_screen(u'[download] Unable to resume')
654
655         def report_finish(self):
656                 """Report download finished."""
657                 if self.params.get('noprogress', False):
658                         self.to_screen(u'[download] Download completed')
659                 else:
660                         self.to_screen(u'')
661
662         def increment_downloads(self):
663                 """Increment the ordinal that assigns a number to each file."""
664                 self._num_downloads += 1
665
666         def prepare_filename(self, info_dict):
667                 """Generate the output filename."""
668                 try:
669                         template_dict = dict(info_dict)
670                         template_dict['epoch'] = unicode(long(time.time()))
671                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
672                         filename = self.params['outtmpl'] % template_dict
673                         return filename
674                 except (ValueError, KeyError), err:
675                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
676                         return None
677
	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor.

		Handles simulate/forced-printing modes, writes the optional
		.description and .info.json sidecar files, downloads the video
		data and runs the postprocessing chain. Errors are reported via
		self.trouble(); only _do_download I/O errors are re-raised (as
		UnavailableVideoError).
		"""
		filename = self.prepare_filename(info_dict)
		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			# Forced printings
			if self.params.get('forcetitle', False):
				print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forceurl', False):
				print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
				print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcedescription', False) and 'description' in info_dict:
				print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcefilename', False) and filename is not None:
				print filename.encode(preferredencoding(), 'xmlcharrefreplace')

			return

		if filename is None:
			return
		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists and will be skipped')
			return

		try:
			self.pmkdir(filename)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to create directories: %s' % str(err))
			return

		if self.params.get('writedescription', False):
			try:
				descfn = filename + '.description'
				self.report_writedescription(descfn)
				descfile = open(descfn, 'wb')
				try:
					descfile.write(info_dict['description'].encode('utf-8'))
				finally:
					descfile.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write description file: %s' % str(descfn))
				return

		if self.params.get('writeinfojson', False):
			infofn = filename + '.info.json'
			self.report_writeinfojson(infofn)
			try:
				# Probe for a JSON *encoder*: the trivialjson fallback
				# used on Python <2.6 only provides loads(), not dump().
				json.dump
			except (NameError,AttributeError):
				self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
				return
			try:
				infof = open(infofn, 'wb')
				try:
					json.dump(info_dict, infof)
				finally:
					infof.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write metadata to JSON file: %s' % str(infofn))
				return

		try:
			success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
		except (OSError, IOError), err:
			# Local I/O problems mean the video data itself is unusable.
			raise UnavailableVideoError
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self.trouble(u'ERROR: unable to download video data: %s' % str(err))
			return
		except (ContentTooShortError, ), err:
			self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
			return

		if success:
			try:
				self.post_process(filename, info_dict)
			except (PostProcessingError), err:
				self.trouble(u'ERROR: postprocessing: %s' % str(err))
				return
757
758         def download(self, url_list):
759                 """Download a given list of URLs."""
760                 if len(url_list) > 1 and self.fixed_template():
761                         raise SameFileError(self.params['outtmpl'])
762
763                 for url in url_list:
764                         suitable_found = False
765                         for ie in self._ies:
766                                 # Go to next InfoExtractor if not suitable
767                                 if not ie.suitable(url):
768                                         continue
769
770                                 # Suitable InfoExtractor found
771                                 suitable_found = True
772
773                                 # Extract information from URL and process it
774                                 ie.extract(url)
775
776                                 # Suitable InfoExtractor had been found; go to next URL
777                                 break
778
779                         if not suitable_found:
780                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
781
782                 return self._download_retcode
783
784         def post_process(self, filename, ie_info):
785                 """Run the postprocessing chain on the given file."""
786                 info = dict(ie_info)
787                 info['filepath'] = filename
788                 for pp in self._pps:
789                         info = pp.run(info)
790                         if info is None:
791                                 break
792
	def _download_with_rtmpdump(self, filename, url, player_url):
		"""Download an rtmp:// URL by driving the external rtmpdump tool.

		Returns True on success, False after reporting a failure.
		"""
		self.report_destination(filename)
		tmpfilename = self.temp_name(filename)

		# Check for rtmpdump first
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrumpted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		# NOTE: [[], opts][cond] selects opts only when cond is true
		# (pre-2.5 substitute for a conditional expression).
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(tmpfilename)
			self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(tmpfilename)
			if prevsize == cursize and retval == 1:
				# No progress on a resume attempt: stop retrying and
				# fall through to the error report below.
				break
		if retval == 0:
			self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
			self.try_rename(tmpfilename, filename)
			return True
		else:
			self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
			return False
824
	def _do_download(self, filename, url, player_url):
		"""Download url to filename over HTTP (or delegate to rtmpdump).

		Returns True on success, False after reporting a failure.
		Raises ContentTooShortError when the server delivered fewer
		bytes than the Content-Length it announced.
		"""
		# Check file already present
		if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
			self.report_file_already_downloaded(filename)
			return True

		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url, player_url)

		tmpfilename = self.temp_name(filename)
		stream = None
		open_mode = 'wb'

		# Do not include the Accept-Encoding header
		headers = {'Youtubedl-no-compression': 'True'}
		basic_request = urllib2.Request(url, None, headers)
		request = urllib2.Request(url, None, headers)

		# Establish possible resume length
		if os.path.isfile(tmpfilename):
			resume_len = os.path.getsize(tmpfilename)
		else:
			resume_len = 0

		# Request parameters in case of being able to resume
		if self.params.get('continuedl', False) and resume_len != 0:
			self.report_resuming_byte(resume_len)
			request.add_header('Range','bytes=%d-' % resume_len)
			open_mode = 'ab'

		count = 0
		retries = self.params.get('retries', 0)
		while count <= retries:
			# Establish connection
			try:
				data = urllib2.urlopen(request)
				break
			except (urllib2.HTTPError, ), err:
				if (err.code < 500 or err.code >= 600) and err.code != 416:
					# Unexpected HTTP error
					raise
				elif err.code == 416:
					# Unable to resume (requested range not satisfiable)
					try:
						# Open the connection again without the range header
						data = urllib2.urlopen(basic_request)
						content_length = data.info()['Content-Length']
					except (urllib2.HTTPError, ), err:
						if err.code < 500 or err.code >= 600:
							raise
					else:
						# Examine the reported length
						if (content_length is not None and
						    (resume_len - 100 < long(content_length) < resume_len + 100)):
							# The file had already been fully downloaded.
							# Explanation to the above condition: in issue #175 it was revealed that
							# YouTube sometimes adds or removes a few bytes from the end of the file,
							# changing the file size slightly and causing problems for some users. So
							# I decided to implement a suggested change and consider the file
							# completely downloaded if the file size differs less than 100 bytes from
							# the one in the hard drive.
							self.report_file_already_downloaded(filename)
							self.try_rename(tmpfilename, filename)
							return True
						else:
							# The length does not match, we start the download over
							self.report_unable_to_resume()
							open_mode = 'wb'
							break
			# Retry
			count += 1
			if count <= retries:
				self.report_retry(count, retries)

		if count > retries:
			self.trouble(u'ERROR: giving up after %s retries' % retries)
			return False

		data_len = data.info().get('Content-length', None)
		if data_len is not None:
			# Total expected size includes the bytes already on disk.
			data_len = long(data_len) + resume_len
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0 + resume_len
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			if len(data_block) == 0:
				break
			byte_counter += len(data_block)

			# Open file just in time
			if stream is None:
				try:
					# sanitize_open may alter the name; map it back to
					# the final (non-temporary) destination name.
					(stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
					filename = self.undo_temp_name(tmpfilename)
					self.report_destination(filename)
				except (OSError, IOError), err:
					self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
					return False
			try:
				stream.write(data_block)
			except (IOError, OSError), err:
				self.trouble(u'\nERROR: unable to write data: %s' % str(err))
				return False
			# Adapt the block size to the observed transfer speed.
			block_size = self.best_block_size(after - before, len(data_block))

			# Progress message
			percent_str = self.calc_percent(byte_counter, data_len)
			eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
			speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
			self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter - resume_len)

		stream.close()
		self.report_finish()
		if data_len is not None and byte_counter != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		self.try_rename(tmpfilename, filename)

		# Update file modification time
		if self.params.get('updatetime', True):
			self.try_utime(filename, data.info().get('last-modified', None))

		return True
956
class InfoExtractor(object):
	"""Base class for all information extractors.

	An information extractor turns a URL into one or more dictionaries
	describing the video(s) it refers to (real video URL, title,
	simplified title, author and so on) and hands them to the
	FileDownloader, which may download the video, print metadata, etc.
	Each dictionary must contain the following fields:

	id:		Video identifier.
	url:		Final video URL.
	uploader:	Nickname of the video uploader.
	title:		Literal title.
	stitle:		Simplified title.
	ext:		Video filename extension.
	format:		Video format.
	player_url:	SWF Player URL (may be None).

	The following fields are optional; they are only used by the
	forced-printing functions (e.g. when youtube-dl serves as the
	backend of a video search tool such as youtube2mp3):

	thumbnail:	Full URL to a video thumbnail image.
	description:	One-line video description.

	Subclasses should redefine _real_initialize(), _real_extract() and
	the suitable() static method, and are normally instantiated and
	registered with the main downloader.
	"""

	# Becomes True once _real_initialize() has run.
	_ready = False
	# The FileDownloader this IE reports to (may stay None).
	_downloader = None

	def __init__(self, downloader=None):
		"""Create the extractor; optionally attach it to a downloader."""
		self._ready = False
		self.set_downloader(downloader)

	@staticmethod
	def suitable(url):
		"""Return True if this IE can handle the given URL."""
		return False

	def set_downloader(self, downloader):
		"""Attach the FileDownloader this IE should report to."""
		self._downloader = downloader

	def initialize(self):
		"""Run one-time setup (authentication, etc.) exactly once."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extract URL information and return it in a list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
1027
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Group 2 of this pattern captures the video id; group 1 (the URL
	# prefix) drives the (?(1)...) conditional at the end.
	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
	# Fetched once to force the site interface to English.
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	# Machine name looked up in ~/.netrc when --netrc is used.
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
	# Maps a format code to the file extension it should be saved with.
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'45': 'webm',
	}
1048
1049         @staticmethod
1050         def suitable(url):
1051                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
1052
1053         def report_lang(self):
1054                 """Report attempt to set language."""
1055                 self._downloader.to_screen(u'[youtube] Setting language')
1056
1057         def report_login(self):
1058                 """Report attempt to log in."""
1059                 self._downloader.to_screen(u'[youtube] Logging in')
1060
1061         def report_age_confirmation(self):
1062                 """Report attempt to confirm age."""
1063                 self._downloader.to_screen(u'[youtube] Confirming age')
1064
1065         def report_video_webpage_download(self, video_id):
1066                 """Report attempt to download video webpage."""
1067                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1068
1069         def report_video_info_webpage_download(self, video_id):
1070                 """Report attempt to download video info webpage."""
1071                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1072
1073         def report_information_extraction(self, video_id):
1074                 """Report attempt to extract video information."""
1075                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1076
1077         def report_unavailable_format(self, video_id, format):
1078                 """Report extracted video URL."""
1079                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1080
1081         def report_rtmp_download(self):
1082                 """Indicate the download will use the RTMP protocol."""
1083                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1084
	def _real_initialize(self):
		"""Set the interface language, then optionally log in and confirm age.

		Credentials come from --username/--password or, with --netrc,
		from the 'youtube' machine entry in ~/.netrc. Failures are
		reported as warnings (errors for age confirmation) and abort
		initialization early.
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':		'/',
				'action_login':	'Log In',
				'username':	username,
				'password':	password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the login form is still present in the response, the
			# credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':		'/',
				'action_confirm':	'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return
1153
1154         def _real_extract(self, url):
1155                 # Extract video id from URL
1156                 mobj = re.match(self._VALID_URL, url)
1157                 if mobj is None:
1158                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1159                         return
1160                 video_id = mobj.group(2)
1161
1162                 # Get video webpage
1163                 self.report_video_webpage_download(video_id)
1164                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id)
1165                 try:
1166                         video_webpage = urllib2.urlopen(request).read()
1167                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1168                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1169                         return
1170
1171                 # Attempt to extract SWF player URL
1172                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1173                 if mobj is not None:
1174                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1175                 else:
1176                         player_url = None
1177
1178                 # Get video info
1179                 self.report_video_info_webpage_download(video_id)
1180                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1181                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1182                                            % (video_id, el_type))
1183                         request = urllib2.Request(video_info_url)
1184                         try:
1185                                 video_info_webpage = urllib2.urlopen(request).read()
1186                                 video_info = parse_qs(video_info_webpage)
1187                                 if 'token' in video_info:
1188                                         break
1189                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1190                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1191                                 return
1192                 if 'token' not in video_info:
1193                         if 'reason' in video_info:
1194                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1195                         else:
1196                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1197                         return
1198
1199                 # Start extracting information
1200                 self.report_information_extraction(video_id)
1201
1202                 # uploader
1203                 if 'author' not in video_info:
1204                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1205                         return
1206                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1207
1208                 # title
1209                 if 'title' not in video_info:
1210                         self._downloader.trouble(u'ERROR: unable to extract video title')
1211                         return
1212                 video_title = urllib.unquote_plus(video_info['title'][0])
1213                 video_title = video_title.decode('utf-8')
1214                 video_title = sanitize_title(video_title)
1215
1216                 # simplified title
1217                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1218                 simple_title = simple_title.strip(ur'_')
1219
1220                 # thumbnail image
1221                 if 'thumbnail_url' not in video_info:
1222                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1223                         video_thumbnail = ''
1224                 else:   # don't panic if we can't find it
1225                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1226
1227                 # upload date
1228                 upload_date = u'NA'
1229                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1230                 if mobj is not None:
1231                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1232                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1233                         for expression in format_expressions:
1234                                 try:
1235                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1236                                 except:
1237                                         pass
1238
1239                 # description
1240                 try:
1241                         lxml.etree
1242                 except NameError:
1243                         video_description = u'No description available.'
1244                         if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1245                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1246                                 if mobj is not None:
1247                                         video_description = mobj.group(1).decode('utf-8')
1248                 else:
1249                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1250                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1251                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1252                         # TODO use another parser
1253
1254                 # token
1255                 video_token = urllib.unquote_plus(video_info['token'][0])
1256
1257                 # Decide which formats to download
1258                 req_format = self._downloader.params.get('format', None)
1259
1260                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1261                         self.report_rtmp_download()
1262                         video_url_list = [(None, video_info['conn'][0])]
1263                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1264                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1265                         url_data = [parse_qs(uds) for uds in url_data_strs]
1266                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1267                         url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1268                         
1269                         format_limit = self._downloader.params.get('format_limit', None)
1270                         if format_limit is not None and format_limit in self._available_formats:
1271                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1272                         else:
1273                                 format_list = self._available_formats
1274                         existing_formats = [x for x in format_list if x in url_map]
1275                         if len(existing_formats) == 0:
1276                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1277                                 return
1278                         if req_format is None:
1279                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1280                         elif req_format == '-1':
1281                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1282                         else:
1283                                 # Specific format
1284                                 if req_format not in url_map:
1285                                         self._downloader.trouble(u'ERROR: requested format not available')
1286                                         return
1287                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
1288                 else:
1289                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1290                         return
1291
1292                 for format_param, video_real_url in video_url_list:
1293                         # At this point we have a new video
1294                         self._downloader.increment_downloads()
1295
1296                         # Extension
1297                         video_extension = self._video_extensions.get(format_param, 'flv')
1298
1299                         try:
1300                                 # Process video information
1301                                 self._downloader.process_info({
1302                                         'id':           video_id.decode('utf-8'),
1303                                         'url':          video_real_url.decode('utf-8'),
1304                                         'uploader':     video_uploader.decode('utf-8'),
1305                                         'upload_date':  upload_date,
1306                                         'title':        video_title,
1307                                         'stitle':       simple_title,
1308                                         'ext':          video_extension.decode('utf-8'),
1309                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1310                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1311                                         'description':  video_description,
1312                                         'player_url':   player_url,
1313                                 })
1314                         except UnavailableVideoError, err:
1315                                 self._downloader.trouble(u'\nERROR: unable to download video')
1316
1317
1318 class MetacafeIE(InfoExtractor):
1319         """Information Extractor for metacafe.com."""
1320
1321         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1322         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1323         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1324         _youtube_ie = None
1325
1326         def __init__(self, youtube_ie, downloader=None):
1327                 InfoExtractor.__init__(self, downloader)
1328                 self._youtube_ie = youtube_ie
1329
1330         @staticmethod
1331         def suitable(url):
1332                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1333
1334         def report_disclaimer(self):
1335                 """Report disclaimer retrieval."""
1336                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1337
1338         def report_age_confirmation(self):
1339                 """Report attempt to confirm age."""
1340                 self._downloader.to_screen(u'[metacafe] Confirming age')
1341
1342         def report_download_webpage(self, video_id):
1343                 """Report webpage download."""
1344                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1345
1346         def report_extraction(self, video_id):
1347                 """Report information extraction."""
1348                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1349
1350         def _real_initialize(self):
1351                 # Retrieve disclaimer
1352                 request = urllib2.Request(self._DISCLAIMER)
1353                 try:
1354                         self.report_disclaimer()
1355                         disclaimer = urllib2.urlopen(request).read()
1356                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1357                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1358                         return
1359
1360                 # Confirm age
1361                 disclaimer_form = {
1362                         'filters': '0',
1363                         'submit': "Continue - I'm over 18",
1364                         }
1365                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1366                 try:
1367                         self.report_age_confirmation()
1368                         disclaimer = urllib2.urlopen(request).read()
1369                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1370                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1371                         return
1372
1373         def _real_extract(self, url):
1374                 # Extract id and simplified title from URL
1375                 mobj = re.match(self._VALID_URL, url)
1376                 if mobj is None:
1377                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1378                         return
1379
1380                 video_id = mobj.group(1)
1381
1382                 # Check if video comes from YouTube
1383                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1384                 if mobj2 is not None:
1385                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1386                         return
1387
1388                 # At this point we have a new video
1389                 self._downloader.increment_downloads()
1390
1391                 simple_title = mobj.group(2).decode('utf-8')
1392
1393                 # Retrieve video webpage to extract further information
1394                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1395                 try:
1396                         self.report_download_webpage(video_id)
1397                         webpage = urllib2.urlopen(request).read()
1398                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1399                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1400                         return
1401
1402                 # Extract URL, uploader and title from webpage
1403                 self.report_extraction(video_id)
1404                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1405                 if mobj is not None:
1406                         mediaURL = urllib.unquote(mobj.group(1))
1407                         video_extension = mediaURL[-3:]
1408
1409                         # Extract gdaKey if available
1410                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1411                         if mobj is None:
1412                                 video_url = mediaURL
1413                         else:
1414                                 gdaKey = mobj.group(1)
1415                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1416                 else:
1417                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1418                         if mobj is None:
1419                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1420                                 return
1421                         vardict = parse_qs(mobj.group(1))
1422                         if 'mediaData' not in vardict:
1423                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1424                                 return
1425                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1426                         if mobj is None:
1427                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1428                                 return
1429                         mediaURL = mobj.group(1).replace('\\/', '/')
1430                         video_extension = mediaURL[-3:]
1431                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1432
1433                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1434                 if mobj is None:
1435                         self._downloader.trouble(u'ERROR: unable to extract title')
1436                         return
1437                 video_title = mobj.group(1).decode('utf-8')
1438                 video_title = sanitize_title(video_title)
1439
1440                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1441                 if mobj is None:
1442                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1443                         return
1444                 video_uploader = mobj.group(1)
1445
1446                 try:
1447                         # Process video information
1448                         self._downloader.process_info({
1449                                 'id':           video_id.decode('utf-8'),
1450                                 'url':          video_url.decode('utf-8'),
1451                                 'uploader':     video_uploader.decode('utf-8'),
1452                                 'upload_date':  u'NA',
1453                                 'title':        video_title,
1454                                 'stitle':       simple_title,
1455                                 'ext':          video_extension.decode('utf-8'),
1456                                 'format':       u'NA',
1457                                 'player_url':   None,
1458                         })
1459                 except UnavailableVideoError:
1460                         self._downloader.trouble(u'\nERROR: unable to download video')
1461
1462
1463 class DailymotionIE(InfoExtractor):
1464         """Information Extractor for Dailymotion"""
1465
1466         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1467
1468         def __init__(self, downloader=None):
1469                 InfoExtractor.__init__(self, downloader)
1470
1471         @staticmethod
1472         def suitable(url):
1473                 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1474
1475         def report_download_webpage(self, video_id):
1476                 """Report webpage download."""
1477                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1478
1479         def report_extraction(self, video_id):
1480                 """Report information extraction."""
1481                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1482
1483         def _real_initialize(self):
1484                 return
1485
1486         def _real_extract(self, url):
1487                 # Extract id and simplified title from URL
1488                 mobj = re.match(self._VALID_URL, url)
1489                 if mobj is None:
1490                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1491                         return
1492
1493                 # At this point we have a new video
1494                 self._downloader.increment_downloads()
1495                 video_id = mobj.group(1)
1496
1497                 simple_title = mobj.group(2).decode('utf-8')
1498                 video_extension = 'flv'
1499
1500                 # Retrieve video webpage to extract further information
1501                 request = urllib2.Request(url)
1502                 try:
1503                         self.report_download_webpage(video_id)
1504                         webpage = urllib2.urlopen(request).read()
1505                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1506                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1507                         return
1508
1509                 # Extract URL, uploader and title from webpage
1510                 self.report_extraction(video_id)
1511                 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1512                 if mobj is None:
1513                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1514                         return
1515                 mediaURL = urllib.unquote(mobj.group(1))
1516
1517                 # if needed add http://www.dailymotion.com/ if relative URL
1518
1519                 video_url = mediaURL
1520
1521                 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1522                 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1523                 if mobj is None:
1524                         self._downloader.trouble(u'ERROR: unable to extract title')
1525                         return
1526                 video_title = mobj.group(1).decode('utf-8')
1527                 video_title = sanitize_title(video_title)
1528
1529                 mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
1530                 if mobj is None:
1531                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1532                         return
1533                 video_uploader = mobj.group(1)
1534
1535                 try:
1536                         # Process video information
1537                         self._downloader.process_info({
1538                                 'id':           video_id.decode('utf-8'),
1539                                 'url':          video_url.decode('utf-8'),
1540                                 'uploader':     video_uploader.decode('utf-8'),
1541                                 'upload_date':  u'NA',
1542                                 'title':        video_title,
1543                                 'stitle':       simple_title,
1544                                 'ext':          video_extension.decode('utf-8'),
1545                                 'format':       u'NA',
1546                                 'player_url':   None,
1547                         })
1548                 except UnavailableVideoError:
1549                         self._downloader.trouble(u'\nERROR: unable to download video')
1550
class GoogleIE(InfoExtractor):
	"""Information extractor for video.google.com."""

	_VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	@staticmethod
	def suitable(url):
		# True when the URL matches a supported video.google.* videoplay link.
		return (re.match(GoogleIE._VALID_URL, url) is not None)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

	def _real_initialize(self):
		# No setup step (login/disclaimer) is required for Google Video.
		return

	def _real_extract(self, url):
		"""Extract video information from a video.google.com videoplay URL."""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		video_extension = 'mp4'

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		# Prefer the direct mp4 download URL; fall back to the escaped flv
		# URL embedded in the player javascript.
		mobj = re.search(r"download_url:'([^']+)'", webpage)
		if mobj is None:
			video_extension = 'flv'
			mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1))
		# The fallback URL contains javascript hex escapes for '=' (\x3d)
		# and '&' (\x26) that must be turned back into literal characters.
		mediaURL = mediaURL.replace('\\x3d', '\x3d')
		mediaURL = mediaURL.replace('\\x26', '\x26')

		video_url = mediaURL

		mobj = re.search(r'<title>(.*)</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		# Collapse every run of non-alphanumeric characters into '_'.
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

		# Extract video description
		mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video description')
			return
		video_description = mobj.group(1).decode('utf-8')
		if not video_description:
			video_description = 'No description available.'

		# Extract video thumbnail
		if self._downloader.params.get('forcethumbnail', False):
			# The thumbnail is fetched from a videosearch query for this
			# exact video id (hence the extra round trip).
			request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
			try:
				webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
				return
			mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
				return
			video_thumbnail = mobj.group(1)
		else:	# we need something to pass to process_info
			video_thumbnail = ''


		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	u'NA',
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1660
1661
1662 class PhotobucketIE(InfoExtractor):
1663         """Information extractor for photobucket.com."""
1664
1665         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1666
1667         def __init__(self, downloader=None):
1668                 InfoExtractor.__init__(self, downloader)
1669
1670         @staticmethod
1671         def suitable(url):
1672                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1673
1674         def report_download_webpage(self, video_id):
1675                 """Report webpage download."""
1676                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1677
1678         def report_extraction(self, video_id):
1679                 """Report information extraction."""
1680                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1681
1682         def _real_initialize(self):
1683                 return
1684
1685         def _real_extract(self, url):
1686                 # Extract id from URL
1687                 mobj = re.match(self._VALID_URL, url)
1688                 if mobj is None:
1689                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1690                         return
1691
1692                 # At this point we have a new video
1693                 self._downloader.increment_downloads()
1694                 video_id = mobj.group(1)
1695
1696                 video_extension = 'flv'
1697
1698                 # Retrieve video webpage to extract further information
1699                 request = urllib2.Request(url)
1700                 try:
1701                         self.report_download_webpage(video_id)
1702                         webpage = urllib2.urlopen(request).read()
1703                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1704                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1705                         return
1706
1707                 # Extract URL, uploader, and title from webpage
1708                 self.report_extraction(video_id)
1709                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1710                 if mobj is None:
1711                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1712                         return
1713                 mediaURL = urllib.unquote(mobj.group(1))
1714
1715                 video_url = mediaURL
1716
1717                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1718                 if mobj is None:
1719                         self._downloader.trouble(u'ERROR: unable to extract title')
1720                         return
1721                 video_title = mobj.group(1).decode('utf-8')
1722                 video_title = sanitize_title(video_title)
1723                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1724
1725                 video_uploader = mobj.group(2).decode('utf-8')
1726
1727                 try:
1728                         # Process video information
1729                         self._downloader.process_info({
1730                                 'id':           video_id.decode('utf-8'),
1731                                 'url':          video_url.decode('utf-8'),
1732                                 'uploader':     video_uploader,
1733                                 'upload_date':  u'NA',
1734                                 'title':        video_title,
1735                                 'stitle':       simple_title,
1736                                 'ext':          video_extension.decode('utf-8'),
1737                                 'format':       u'NA',
1738                                 'player_url':   None,
1739                         })
1740                 except UnavailableVideoError:
1741                         self._downloader.trouble(u'\nERROR: unable to download video')
1742
1743
1744 class YahooIE(InfoExtractor):
1745         """Information extractor for video.yahoo.com."""
1746
1747         # _VALID_URL matches all Yahoo! Video URLs
1748         # _VPAGE_URL matches only the extractable '/watch/' URLs
1749         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1750         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1751
	def __init__(self, downloader=None):
		# Plain delegation to InfoExtractor; YahooIE keeps no extra state.
		InfoExtractor.__init__(self, downloader)
1754
1755         @staticmethod
1756         def suitable(url):
1757                 return (re.match(YahooIE._VALID_URL, url) is not None)
1758
1759         def report_download_webpage(self, video_id):
1760                 """Report webpage download."""
1761                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1762
1763         def report_extraction(self, video_id):
1764                 """Report information extraction."""
1765                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1766
	def _real_initialize(self):
		# Nothing to do before extraction for Yahoo Video.
		return
1769
1770         def _real_extract(self, url, new_video=True):
1771                 # Extract ID from URL
1772                 mobj = re.match(self._VALID_URL, url)
1773                 if mobj is None:
1774                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1775                         return
1776
1777                 # At this point we have a new video
1778                 self._downloader.increment_downloads()
1779                 video_id = mobj.group(2)
1780                 video_extension = 'flv'
1781
1782                 # Rewrite valid but non-extractable URLs as
1783                 # extractable English language /watch/ URLs
1784                 if re.match(self._VPAGE_URL, url) is None:
1785                         request = urllib2.Request(url)
1786                         try:
1787                                 webpage = urllib2.urlopen(request).read()
1788                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1789                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1790                                 return
1791
1792                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1793                         if mobj is None:
1794                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1795                                 return
1796                         yahoo_id = mobj.group(1)
1797
1798                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1799                         if mobj is None:
1800                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1801                                 return
1802                         yahoo_vid = mobj.group(1)
1803
1804                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1805                         return self._real_extract(url, new_video=False)
1806
1807                 # Retrieve video webpage to extract further information
1808                 request = urllib2.Request(url)
1809                 try:
1810                         self.report_download_webpage(video_id)
1811                         webpage = urllib2.urlopen(request).read()
1812                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1813                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1814                         return
1815
1816                 # Extract uploader and title from webpage
1817                 self.report_extraction(video_id)
1818                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1819                 if mobj is None:
1820                         self._downloader.trouble(u'ERROR: unable to extract video title')
1821                         return
1822                 video_title = mobj.group(1).decode('utf-8')
1823                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1824
1825                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1826                 if mobj is None:
1827                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1828                         return
1829                 video_uploader = mobj.group(1).decode('utf-8')
1830
1831                 # Extract video thumbnail
1832                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1833                 if mobj is None:
1834                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1835                         return
1836                 video_thumbnail = mobj.group(1).decode('utf-8')
1837
1838                 # Extract video description
1839                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1840                 if mobj is None:
1841                         self._downloader.trouble(u'ERROR: unable to extract video description')
1842                         return
1843                 video_description = mobj.group(1).decode('utf-8')
1844                 if not video_description: video_description = 'No description available.'
1845
1846                 # Extract video height and width
1847                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1848                 if mobj is None:
1849                         self._downloader.trouble(u'ERROR: unable to extract video height')
1850                         return
1851                 yv_video_height = mobj.group(1)
1852
1853                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1854                 if mobj is None:
1855                         self._downloader.trouble(u'ERROR: unable to extract video width')
1856                         return
1857                 yv_video_width = mobj.group(1)
1858
1859                 # Retrieve video playlist to extract media URL
1860                 # I'm not completely sure what all these options are, but we
1861                 # seem to need most of them, otherwise the server sends a 401.
1862                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1863                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1864                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1865                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1866                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1867                 try:
1868                         self.report_download_webpage(video_id)
1869                         webpage = urllib2.urlopen(request).read()
1870                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1871                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1872                         return
1873
1874                 # Extract media URL from playlist XML
1875                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1876                 if mobj is None:
1877                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1878                         return
1879                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1880                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1881
1882                 try:
1883                         # Process video information
1884                         self._downloader.process_info({
1885                                 'id':           video_id.decode('utf-8'),
1886                                 'url':          video_url,
1887                                 'uploader':     video_uploader,
1888                                 'upload_date':  u'NA',
1889                                 'title':        video_title,
1890                                 'stitle':       simple_title,
1891                                 'ext':          video_extension.decode('utf-8'),
1892                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1893                                 'description':  video_description,
1894                                 'thumbnail':    video_thumbnail,
1895                                 'description':  video_description,
1896                                 'player_url':   None,
1897                         })
1898                 except UnavailableVideoError:
1899                         self._downloader.trouble(u'\nERROR: unable to download video')
1900
1901
1902 class GenericIE(InfoExtractor):
1903         """Generic last-resort information extractor."""
1904
1905         def __init__(self, downloader=None):
1906                 InfoExtractor.__init__(self, downloader)
1907
1908         @staticmethod
1909         def suitable(url):
1910                 return True
1911
1912         def report_download_webpage(self, video_id):
1913                 """Report webpage download."""
1914                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1915                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1916
1917         def report_extraction(self, video_id):
1918                 """Report information extraction."""
1919                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1920
1921         def _real_initialize(self):
1922                 return
1923
1924         def _real_extract(self, url):
1925                 # At this point we have a new video
1926                 self._downloader.increment_downloads()
1927
1928                 video_id = url.split('/')[-1]
1929                 request = urllib2.Request(url)
1930                 try:
1931                         self.report_download_webpage(video_id)
1932                         webpage = urllib2.urlopen(request).read()
1933                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1934                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1935                         return
1936                 except ValueError, err:
1937                         # since this is the last-resort InfoExtractor, if
1938                         # this error is thrown, it'll be thrown here
1939                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1940                         return
1941
1942                 self.report_extraction(video_id)
1943                 # Start with something easy: JW Player in SWFObject
1944                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1945                 if mobj is None:
1946                         # Broaden the search a little bit
1947                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1948                 if mobj is None:
1949                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1950                         return
1951
1952                 # It's possible that one of the regexes
1953                 # matched, but returned an empty group:
1954                 if mobj.group(1) is None:
1955                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1956                         return
1957
1958                 video_url = urllib.unquote(mobj.group(1))
1959                 video_id  = os.path.basename(video_url)
1960
1961                 # here's a fun little line of code for you:
1962                 video_extension = os.path.splitext(video_id)[1][1:]
1963                 video_id        = os.path.splitext(video_id)[0]
1964
1965                 # it's tempting to parse this further, but you would
1966                 # have to take into account all the variations like
1967                 #   Video Title - Site Name
1968                 #   Site Name | Video Title
1969                 #   Video Title - Tagline | Site Name
1970                 # and so on and so forth; it's just not practical
1971                 mobj = re.search(r'<title>(.*)</title>', webpage)
1972                 if mobj is None:
1973                         self._downloader.trouble(u'ERROR: unable to extract title')
1974                         return
1975                 video_title = mobj.group(1).decode('utf-8')
1976                 video_title = sanitize_title(video_title)
1977                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1978
1979                 # video uploader is domain name
1980                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1981                 if mobj is None:
1982                         self._downloader.trouble(u'ERROR: unable to extract title')
1983                         return
1984                 video_uploader = mobj.group(1).decode('utf-8')
1985
1986                 try:
1987                         # Process video information
1988                         self._downloader.process_info({
1989                                 'id':           video_id.decode('utf-8'),
1990                                 'url':          video_url.decode('utf-8'),
1991                                 'uploader':     video_uploader,
1992                                 'upload_date':  u'NA',
1993                                 'title':        video_title,
1994                                 'stitle':       simple_title,
1995                                 'ext':          video_extension.decode('utf-8'),
1996                                 'format':       u'NA',
1997                                 'player_url':   None,
1998                         })
1999                 except UnavailableVideoError, err:
2000                         self._downloader.trouble(u'\nERROR: unable to download video')
2001
2002
2003 class YoutubeSearchIE(InfoExtractor):
2004         """Information Extractor for YouTube search queries."""
2005         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
2006         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2007         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2008         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2009         _youtube_ie = None
2010         _max_youtube_results = 1000
2011
2012         def __init__(self, youtube_ie, downloader=None):
2013                 InfoExtractor.__init__(self, downloader)
2014                 self._youtube_ie = youtube_ie
2015
2016         @staticmethod
2017         def suitable(url):
2018                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
2019
2020         def report_download_page(self, query, pagenum):
2021                 """Report attempt to download playlist page with given number."""
2022                 query = query.decode(preferredencoding())
2023                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2024
2025         def _real_initialize(self):
2026                 self._youtube_ie.initialize()
2027
2028         def _real_extract(self, query):
2029                 mobj = re.match(self._VALID_QUERY, query)
2030                 if mobj is None:
2031                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2032                         return
2033
2034                 prefix, query = query.split(':')
2035                 prefix = prefix[8:]
2036                 query  = query.encode('utf-8')
2037                 if prefix == '':
2038                         self._download_n_results(query, 1)
2039                         return
2040                 elif prefix == 'all':
2041                         self._download_n_results(query, self._max_youtube_results)
2042                         return
2043                 else:
2044                         try:
2045                                 n = long(prefix)
2046                                 if n <= 0:
2047                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2048                                         return
2049                                 elif n > self._max_youtube_results:
2050                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
2051                                         n = self._max_youtube_results
2052                                 self._download_n_results(query, n)
2053                                 return
2054                         except ValueError: # parsing prefix as integer fails
2055                                 self._download_n_results(query, 1)
2056                                 return
2057
2058         def _download_n_results(self, query, n):
2059                 """Downloads a specified number of results for a query"""
2060
2061                 video_ids = []
2062                 already_seen = set()
2063                 pagenum = 1
2064
2065                 while True:
2066                         self.report_download_page(query, pagenum)
2067                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2068                         request = urllib2.Request(result_url)
2069                         try:
2070                                 page = urllib2.urlopen(request).read()
2071                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2072                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2073                                 return
2074
2075                         # Extract video identifiers
2076                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2077                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2078                                 if video_id not in already_seen:
2079                                         video_ids.append(video_id)
2080                                         already_seen.add(video_id)
2081                                         if len(video_ids) == n:
2082                                                 # Specified n videos reached
2083                                                 for id in video_ids:
2084                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2085                                                 return
2086
2087                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2088                                 for id in video_ids:
2089                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2090                                 return
2091
2092                         pagenum = pagenum + 1
2093
2094 class GoogleSearchIE(InfoExtractor):
2095         """Information Extractor for Google Video search queries."""
2096         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
2097         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2098         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2099         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2100         _google_ie = None
2101         _max_google_results = 1000
2102
2103         def __init__(self, google_ie, downloader=None):
2104                 InfoExtractor.__init__(self, downloader)
2105                 self._google_ie = google_ie
2106
2107         @staticmethod
2108         def suitable(url):
2109                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
2110
2111         def report_download_page(self, query, pagenum):
2112                 """Report attempt to download playlist page with given number."""
2113                 query = query.decode(preferredencoding())
2114                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2115
2116         def _real_initialize(self):
2117                 self._google_ie.initialize()
2118
2119         def _real_extract(self, query):
2120                 mobj = re.match(self._VALID_QUERY, query)
2121                 if mobj is None:
2122                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2123                         return
2124
2125                 prefix, query = query.split(':')
2126                 prefix = prefix[8:]
2127                 query  = query.encode('utf-8')
2128                 if prefix == '':
2129                         self._download_n_results(query, 1)
2130                         return
2131                 elif prefix == 'all':
2132                         self._download_n_results(query, self._max_google_results)
2133                         return
2134                 else:
2135                         try:
2136                                 n = long(prefix)
2137                                 if n <= 0:
2138                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2139                                         return
2140                                 elif n > self._max_google_results:
2141                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
2142                                         n = self._max_google_results
2143                                 self._download_n_results(query, n)
2144                                 return
2145                         except ValueError: # parsing prefix as integer fails
2146                                 self._download_n_results(query, 1)
2147                                 return
2148
2149         def _download_n_results(self, query, n):
2150                 """Downloads a specified number of results for a query"""
2151
2152                 video_ids = []
2153                 already_seen = set()
2154                 pagenum = 1
2155
2156                 while True:
2157                         self.report_download_page(query, pagenum)
2158                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2159                         request = urllib2.Request(result_url)
2160                         try:
2161                                 page = urllib2.urlopen(request).read()
2162                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2163                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2164                                 return
2165
2166                         # Extract video identifiers
2167                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2168                                 video_id = mobj.group(1)
2169                                 if video_id not in already_seen:
2170                                         video_ids.append(video_id)
2171                                         already_seen.add(video_id)
2172                                         if len(video_ids) == n:
2173                                                 # Specified n videos reached
2174                                                 for id in video_ids:
2175                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2176                                                 return
2177
2178                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2179                                 for id in video_ids:
2180                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2181                                 return
2182
2183                         pagenum = pagenum + 1
2184
2185 class YahooSearchIE(InfoExtractor):
2186         """Information Extractor for Yahoo! Video search queries."""
2187         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
2188         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2189         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2190         _MORE_PAGES_INDICATOR = r'\s*Next'
2191         _yahoo_ie = None
2192         _max_yahoo_results = 1000
2193
2194         def __init__(self, yahoo_ie, downloader=None):
2195                 InfoExtractor.__init__(self, downloader)
2196                 self._yahoo_ie = yahoo_ie
2197
2198         @staticmethod
2199         def suitable(url):
2200                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2201
2202         def report_download_page(self, query, pagenum):
2203                 """Report attempt to download playlist page with given number."""
2204                 query = query.decode(preferredencoding())
2205                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2206
2207         def _real_initialize(self):
2208                 self._yahoo_ie.initialize()
2209
2210         def _real_extract(self, query):
2211                 mobj = re.match(self._VALID_QUERY, query)
2212                 if mobj is None:
2213                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2214                         return
2215
2216                 prefix, query = query.split(':')
2217                 prefix = prefix[8:]
2218                 query  = query.encode('utf-8')
2219                 if prefix == '':
2220                         self._download_n_results(query, 1)
2221                         return
2222                 elif prefix == 'all':
2223                         self._download_n_results(query, self._max_yahoo_results)
2224                         return
2225                 else:
2226                         try:
2227                                 n = long(prefix)
2228                                 if n <= 0:
2229                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2230                                         return
2231                                 elif n > self._max_yahoo_results:
2232                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
2233                                         n = self._max_yahoo_results
2234                                 self._download_n_results(query, n)
2235                                 return
2236                         except ValueError: # parsing prefix as integer fails
2237                                 self._download_n_results(query, 1)
2238                                 return
2239
2240         def _download_n_results(self, query, n):
2241                 """Downloads a specified number of results for a query"""
2242
2243                 video_ids = []
2244                 already_seen = set()
2245                 pagenum = 1
2246
2247                 while True:
2248                         self.report_download_page(query, pagenum)
2249                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2250                         request = urllib2.Request(result_url)
2251                         try:
2252                                 page = urllib2.urlopen(request).read()
2253                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2254                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2255                                 return
2256
2257                         # Extract video identifiers
2258                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2259                                 video_id = mobj.group(1)
2260                                 if video_id not in already_seen:
2261                                         video_ids.append(video_id)
2262                                         already_seen.add(video_id)
2263                                         if len(video_ids) == n:
2264                                                 # Specified n videos reached
2265                                                 for id in video_ids:
2266                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2267                                                 return
2268
2269                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2270                                 for id in video_ids:
2271                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2272                                 return
2273
2274                         pagenum = pagenum + 1
2275
2276 class YoutubePlaylistIE(InfoExtractor):
2277         """Information Extractor for YouTube playlists."""
2278
2279         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist)\?.*?(p|a)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2280         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2281         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2282         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2283         _youtube_ie = None
2284
2285         def __init__(self, youtube_ie, downloader=None):
2286                 InfoExtractor.__init__(self, downloader)
2287                 self._youtube_ie = youtube_ie
2288
2289         @staticmethod
2290         def suitable(url):
2291                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2292
2293         def report_download_page(self, playlist_id, pagenum):
2294                 """Report attempt to download playlist page with given number."""
2295                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2296
2297         def _real_initialize(self):
2298                 self._youtube_ie.initialize()
2299
2300         def _real_extract(self, url):
2301                 # Extract playlist id
2302                 mobj = re.match(self._VALID_URL, url)
2303                 if mobj is None:
2304                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2305                         return
2306
2307                 # Single video case
2308                 if mobj.group(3) is not None:
2309                         self._youtube_ie.extract(mobj.group(3))
2310                         return
2311
2312                 # Download playlist pages
2313                 # prefix is 'p' as default for playlists but there are other types that need extra care
2314                 playlist_prefix = mobj.group(1)
2315                 if playlist_prefix == 'a':
2316                         playlist_access = 'artist'
2317                 else:
2318                         playlist_prefix = 'p'
2319                         playlist_access = 'view_play_list'
2320                 playlist_id = mobj.group(2)
2321                 video_ids = []
2322                 pagenum = 1
2323
2324                 while True:
2325                         self.report_download_page(playlist_id, pagenum)
2326                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2327                         try:
2328                                 page = urllib2.urlopen(request).read()
2329                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2330                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2331                                 return
2332
2333                         # Extract video identifiers
2334                         ids_in_page = []
2335                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2336                                 if mobj.group(1) not in ids_in_page:
2337                                         ids_in_page.append(mobj.group(1))
2338                         video_ids.extend(ids_in_page)
2339
2340                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2341                                 break
2342                         pagenum = pagenum + 1
2343
2344                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2345                 playlistend = self._downloader.params.get('playlistend', -1)
2346                 video_ids = video_ids[playliststart:playlistend]
2347
2348                 for id in video_ids:
2349                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2350                 return
2351
2352 class YoutubeUserIE(InfoExtractor):
2353         """Information Extractor for YouTube users."""
2354
2355         _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2356         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2357         _GDATA_PAGE_SIZE = 50
2358         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2359         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2360         _youtube_ie = None
2361
2362         def __init__(self, youtube_ie, downloader=None):
2363                 InfoExtractor.__init__(self, downloader)
2364                 self._youtube_ie = youtube_ie
2365
2366         @staticmethod
2367         def suitable(url):
2368                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2369
2370         def report_download_page(self, username, start_index):
2371                 """Report attempt to download user page."""
2372                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2373                                            (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2374
2375         def _real_initialize(self):
2376                 self._youtube_ie.initialize()
2377
2378         def _real_extract(self, url):
2379                 # Extract username
2380                 mobj = re.match(self._VALID_URL, url)
2381                 if mobj is None:
2382                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2383                         return
2384
2385                 username = mobj.group(1)
2386
2387                 # Download video ids using YouTube Data API. Result size per
2388                 # query is limited (currently to 50 videos) so we need to query
2389                 # page by page until there are no video ids - it means we got
2390                 # all of them.
2391
2392                 video_ids = []
2393                 pagenum = 0
2394
2395                 while True:
2396                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2397                         self.report_download_page(username, start_index)
2398
2399                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2400
2401                         try:
2402                                 page = urllib2.urlopen(request).read()
2403                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2404                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2405                                 return
2406
2407                         # Extract video identifiers
2408                         ids_in_page = []
2409
2410                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2411                                 if mobj.group(1) not in ids_in_page:
2412                                         ids_in_page.append(mobj.group(1))
2413
2414                         video_ids.extend(ids_in_page)
2415
2416                         # A little optimization - if current page is not
2417                         # "full", ie. does not contain PAGE_SIZE video ids then
2418                         # we can assume that this page is the last one - there
2419                         # are no more ids on further pages - no need to query
2420                         # again.
2421
2422                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2423                                 break
2424
2425                         pagenum += 1
2426
2427                 all_ids_count = len(video_ids)
2428                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2429                 playlistend = self._downloader.params.get('playlistend', -1)
2430
2431                 if playlistend == -1:
2432                         video_ids = video_ids[playliststart:]
2433                 else:
2434                         video_ids = video_ids[playliststart:playlistend]
2435                         
2436                 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2437                                            (username, all_ids_count, len(video_ids)))
2438
2439                 for video_id in video_ids:
2440                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2441
2442
2443 class DepositFilesIE(InfoExtractor):
2444         """Information extractor for depositfiles.com"""
2445
2446         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2447
2448         def __init__(self, downloader=None):
2449                 InfoExtractor.__init__(self, downloader)
2450
2451         @staticmethod
2452         def suitable(url):
2453                 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2454
2455         def report_download_webpage(self, file_id):
2456                 """Report webpage download."""
2457                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2458
2459         def report_extraction(self, file_id):
2460                 """Report information extraction."""
2461                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2462
2463         def _real_initialize(self):
2464                 return
2465
2466         def _real_extract(self, url):
2467                 # At this point we have a new file
2468                 self._downloader.increment_downloads()
2469
2470                 file_id = url.split('/')[-1]
2471                 # Rebuild url in english locale
2472                 url = 'http://depositfiles.com/en/files/' + file_id
2473
2474                 # Retrieve file webpage with 'Free download' button pressed
2475                 free_download_indication = { 'gateway_result' : '1' }
2476                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2477                 try:
2478                         self.report_download_webpage(file_id)
2479                         webpage = urllib2.urlopen(request).read()
2480                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2481                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2482                         return
2483
2484                 # Search for the real file URL
2485                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2486                 if (mobj is None) or (mobj.group(1) is None):
2487                         # Try to figure out reason of the error.
2488                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2489                         if (mobj is not None) and (mobj.group(1) is not None):
2490                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2491                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2492                         else:
2493                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2494                         return
2495
2496                 file_url = mobj.group(1)
2497                 file_extension = os.path.splitext(file_url)[1][1:]
2498
2499                 # Search for file title
2500                 mobj = re.search(r'<b title="(.*?)">', webpage)
2501                 if mobj is None:
2502                         self._downloader.trouble(u'ERROR: unable to extract title')
2503                         return
2504                 file_title = mobj.group(1).decode('utf-8')
2505
2506                 try:
2507                         # Process file information
2508                         self._downloader.process_info({
2509                                 'id':           file_id.decode('utf-8'),
2510                                 'url':          file_url.decode('utf-8'),
2511                                 'uploader':     u'NA',
2512                                 'upload_date':  u'NA',
2513                                 'title':        file_title,
2514                                 'stitle':       file_title,
2515                                 'ext':          file_extension.decode('utf-8'),
2516                                 'format':       u'NA',
2517                                 'player_url':   None,
2518                         })
2519                 except UnavailableVideoError, err:
2520                         self._downloader.trouble(u'ERROR: unable to download file')
2521
2522 class FacebookIE(InfoExtractor):
2523         """Information Extractor for Facebook"""
2524
2525         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2526         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2527         _NETRC_MACHINE = 'facebook'
2528         _available_formats = ['highqual', 'lowqual']
2529         _video_extensions = {
2530                 'highqual': 'mp4',
2531                 'lowqual': 'mp4',
2532         }
2533
2534         def __init__(self, downloader=None):
2535                 InfoExtractor.__init__(self, downloader)
2536
2537         @staticmethod
2538         def suitable(url):
2539                 return (re.match(FacebookIE._VALID_URL, url) is not None)
2540
2541         def _reporter(self, message):
2542                 """Add header and report message."""
2543                 self._downloader.to_screen(u'[facebook] %s' % message)
2544
2545         def report_login(self):
2546                 """Report attempt to log in."""
2547                 self._reporter(u'Logging in')
2548
2549         def report_video_webpage_download(self, video_id):
2550                 """Report attempt to download video webpage."""
2551                 self._reporter(u'%s: Downloading video webpage' % video_id)
2552
2553         def report_information_extraction(self, video_id):
2554                 """Report attempt to extract video information."""
2555                 self._reporter(u'%s: Extracting video information' % video_id)
2556
2557         def _parse_page(self, video_webpage):
2558                 """Extract video information from page"""
2559                 # General data
2560                 data = {'title': r'class="video_title datawrap">(.*?)</',
2561                         'description': r'<div class="datawrap">(.*?)</div>',
2562                         'owner': r'\("video_owner_name", "(.*?)"\)',
2563                         'upload_date': r'data-date="(.*?)"',
2564                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2565                         }
2566                 video_info = {}
2567                 for piece in data.keys():
2568                         mobj = re.search(data[piece], video_webpage)
2569                         if mobj is not None:
2570                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2571
2572                 # Video urls
2573                 video_urls = {}
2574                 for fmt in self._available_formats:
2575                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2576                         if mobj is not None:
2577                                 # URL is in a Javascript segment inside an escaped Unicode format within
2578                                 # the generally utf-8 page
2579                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2580                 video_info['video_urls'] = video_urls
2581
2582                 return video_info
2583
2584         def _real_initialize(self):
2585                 if self._downloader is None:
2586                         return
2587
2588                 useremail = None
2589                 password = None
2590                 downloader_params = self._downloader.params
2591
2592                 # Attempt to use provided username and password or .netrc data
2593                 if downloader_params.get('username', None) is not None:
2594                         useremail = downloader_params['username']
2595                         password = downloader_params['password']
2596                 elif downloader_params.get('usenetrc', False):
2597                         try:
2598                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2599                                 if info is not None:
2600                                         useremail = info[0]
2601                                         password = info[2]
2602                                 else:
2603                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2604                         except (IOError, netrc.NetrcParseError), err:
2605                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2606                                 return
2607
2608                 if useremail is None:
2609                         return
2610
2611                 # Log in
2612                 login_form = {
2613                         'email': useremail,
2614                         'pass': password,
2615                         'login': 'Log+In'
2616                         }
2617                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2618                 try:
2619                         self.report_login()
2620                         login_results = urllib2.urlopen(request).read()
2621                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2622                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2623                                 return
2624                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2625                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2626                         return
2627
2628         def _real_extract(self, url):
2629                 mobj = re.match(self._VALID_URL, url)
2630                 if mobj is None:
2631                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2632                         return
2633                 video_id = mobj.group('ID')
2634
2635                 # Get video webpage
2636                 self.report_video_webpage_download(video_id)
2637                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2638                 try:
2639                         page = urllib2.urlopen(request)
2640                         video_webpage = page.read()
2641                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2642                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2643                         return
2644
2645                 # Start extracting information
2646                 self.report_information_extraction(video_id)
2647
2648                 # Extract information
2649                 video_info = self._parse_page(video_webpage)
2650
2651                 # uploader
2652                 if 'owner' not in video_info:
2653                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2654                         return
2655                 video_uploader = video_info['owner']
2656
2657                 # title
2658                 if 'title' not in video_info:
2659                         self._downloader.trouble(u'ERROR: unable to extract video title')
2660                         return
2661                 video_title = video_info['title']
2662                 video_title = video_title.decode('utf-8')
2663                 video_title = sanitize_title(video_title)
2664
2665                 # simplified title
2666                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2667                 simple_title = simple_title.strip(ur'_')
2668
2669                 # thumbnail image
2670                 if 'thumbnail' not in video_info:
2671                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2672                         video_thumbnail = ''
2673                 else:
2674                         video_thumbnail = video_info['thumbnail']
2675
2676                 # upload date
2677                 upload_date = u'NA'
2678                 if 'upload_date' in video_info:
2679                         upload_time = video_info['upload_date']
2680                         timetuple = email.utils.parsedate_tz(upload_time)
2681                         if timetuple is not None:
2682                                 try:
2683                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2684                                 except:
2685                                         pass
2686
2687                 # description
2688                 video_description = video_info.get('description', 'No description available.')
2689
2690                 url_map = video_info['video_urls']
2691                 if len(url_map.keys()) > 0:
2692                         # Decide which formats to download
2693                         req_format = self._downloader.params.get('format', None)
2694                         format_limit = self._downloader.params.get('format_limit', None)
2695
2696                         if format_limit is not None and format_limit in self._available_formats:
2697                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2698                         else:
2699                                 format_list = self._available_formats
2700                         existing_formats = [x for x in format_list if x in url_map]
2701                         if len(existing_formats) == 0:
2702                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2703                                 return
2704                         if req_format is None:
2705                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2706                         elif req_format == '-1':
2707                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2708                         else:
2709                                 # Specific format
2710                                 if req_format not in url_map:
2711                                         self._downloader.trouble(u'ERROR: requested format not available')
2712                                         return
2713                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2714
2715                 for format_param, video_real_url in video_url_list:
2716
2717                         # At this point we have a new video
2718                         self._downloader.increment_downloads()
2719
2720                         # Extension
2721                         video_extension = self._video_extensions.get(format_param, 'mp4')
2722
2723                         try:
2724                                 # Process video information
2725                                 self._downloader.process_info({
2726                                         'id':           video_id.decode('utf-8'),
2727                                         'url':          video_real_url.decode('utf-8'),
2728                                         'uploader':     video_uploader.decode('utf-8'),
2729                                         'upload_date':  upload_date,
2730                                         'title':        video_title,
2731                                         'stitle':       simple_title,
2732                                         'ext':          video_extension.decode('utf-8'),
2733                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2734                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2735                                         'description':  video_description.decode('utf-8'),
2736                                         'player_url':   None,
2737                                 })
2738                         except UnavailableVideoError, err:
2739                                 self._downloader.trouble(u'\nERROR: unable to download video')
2740
2741 class BlipTVIE(InfoExtractor):
2742         """Information extractor for blip.tv"""
2743
2744         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip.tv(/.+)$'
2745         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2746
2747         @staticmethod
2748         def suitable(url):
2749                 return (re.match(BlipTVIE._VALID_URL, url) is not None)
2750
2751         def report_extraction(self, file_id):
2752                 """Report information extraction."""
2753                 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2754
2755         def _simplify_title(self, title):
2756                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2757                 res = res.strip(ur'_')
2758                 return res
2759
2760         def _real_extract(self, url):
2761                 mobj = re.match(self._VALID_URL, url)
2762                 if mobj is None:
2763                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2764                         return
2765
2766                 if '?' in url:
2767                         cchar = '&'
2768                 else:
2769                         cchar = '?'
2770                 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2771                 request = urllib2.Request(json_url)
2772                 self.report_extraction(mobj.group(1))
2773                 try:
2774                         json_code = urllib2.urlopen(request).read()
2775                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2776                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2777                         return
2778                 try:
2779                         json_data = json.loads(json_code)
2780                         if 'Post' in json_data:
2781                                 data = json_data['Post']
2782                         else:
2783                                 data = json_data
2784
2785                         upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2786                         video_url = data['media']['url']
2787                         umobj = re.match(self._URL_EXT, video_url)
2788                         if umobj is None:
2789                                 raise ValueError('Can not determine filename extension')
2790                         ext = umobj.group(1)
2791
2792                         self._downloader.increment_downloads()
2793
2794                         info = {
2795                                 'id': data['item_id'],
2796                                 'url': video_url,
2797                                 'uploader': data['display_name'],
2798                                 'upload_date': upload_date,
2799                                 'title': data['title'],
2800                                 'stitle': self._simplify_title(data['title']),
2801                                 'ext': ext,
2802                                 'format': data['media']['mimeType'],
2803                                 'thumbnail': data['thumbnailUrl'],
2804                                 'description': data['description'],
2805                                 'player_url': data['embedUrl']
2806                         }
2807                 except (ValueError,KeyError), err:
2808                         self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2809                         return
2810
2811                 try:
2812                         self._downloader.process_info(info)
2813                 except UnavailableVideoError, err:
2814                         self._downloader.trouble(u'\nERROR: unable to download video')
2815
2816
class PostProcessor(object):
	"""Base class for post-download processing steps.

	Instances are attached to a downloader via its add_post_processor()
	method. After each successful download the downloader walks its chain
	of PostProcessors, feeding the first one an initial information
	dictionary and each subsequent one whatever its predecessor returned.

	The chain ends when a processor returns None or when the last
	processor has run.

	Like InfoExtractor objects, PostProcessors register mutually with
	their downloader.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach the given downloader to this post processor."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file.

		The "information" argument is an InfoExtractor-style dictionary
		extended with a "filepath" key naming the downloaded file.

		Returning None halts the postprocessing chain; returning an
		information dictionary (possibly the received one with some
		fields changed) passes it on to the next processor in the
		chain.

		Implementations may also raise a PostProcessingError, which the
		calling downloader takes into account.
		"""
		return information # by default, do nothing
2862
class FFmpegExtractAudioPP(PostProcessor):
	"""Post processor that replaces a downloaded video with its audio track.

	Uses ffprobe to detect the source audio codec, then ffmpeg to either
	copy the stream losslessly (aac/mp3) or transcode it, and finally
	deletes the original video file.
	"""

	def __init__(self, downloader=None, preferredcodec=None):
		PostProcessor.__init__(self, downloader)
		# 'best' means: keep aac/mp3 as-is, transcode anything else to mp3.
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec

	@staticmethod
	def get_audio_codec(path):
		"""Return the codec name of the audio stream in path, or None on failure.

		Scans ffprobe's -show_streams dump: the most recent codec_name line
		is returned as soon as a codec_type=audio line confirms the stream
		it belongs to is audio (codec_name precedes codec_type within a
		stream block -- assumption based on this parsing order; TODO confirm
		against ffprobe output format).
		"""
		try:
			# '--' presumably ends option parsing so an odd path is safe --
			# NOTE(review): verify ffprobe accepts a bare '--' argument.
			cmd = ['ffprobe', '-show_streams', '--', path]
			handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
			output = handle.communicate()[0]
			if handle.wait() != 0:
				return None
		except (IOError, OSError):
			return None
		audio_codec = None
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Run ffmpeg to extract audio from path into out_path.

		codec is the -acodec value; more_opts are extra ffmpeg arguments.
		Returns True on success (exit status 0), False otherwise.
		"""
		try:
			# -vn drops the video stream; output and errors are discarded.
			cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
			ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
			return (ret == 0)
		except (IOError, OSError):
			return False

	def run(self, information):
		"""Extract the audio track of information['filepath'].

		On success, removes the video file, points 'filepath' at the new
		audio file and returns the updated dictionary. Returns None (which
		stops the postprocessing chain) on any failure.
		"""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		# Choose output codec, file extension and extra ffmpeg options.
		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
			if filecodec == 'aac' or filecodec == 'mp3':
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					# Raw aac needs an ADTS container to be playable standalone.
					more_opts = ['-f', 'adts']
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = ['-ab', '128k']
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = ['-ab', '128k']
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']

		# Output file: same name as the input, audio extension.
		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		status = self.run_ffmpeg(path, new_path, acodec, more_opts)

		if not status:
			self._downloader.to_stderr(u'WARNING: error running ffmpeg')
			return None

		# Remove the original video; a failure here still aborts the chain
		# but the extracted audio file is left in place.
		try:
			os.remove(path)
		except (IOError, OSError):
			self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
			return None

		information['filepath'] = new_path
		return information
2944
2945 ### MAIN PROGRAM ###
2946 if __name__ == '__main__':
2947         try:
2948                 # Modules needed only when running the main program
2949                 import getpass
2950                 import optparse
2951
2952                 # Function to update the program file with the latest version from the repository.
2953                 def update_self(downloader, filename):
2954                         # Note: downloader only used for options
2955                         if not os.access(filename, os.W_OK):
2956                                 sys.exit('ERROR: no write permissions on %s' % filename)
2957
2958                         downloader.to_screen('Updating to latest stable version...')
2959                         try:
2960                                 latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2961                                 latest_version = urllib.urlopen(latest_url).read().strip()
2962                                 prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2963                                 newcontent = urllib.urlopen(prog_url).read()
2964                         except (IOError, OSError), err:
2965                                 sys.exit('ERROR: unable to download latest version')
2966                         try:
2967                                 stream = open(filename, 'w')
2968                                 stream.write(newcontent)
2969                                 stream.close()
2970                         except (IOError, OSError), err:
2971                                 sys.exit('ERROR: unable to overwrite current version')
2972                         downloader.to_screen('Updated to version %s' % latest_version)
2973
		# Parse command line.
		# conflict_handler='resolve' lets the explicit -h/-v definitions
		# below replace optparse's built-in --help/--version handlers
		# instead of raising an OptionConflictError.
		parser = optparse.OptionParser(
			usage='Usage: %prog [options] url...',
			version='2011.07.09-phihag',
			conflict_handler='resolve',
		)

		# General options (no OptionGroup); the order of add_option calls
		# is the order shown in --help output.
		parser.add_option('-h', '--help',
				action='help', help='print this help text and exit')
		parser.add_option('-v', '--version',
				action='version', help='print program version and exit')
		parser.add_option('-U', '--update',
				action='store_true', dest='update_self', help='update this program to latest stable version')
		parser.add_option('-i', '--ignore-errors',
				action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
		# ratelimit/retries/playliststart/playlistend arrive as strings
		# here; they are converted and validated after parse_args().
		parser.add_option('-r', '--rate-limit',
				dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
		parser.add_option('-R', '--retries',
				dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
		parser.add_option('--playlist-start',
				dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
		parser.add_option('--playlist-end',
				dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
		parser.add_option('--dump-user-agent',
				action='store_true', dest='dump_user_agent',
				help='display the current browser identification', default=False)

		# Authentication options.
		authentication = optparse.OptionGroup(parser, 'Authentication Options')
		authentication.add_option('-u', '--username',
				dest='username', metavar='USERNAME', help='account username')
		authentication.add_option('-p', '--password',
				dest='password', metavar='PASSWORD', help='account password')
		authentication.add_option('-n', '--netrc',
				action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
		parser.add_option_group(authentication)

		# Video format selection options. --all-formats is implemented as
		# a store_const alias for --format -1.
		video_format = optparse.OptionGroup(parser, 'Video Format Options')
		video_format.add_option('-f', '--format',
				action='store', dest='format', metavar='FORMAT', help='video format code')
		video_format.add_option('--all-formats',
				action='store_const', dest='format', help='download all available video formats', const='-1')
		video_format.add_option('--max-quality',
				action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
		parser.add_option_group(video_format)

		# Verbosity / simulation options. The --get-* flags each imply
		# quiet+simulate (enforced when building the FileDownloader below).
		verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
		verbosity.add_option('-q', '--quiet',
				action='store_true', dest='quiet', help='activates quiet mode', default=False)
		verbosity.add_option('-s', '--simulate',
				action='store_true', dest='simulate', help='do not download video', default=False)
		verbosity.add_option('-g', '--get-url',
				action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
		verbosity.add_option('-e', '--get-title',
				action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
		verbosity.add_option('--get-thumbnail',
				action='store_true', dest='getthumbnail',
				help='simulate, quiet but print thumbnail URL', default=False)
		verbosity.add_option('--get-description',
				action='store_true', dest='getdescription',
				help='simulate, quiet but print video description', default=False)
		verbosity.add_option('--get-filename',
				action='store_true', dest='getfilename',
				help='simulate, quiet but print output filename', default=False)
		verbosity.add_option('--no-progress',
				action='store_true', dest='noprogress', help='do not print progress bar', default=False)
		verbosity.add_option('--console-title',
				action='store_true', dest='consoletitle',
				help='display progress in console titlebar', default=False)
		parser.add_option_group(verbosity)

		# Filesystem options: naming templates, batch input, overwrite /
		# resume behavior, cookies and sidecar metadata files.
		filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
		filesystem.add_option('-t', '--title',
				action='store_true', dest='usetitle', help='use title in file name', default=False)
		filesystem.add_option('-l', '--literal',
				action='store_true', dest='useliteral', help='use literal title in file name', default=False)
		filesystem.add_option('-A', '--auto-number',
				action='store_true', dest='autonumber',
				help='number downloaded files starting from 00000', default=False)
		filesystem.add_option('-o', '--output',
				dest='outtmpl', metavar='TEMPLATE', help='output filename template')
		filesystem.add_option('-a', '--batch-file',
				dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
		filesystem.add_option('-w', '--no-overwrites',
				action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
		filesystem.add_option('-c', '--continue',
				action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
		filesystem.add_option('--cookies',
				dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
		filesystem.add_option('--no-part',
				action='store_true', dest='nopart', help='do not use .part files', default=False)
		# --no-mtime uses store_false on a True default: presence of the
		# flag disables the Last-modified-based mtime restore.
		filesystem.add_option('--no-mtime',
				action='store_false', dest='updatetime',
				help='do not use the Last-modified header to set the file modification time', default=True)
		filesystem.add_option('--write-description',
				action='store_true', dest='writedescription',
				help='write video description to a .description file', default=False)
		filesystem.add_option('--write-info-json',
				action='store_true', dest='writeinfojson',
				help='write video metadata to a .info.json file', default=False)
		parser.add_option_group(filesystem)

		# Post-processing options (audio extraction via ffmpeg/ffprobe).
		postproc = optparse.OptionGroup(parser, 'Post-processing Options')
		postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
				help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
		postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
				help='"best", "aac" or "mp3"; best by default')
		parser.add_option_group(postproc)

		# opts: parsed option values; args: positional URL arguments.
		(opts, args) = parser.parse_args()
3083
		# Open appropriate CookieJar: in-memory only by default, or a
		# Mozilla-format file jar when --cookies was given.
		if opts.cookiefile is None:
			jar = cookielib.CookieJar()
		else:
			try:
				jar = cookielib.MozillaCookieJar(opts.cookiefile)
				# Only load if the file already exists and is readable;
				# a missing file is fine (it will be created on save).
				if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
					jar.load()
			except (IOError, OSError), err:
				sys.exit(u'ERROR: unable to open cookie file')

		# Dump user agent: print the UA string and exit immediately,
		# before any network setup or URL handling.
		if opts.dump_user_agent:
			print std_headers['User-Agent']
			sys.exit(0)

		# General configuration: install a global urllib2 opener with
		# proxy support, the cookie jar, and the custom YoutubeDLHandler,
		# so every subsequent urlopen() in the program uses them.
		cookie_processor = urllib2.HTTPCookieProcessor(jar)
		urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
		socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3104
		# Batch file verification: read URLs from --batch-file (or stdin
		# for '-'), strip whitespace, and drop blank lines and lines
		# starting with '#', '/' or ';' (treated as comments).
		batchurls = []
		if opts.batchfile is not None:
			try:
				if opts.batchfile == '-':
					batchfd = sys.stdin
				else:
					batchfd = open(opts.batchfile, 'r')
				batchurls = batchfd.readlines()
				batchurls = [x.strip() for x in batchurls]
				batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
			except IOError:
				sys.exit(u'ERROR: batch file could not be read')
		# Batch-file URLs are processed before command-line URLs.
		all_urls = batchurls + args
3119
		# Conflicting, missing and erroneous options. Each parser.error()
		# prints usage plus the message and exits; check order defines
		# which problem is reported first.
		if opts.usenetrc and (opts.username is not None or opts.password is not None):
			parser.error(u'using .netrc conflicts with giving username/password')
		if opts.password is not None and opts.username is None:
			parser.error(u'account username missing')
		if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
			parser.error(u'using output template conflicts with using title, literal title or auto number')
		if opts.usetitle and opts.useliteral:
			parser.error(u'using title conflicts with using literal title')
		# Prompt interactively for a missing password.
		# NOTE(review): getpass is assumed to be imported earlier in the
		# file (it is not in the visible top-of-file imports) — confirm.
		if opts.username is not None and opts.password is None:
			opts.password = getpass.getpass(u'Type account password and press return:')
		# Convert the string-valued numeric options in place; each one is
		# validated and replaced by its numeric form.
		if opts.ratelimit is not None:
			numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
			if numeric_limit is None:
				parser.error(u'invalid rate limit specified')
			opts.ratelimit = numeric_limit
		if opts.retries is not None:
			try:
				opts.retries = long(opts.retries)
			except (TypeError, ValueError), err:
				parser.error(u'invalid retry count specified')
		try:
			opts.playliststart = long(opts.playliststart)
			if opts.playliststart <= 0:
				raise ValueError
		except (TypeError, ValueError), err:
			parser.error(u'invalid playlist start number specified')
		try:
			opts.playlistend = long(opts.playlistend)
			# -1 means "until the end"; any other value must be positive
			# and not precede the start index.
			if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
				raise ValueError
		except (TypeError, ValueError), err:
			parser.error(u'invalid playlist end number specified')
		if opts.extractaudio:
			if opts.audioformat not in ['best', 'aac', 'mp3']:
				parser.error(u'invalid audio format specified')
3156
		# Information extractors. Some IEs take another IE as a
		# constructor argument (e.g. the playlist/user/search extractors
		# delegate individual videos to youtube_ie).
		youtube_ie = YoutubeIE()
		metacafe_ie = MetacafeIE(youtube_ie)
		dailymotion_ie = DailymotionIE()
		youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
		youtube_user_ie = YoutubeUserIE(youtube_ie)
		youtube_search_ie = YoutubeSearchIE(youtube_ie)
		google_ie = GoogleIE()
		google_search_ie = GoogleSearchIE(google_ie)
		photobucket_ie = PhotobucketIE()
		yahoo_ie = YahooIE()
		yahoo_search_ie = YahooSearchIE(yahoo_ie)
		deposit_files_ie = DepositFilesIE()
		facebook_ie = FacebookIE()
		bliptv_ie = BlipTVIE()
		generic_ie = GenericIE()

		# File downloader: all behavior flags collected into one params
		# dict. Any --get-* flag forces both quiet and simulate.
		fd = FileDownloader({
			'usenetrc': opts.usenetrc,
			'username': opts.username,
			'password': opts.password,
			'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
			'forceurl': opts.geturl,
			'forcetitle': opts.gettitle,
			'forcethumbnail': opts.getthumbnail,
			'forcedescription': opts.getdescription,
			'forcefilename': opts.getfilename,
			'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
			'format': opts.format,
			'format_limit': opts.format_limit,
			# Output template: an explicit -o template wins; otherwise the
			# or-chain picks the first template whose flags are all set,
			# falling through to the bare '%(id)s.%(ext)s' default. The
			# chain's order is significant.
			'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
				or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
				or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
				or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
				or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
				or u'%(id)s.%(ext)s'),
			'ignoreerrors': opts.ignoreerrors,
			'ratelimit': opts.ratelimit,
			'nooverwrites': opts.nooverwrites,
			'retries': opts.retries,
			'continuedl': opts.continue_dl,
			'noprogress': opts.noprogress,
			'playliststart': opts.playliststart,
			'playlistend': opts.playlistend,
			# '-o -' means write the video to stdout, so screen output
			# must be diverted to stderr.
			'logtostderr': opts.outtmpl == '-',
			'consoletitle': opts.consoletitle,
			'nopart': opts.nopart,
			'updatetime': opts.updatetime,
			'writedescription': opts.writedescription,
			'writeinfojson': opts.writeinfojson,
			})
		# Registration order matters: extractors are tried in this order,
		# so the more specific search/playlist/user IEs come before the
		# plain video IEs they wrap.
		fd.add_info_extractor(youtube_search_ie)
		fd.add_info_extractor(youtube_pl_ie)
		fd.add_info_extractor(youtube_user_ie)
		fd.add_info_extractor(metacafe_ie)
		fd.add_info_extractor(dailymotion_ie)
		fd.add_info_extractor(youtube_ie)
		fd.add_info_extractor(google_ie)
		fd.add_info_extractor(google_search_ie)
		fd.add_info_extractor(photobucket_ie)
		fd.add_info_extractor(yahoo_ie)
		fd.add_info_extractor(yahoo_search_ie)
		fd.add_info_extractor(deposit_files_ie)
		fd.add_info_extractor(facebook_ie)
		fd.add_info_extractor(bliptv_ie)

		# This must come last since it's the
		# fallback if none of the others work
		fd.add_info_extractor(generic_ie)
3231
		# PostProcessors: currently only audio extraction via ffmpeg.
		if opts.extractaudio:
			fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))

		# Update version: -U replaces this script in place (update_self is
		# defined earlier in the file) and may be combined with URLs.
		if opts.update_self:
			update_self(fd, sys.argv[0])

		# Maybe do nothing: no URLs is an error unless -U was given, in
		# which case the update alone was the requested action.
		if len(all_urls) < 1:
			if not opts.update_self:
				parser.error(u'you must provide at least one URL')
			else:
				sys.exit()
		# Download everything; retcode is the aggregate result returned
		# by FileDownloader.download and becomes the process exit status.
		retcode = fd.download(all_urls)

		# Dump cookie jar if requested. Note: only reached when download()
		# returns normally; a raised exception skips the save.
		if opts.cookiefile is not None:
			try:
				jar.save()
			except (IOError, OSError), err:
				sys.exit(u'ERROR: unable to save cookie jar')

		sys.exit(retcode)
3256
3257         except DownloadError:
3258                 sys.exit(1)
3259         except SameFileError:
3260                 sys.exit(u'ERROR: fixed output name but more than one file to download')
3261         except KeyboardInterrupt:
3262                 sys.exit(u'\nERROR: Interrupted by user')