Fix youtube downloads (Closes #135)
[youtube-dl.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # Author: Vasyl' Vavrychuk
7 # Author: Witold Baryluk
8 # Author: Paweł Paprota
9 # Author: Gergely Imreh
10 # Author: Philipp Hagemeister <phihag@phihag.de>
11 # License: Public domain code
12 from __future__ import with_statement
13 import contextlib
14 import cookielib
15 import ctypes
16 import datetime
17 import email.utils
18 import gzip
19 import htmlentitydefs
20 import httplib
21 import locale
22 import math
23 import netrc
24 import os
25 import os.path
26 import re
27 import socket
28 import string
29 import subprocess
30 import sys
31 import time
32 import urllib
33 import urllib2
34 import warnings
35 import zlib
36
37 try:
38         import cStringIO as StringIO
39 except ImportError:
40         import StringIO
41
42 # parse_qs was moved from the cgi module to the urlparse module recently.
43 try:
44         from urlparse import parse_qs
45 except ImportError:
46         from cgi import parse_qs
47
48 try:
49         import lxml.etree
50 except ImportError: # Python < 2.6
51         pass # Handled below
52
# Default HTTP headers sent with every request; mimics a desktop Firefox 4
# beta so sites serve the same pages they would to a regular browser.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:2.0b11) Gecko/20100101 Firefox/4.0b11',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	'Accept-Encoding': 'gzip, deflate',
	'Accept-Language': 'en-us,en;q=0.5',
}

# Characters considered safe in "simple" titles: ASCII letters and digits,
# as unicode objects (.decode() is the Python 2 str -> unicode conversion).
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
62
try:
	import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
	import re
	class json(object):
		# Minimal pure-Python stand-in for the stdlib json module; only
		# loads() is implemented, which is all this program needs.
		@staticmethod
		def loads(s):
			# Parse a UTF-8 byte string; returns the decoded JSON value.
			s = s.decode('UTF-8')
			def raiseError(msg, i):
				# Every parse error reports the offending position and the input tail.
				raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
			def skipSpace(i, expectMore=True):
				# Advance i past JSON whitespace; optionally fail on premature end.
				while i < len(s) and s[i] in ' \t\r\n':
					i += 1
				if expectMore:
					if i >= len(s):
						raiseError('Premature end', i)
				return i
			def decodeEscape(match):
				# Translate one backslash escape captured by rexp (in parseString).
				esc = match.group(1)
				_STATIC = {
					'"': '"',
					'\\': '\\',
					'/': '/',
					'b': unichr(0x8),
					'f': unichr(0xc),
					'n': '\n',
					'r': '\r',
					't': '\t',
				}
				if esc in _STATIC:
					return _STATIC[esc]
				if esc[0] == 'u':
					if len(esc) == 1+4:
						return unichr(int(esc[1:5], 16))
					if len(esc) == 5+6 and esc[5:7] == '\\u':
						# UTF-16 surrogate pair: combine both \uXXXX units into one code point.
						hi = int(esc[1:5], 16)
						low = int(esc[7:11], 16)
						return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
				raise ValueError('Unknown escape ' + str(esc))
			def parseString(i):
				# i points at the opening quote; returns (index past closing quote, text).
				i += 1
				e = i
				while True:
					e = s.index('"', e)
					# Count backslashes immediately before the quote: an odd
					# count means this quote is escaped, keep scanning.
					bslashes = 0
					while s[e-bslashes-1] == '\\':
						bslashes += 1
					if bslashes % 2 == 1:
						e += 1
						continue
					break
				rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
				stri = rexp.sub(decodeEscape, s[i:e])
				return (e+1,stri)
			def parseObj(i):
				# i points at '{'; returns (index past '}', dict).
				i += 1
				res = {}
				i = skipSpace(i)
				if s[i] == '}': # Empty dictionary
					return (i+1,res)
				while True:
					if s[i] != '"':
						raiseError('Expected a string object key', i)
					i,key = parseString(i)
					i = skipSpace(i)
					if i >= len(s) or s[i] != ':':
						raiseError('Expected a colon', i)
					i,val = parse(i+1)
					res[key] = val
					i = skipSpace(i)
					if s[i] == '}':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected comma or closing curly brace', i)
					i = skipSpace(i+1)
			def parseArray(i):
				# i points at '['; returns (index past ']', list).
				res = []
				i = skipSpace(i+1)
				if s[i] == ']': # Empty array
					return (i+1,res)
				while True:
					i,val = parse(i)
					res.append(val)
					i = skipSpace(i) # Raise exception if premature end
					if s[i] == ']':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected a comma or closing bracket', i)
					i = skipSpace(i+1)
			def parseDiscrete(i):
				# Parse the bare literals true / false / null.
				for k,v in {'true': True, 'false': False, 'null': None}.items():
					if s.startswith(k, i):
						return (i+len(k), v)
				raiseError('Not a boolean (or null)', i)
			def parseNumber(i):
				mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
				if mobj is None:
					raiseError('Not a number', i)
				nums = mobj.group(1)
				# A fraction or exponent forces float; otherwise keep an int.
				if '.' in nums or 'e' in nums or 'E' in nums:
					return (i+len(nums), float(nums))
				return (i+len(nums), int(nums))
			# Dispatch on the value's first character; anything else is a number.
			CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
			def parse(i):
				i = skipSpace(i)
				i,res = CHARMAP.get(s[i], parseNumber)(i)
				i = skipSpace(i, False)
				return (i,res)
			i,res = parse(0)
			# Trailing non-whitespace after the top-level value is an error.
			if i < len(s):
				raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
			return res
175
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	# The original wrapped this in a generator and called .next() on it,
	# which added nothing; the bare "except:" also swallowed
	# KeyboardInterrupt/SystemExit. A plain try/except is equivalent.
	try:
		pref = locale.getpreferredencoding()
		# Verify the advertised codec actually exists before trusting it.
		u'TEST'.encode(pref)
	except Exception:
		pref = 'UTF-8'
	return pref
191
192 def htmlentity_transform(matchobj):
193         """Transforms an HTML entity to a Unicode character.
194
195         This function receives a match object and is intended to be used with
196         the re.sub() function.
197         """
198         entity = matchobj.group(1)
199
200         # Known non-numeric HTML entity
201         if entity in htmlentitydefs.name2codepoint:
202                 return unichr(htmlentitydefs.name2codepoint[entity])
203
204         # Unicode character
205         mobj = re.match(ur'(?u)#(x?\d+)', entity)
206         if mobj is not None:
207                 numstr = mobj.group(1)
208                 if numstr.startswith(u'x'):
209                         base = 16
210                         numstr = u'0%s' % numstr
211                 else:
212                         base = 10
213                 return unichr(long(numstr, base))
214
215         # Unknown entity in name, return its literal representation
216         return (u'&%s;' % entity)
217
def sanitize_title(utitle):
	"""Sanitizes a video title so it could be used as part of a filename."""
	# Resolve HTML entities first, then neutralize the path separator so the
	# title cannot escape into parent/child directories.
	utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
	return utitle.replace(unicode(os.sep), u'%')
222
def sanitize_open(filename, open_mode):
	"""Try to open the given filename, and slightly tweak it if this fails.

	Attempts to open the given filename. If this fails, it tries to change
	the filename slightly, step by step, until it's either able to open it
	or it fails and raises a final exception, like the standard open()
	function.

	It returns the tuple (stream, definitive_file_name).
	"""
	try:
		if filename == u'-':
			# '-' means stdout; on Windows it must be switched to binary
			# mode or the video data would be newline-mangled.
			if sys.platform == 'win32':
				import msvcrt
				msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
			return (sys.stdout, filename)
		stream = open(filename, open_mode)
		return (stream, filename)
	except (IOError, OSError), err:
		# In case of error, try to remove win32 forbidden chars
		filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

		# An exception here should be caught in the caller
		stream = open(filename, open_mode)
		return (stream, filename)
248
def timeconvert(timestr):
	"""Convert RFC 2822 defined time string into system timestamp.

	Returns None when the string cannot be parsed.
	"""
	# Re-indented with tabs: this was the only 4-space-indented function in
	# an otherwise tab-indented file.
	timestamp = None
	timetuple = email.utils.parsedate_tz(timestr)
	if timetuple is not None:
		timestamp = email.utils.mktime_tz(timetuple)
	return timestamp
256
class DownloadError(Exception):
	"""Download Error exception.

	Thrown by FileDownloader objects that are not configured to continue
	on errors; carries the appropriate error message.
	"""
265
class SameFileError(Exception):
	"""Same File exception.

	Thrown by FileDownloader objects when they detect that multiple files
	would have to be downloaded to the same file on disk.
	"""
273
class PostProcessingError(Exception):
	"""Post Processing exception.

	Raised by a PostProcessor's .run() method to indicate an error in the
	postprocessing task.
	"""
281
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	Thrown when a video is requested in a format that is not available for
	that video.
	"""
289
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	Raised by FileDownloader objects when a downloaded file turns out
	smaller than the size the server announced first, which indicates the
	connection was probably interrupted.
	"""
	# Byte counts; populated per instance by __init__.
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		self.expected = expected
		self.downloaded = downloaded
304
class YoutubeDLHandler(urllib2.HTTPHandler):
	"""Handler for HTTP requests and responses.

	This class, when installed with an OpenerDirector, automatically adds
	the standard headers to every HTTP request and handles gzipped and
	deflated responses from web servers. If compression is to be avoided in
	a particular request, the original request in the program code only has
	to include the HTTP header "Youtubedl-No-Compression", which will be
	removed before making the real request.

	Part of this code was copied from:

	  http://techknack.net/python-urllib2-handlers/

	Andrew Rowls, the author of that code, agreed to release it to the
	public domain.
	"""

	@staticmethod
	def deflate(data):
		# Some servers send raw deflate streams (no zlib header); try the
		# raw form first and fall back to the standard zlib wrapper.
		try:
			return zlib.decompress(data, -zlib.MAX_WBITS)
		except zlib.error:
			return zlib.decompress(data)

	@staticmethod
	def addinfourl_wrapper(stream, headers, url, code):
		# addinfourl only grew a 'code' constructor argument in newer
		# Pythons; emulate it on older versions by setting the attribute.
		if hasattr(urllib2.addinfourl, 'getcode'):
			return urllib2.addinfourl(stream, headers, url, code)
		ret = urllib2.addinfourl(stream, headers, url)
		ret.code = code
		return ret

	def http_request(self, req):
		# Force the standard headers, replacing any same-named header the
		# caller may have set.
		for h in std_headers:
			if h in req.headers:
				del req.headers[h]
			req.add_header(h, std_headers[h])
		# NOTE: urllib2 capitalizes header names, so the marker added as
		# 'Youtubedl-No-Compression' is stored under this spelling.
		if 'Youtubedl-no-compression' in req.headers:
			if 'Accept-encoding' in req.headers:
				del req.headers['Accept-encoding']
			del req.headers['Youtubedl-no-compression']
		return req

	def http_response(self, req, resp):
		old_resp = resp
		# gzip
		if resp.headers.get('Content-encoding', '') == 'gzip':
			gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		# deflate
		if resp.headers.get('Content-encoding', '') == 'deflate':
			gz = StringIO.StringIO(self.deflate(resp.read()))
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		return resp
362
363 class FileDownloader(object):
364         """File Downloader class.
365
366         File downloader objects are the ones responsible of downloading the
367         actual video file and writing it to disk if the user has requested
368         it, among some other tasks. In most cases there should be one per
369         program. As, given a video URL, the downloader doesn't know how to
370         extract all the needed information, task that InfoExtractors do, it
371         has to pass the URL to one of them.
372
373         For this, file downloader objects have a method that allows
374         InfoExtractors to be registered in a given order. When it is passed
375         a URL, the file downloader handles it to the first InfoExtractor it
376         finds that reports being able to handle it. The InfoExtractor extracts
377         all the information about the video or videos the URL refers to, and
378         asks the FileDownloader to process the video information, possibly
379         downloading the video.
380
381         File downloaders accept a lot of parameters. In order not to saturate
382         the object constructor with arguments, it receives a dictionary of
383         options instead. These options are available through the params
384         attribute for the InfoExtractors to use. The FileDownloader also
385         registers itself as the downloader in charge for the InfoExtractors
386         that are added to it, so this is a "mutual registration".
387
388         Available options:
389
390         username:         Username for authentication purposes.
391         password:         Password for authentication purposes.
392         usenetrc:         Use netrc for authentication instead.
393         quiet:            Do not print messages to stdout.
394         forceurl:         Force printing final URL.
395         forcetitle:       Force printing title.
396         forcethumbnail:   Force printing thumbnail URL.
397         forcedescription: Force printing description.
398         forcefilename:    Force printing final filename.
399         simulate:         Do not download the video files.
400         format:           Video format code.
401         format_limit:     Highest quality format to try.
402         outtmpl:          Template for output names.
403         ignoreerrors:     Do not stop on download errors.
404         ratelimit:        Download speed limit, in bytes/sec.
405         nooverwrites:     Prevent overwriting files.
406         retries:          Number of times to retry for HTTP error 5xx
407         continuedl:       Try to continue downloads if possible.
408         noprogress:       Do not print the progress bar.
409         playliststart:    Playlist item to start at.
410         playlistend:      Playlist item to end at.
411         logtostderr:      Log messages to stderr instead of stdout.
412         consoletitle:     Display progress in console window's titlebar.
413         nopart:           Do not use temporary .part files.
414         updatetime:       Use the Last-modified header to set output file timestamps.
415         writedescription: Write the video description to a .description file
416         writeinfojson:    Write the video description to a .info.json file
417         """
418
	# Class-level placeholders; each instance gets real values in __init__.
	params = None            # Dictionary of options (see class docstring)
	_ies = []                # Registered InfoExtractor objects
	_pps = []                # Registered PostProcessor objects
	_download_retcode = None # Exit code to report (0 ok, 1 after errors)
	_num_downloads = None    # Ordinal of the current download (%(autonumber)s)
	_screen_file = None      # Stream used by to_screen (stdout or stderr)
425
	def __init__(self, params):
		"""Create a FileDownloader object with the given options."""
		self._ies = []
		self._pps = []
		self._download_retcode = 0
		self._num_downloads = 0
		# Indexing with the boolean: False -> stdout, True -> stderr.
		self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
		self.params = params
434
435         @staticmethod
436         def pmkdir(filename):
437                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
438                 components = filename.split(os.sep)
439                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
440                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
441                 for dir in aggregate:
442                         if not os.path.exists(dir):
443                                 os.mkdir(dir)
444
445         @staticmethod
446         def format_bytes(bytes):
447                 if bytes is None:
448                         return 'N/A'
449                 if type(bytes) is str:
450                         bytes = float(bytes)
451                 if bytes == 0.0:
452                         exponent = 0
453                 else:
454                         exponent = long(math.log(bytes, 1024.0))
455                 suffix = 'bkMGTPEZY'[exponent]
456                 converted = float(bytes) / float(1024**exponent)
457                 return '%.2f%s' % (converted, suffix)
458
459         @staticmethod
460         def calc_percent(byte_counter, data_len):
461                 if data_len is None:
462                         return '---.-%'
463                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
464
465         @staticmethod
466         def calc_eta(start, now, total, current):
467                 if total is None:
468                         return '--:--'
469                 dif = now - start
470                 if current == 0 or dif < 0.001: # One millisecond
471                         return '--:--'
472                 rate = float(current) / dif
473                 eta = long((float(total) - float(current)) / rate)
474                 (eta_mins, eta_secs) = divmod(eta, 60)
475                 if eta_mins > 99:
476                         return '--:--'
477                 return '%02d:%02d' % (eta_mins, eta_secs)
478
479         @staticmethod
480         def calc_speed(start, now, bytes):
481                 dif = now - start
482                 if bytes == 0 or dif < 0.001: # One millisecond
483                         return '%10s' % '---b/s'
484                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
485
486         @staticmethod
487         def best_block_size(elapsed_time, bytes):
488                 new_min = max(bytes / 2.0, 1.0)
489                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
490                 if elapsed_time < 0.001:
491                         return long(new_max)
492                 rate = bytes / elapsed_time
493                 if rate > new_max:
494                         return long(new_max)
495                 if rate < new_min:
496                         return long(new_min)
497                 return long(rate)
498
499         @staticmethod
500         def parse_bytes(bytestr):
501                 """Parse a string indicating a byte quantity into a long integer."""
502                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
503                 if matchobj is None:
504                         return None
505                 number = float(matchobj.group(1))
506                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
507                 return long(round(number * multiplier))
508
	def add_info_extractor(self, ie):
		"""Add an InfoExtractor object to the end of the list."""
		self._ies.append(ie)
		# Mutual registration: the IE also gets a reference back to us.
		ie.set_downloader(self)
513
	def add_post_processor(self, pp):
		"""Add a PostProcessor object to the end of the chain."""
		self._pps.append(pp)
		# Mutual registration: the PP also gets a reference back to us.
		pp.set_downloader(self)
518
	def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode."""
		try:
			if not self.params.get('quiet', False):
				# skip_eol selects an empty terminator so progress lines can
				# be redrawn in place with '\r'.
				terminator = [u'\n', u''][skip_eol]
				print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
			# NOTE(review): flush runs even in quiet mode — presumably
			# intentional so the stream is always drained.
			self._screen_file.flush()
		except (UnicodeEncodeError), err:
			if not ignore_encoding_errors:
				raise
529
	def to_stderr(self, message):
		"""Print message to stderr."""
		print >>sys.stderr, message.encode(preferredencoding())
533
	def to_cons_title(self, message):
		"""Set console/terminal window title to message."""
		if not self.params.get('consoletitle', False):
			return
		if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
			# c_wchar_p() might not be necessary if `message` is
			# already of type unicode()
			ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
		elif 'TERM' in os.environ:
			# xterm escape sequence: OSC 0 ; title BEL
			sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
544
	def fixed_template(self):
		"""Checks if the output template is fixed.

		A template is "fixed" when it contains no %(...)s substitutions,
		meaning every download would write to the same file.
		"""
		return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
548
	def trouble(self, message=None):
		"""Determine action to take when a download problem appears.

		Depending on if the downloader has been configured to ignore
		download errors or not, this method may throw an exception or
		not when errors are found, after printing the message.
		"""
		if message is not None:
			self.to_stderr(message)
		if not self.params.get('ignoreerrors', False):
			raise DownloadError(message)
		# Errors were ignored, but remember that something went wrong for
		# the final process exit code.
		self._download_retcode = 1
561
562         def slow_down(self, start_time, byte_counter):
563                 """Sleep if the download speed is over the rate limit."""
564                 rate_limit = self.params.get('ratelimit', None)
565                 if rate_limit is None or byte_counter == 0:
566                         return
567                 now = time.time()
568                 elapsed = now - start_time
569                 if elapsed <= 0.0:
570                         return
571                 speed = float(byte_counter) / elapsed
572                 if speed > rate_limit:
573                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
574
575         def temp_name(self, filename):
576                 """Returns a temporary filename for the given filename."""
577                 if self.params.get('nopart', False) or filename == u'-' or \
578                                 (os.path.exists(filename) and not os.path.isfile(filename)):
579                         return filename
580                 return filename + u'.part'
581
582         def undo_temp_name(self, filename):
583                 if filename.endswith(u'.part'):
584                         return filename[:-len(u'.part')]
585                 return filename
586
	def try_rename(self, old_filename, new_filename):
		"""Rename old_filename to new_filename, reporting (not raising) failure."""
		try:
			# Renaming a file onto itself is a no-op, not an error.
			if old_filename == new_filename:
				return
			os.rename(old_filename, new_filename)
		except (IOError, OSError), err:
			self.trouble(u'ERROR: unable to rename file')
594         
595         def try_utime(self, filename, last_modified_hdr):
596                 """Try to set the last-modified time of the given file."""
597                 if last_modified_hdr is None:
598                         return
599                 if not os.path.isfile(filename):
600                         return
601                 timestr = last_modified_hdr
602                 if timestr is None:
603                         return
604                 filetime = timeconvert(timestr)
605                 if filetime is None:
606                         return
607                 try:
608                         os.utime(filename,(time.time(), filetime))
609                 except:
610                         pass
611
	def report_writedescription(self, descfn):
		"""Report that the description file is being written."""
		self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
615
	def report_writeinfojson(self, infofn):
		"""Report that the metadata file has been written."""
		self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
619
	def report_destination(self, filename):
		"""Report destination filename."""
		self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
623
	def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
		"""Report download progress on screen and in the console title."""
		if self.params.get('noprogress', False):
			return
		# '\r' + skip_eol redraws the same screen line on each update.
		self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
				(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
		self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
				(percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
632
	def report_resuming_byte(self, resume_len):
		"""Report attempt to resume at given byte."""
		self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
636
	def report_retry(self, count, retries):
		"""Report retry in case of HTTP error 5xx."""
		self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
640
	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		try:
			self.to_screen(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			# Fall back to a message without the (unencodable) filename.
			self.to_screen(u'[download] The file has already been downloaded')
647
	def report_unable_to_resume(self):
		"""Report it was impossible to resume download."""
		self.to_screen(u'[download] Unable to resume')
651
	def report_finish(self):
		"""Report download finished."""
		if self.params.get('noprogress', False):
			self.to_screen(u'[download] Download completed')
		else:
			# With the progress bar active, just terminate its line.
			self.to_screen(u'')
658
659         def increment_downloads(self):
660                 """Increment the ordinal that assigns a number to each file."""
661                 self._num_downloads += 1
662
	def prepare_filename(self, info_dict):
		"""Generate the output filename.

		Expands the user-supplied output template ('outtmpl') with the
		fields of info_dict plus two synthetic fields, and returns the
		resulting filename, or None if the template is erroneous.
		"""
		try:
			template_dict = dict(info_dict)
			# 'epoch' is the current Unix timestamp; 'autonumber' is a
			# zero-padded per-session ordinal of this download.
			template_dict['epoch'] = unicode(long(time.time()))
			template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
			filename = self.params['outtmpl'] % template_dict
			return filename
		except (ValueError, KeyError), err:
			# The template references a missing field or is malformed.
			self.trouble(u'ERROR: invalid system charset or erroneous output template')
			return None
674
	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor.

		In simulate mode only the forced printings are performed.
		Otherwise this creates the target directories, optionally writes
		the .description and .info.json side files, downloads the video
		data and runs the postprocessing chain.
		"""
		filename = self.prepare_filename(info_dict)
		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			# Forced printings
			if self.params.get('forcetitle', False):
				print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forceurl', False):
				print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
				print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcedescription', False) and 'description' in info_dict:
				print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcefilename', False) and filename is not None:
				print filename.encode(preferredencoding(), 'xmlcharrefreplace')

			return

		if filename is None:
			return
		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists and will be skipped')
			return

		try:
			self.pmkdir(filename)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to create directories: %s' % str(err))
			return

		if self.params.get('writedescription', False):
			try:
				descfn = filename + '.description'
				self.report_writedescription(descfn)
				with contextlib.closing(open(descfn, 'wb')) as descfile:
					descfile.write(info_dict['description'].encode('utf-8'))
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write description file: %s' % str(descfn))
				return

		if self.params.get('writeinfojson', False):
			infofn = filename + '.info.json'
			self.report_writeinfojson(infofn)
			try:
				# Probe for a working JSON encoder; the trivialjson
				# fallback used on Python <2.6 has no dump().
				json.dump
			except (NameError,AttributeError):
				self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
				return
			try:
				with contextlib.closing(open(infofn, 'wb')) as infof:
					json.dump(info_dict, infof)
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write metadata to JSON file: %s' % str(infofn))
				return

		try:
			success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
		except (OSError, IOError), err:
			# Filesystem-level failure: the video exists but cannot be saved.
			raise UnavailableVideoError
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self.trouble(u'ERROR: unable to download video data: %s' % str(err))
			return
		except (ContentTooShortError, ), err:
			self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
			return

		if success:
			try:
				self.post_process(filename, info_dict)
			except (PostProcessingError), err:
				self.trouble(u'ERROR: postprocessing: %s' % str(err))
				return
748
749         def download(self, url_list):
750                 """Download a given list of URLs."""
751                 if len(url_list) > 1 and self.fixed_template():
752                         raise SameFileError(self.params['outtmpl'])
753
754                 for url in url_list:
755                         suitable_found = False
756                         for ie in self._ies:
757                                 # Go to next InfoExtractor if not suitable
758                                 if not ie.suitable(url):
759                                         continue
760
761                                 # Suitable InfoExtractor found
762                                 suitable_found = True
763
764                                 # Extract information from URL and process it
765                                 ie.extract(url)
766
767                                 # Suitable InfoExtractor had been found; go to next URL
768                                 break
769
770                         if not suitable_found:
771                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
772
773                 return self._download_retcode
774
775         def post_process(self, filename, ie_info):
776                 """Run the postprocessing chain on the given file."""
777                 info = dict(ie_info)
778                 info['filepath'] = filename
779                 for pp in self._pps:
780                         info = pp.run(info)
781                         if info is None:
782                                 break
783
	def _download_with_rtmpdump(self, filename, url, player_url):
		"""Download an RTMP stream by shelling out to the rtmpdump binary.

		Returns True on success, False on failure. Progress is reported
		as the size of the temporary file grows.
		"""
		self.report_destination(filename)
		tmpfilename = self.temp_name(filename)

		# Check for rtmpdump first
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrumpted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		# NOTE: the [[], opts][bool] idiom selects opts only when the
		# condition holds (a pre-ternary conditional).
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(tmpfilename)
			self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			# Re-invoke rtmpdump with '-e' to resume the partial file.
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(tmpfilename)
			if prevsize == cursize and retval == 1:
				# No progress was made during this resume attempt; give up.
				break
		if retval == 0:
			self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
			self.try_rename(tmpfilename, filename)
			return True
		else:
			self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
			return False
815
816         def _do_download(self, filename, url, player_url):
817                 # Check file already present
818                 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
819                         self.report_file_already_downloaded(filename)
820                         return True
821
822                 # Attempt to download using rtmpdump
823                 if url.startswith('rtmp'):
824                         return self._download_with_rtmpdump(filename, url, player_url)
825
826                 tmpfilename = self.temp_name(filename)
827                 stream = None
828                 open_mode = 'wb'
829
830                 # Do not include the Accept-Encoding header
831                 headers = {'Youtubedl-no-compression': 'True'}
832                 basic_request = urllib2.Request(url, None, headers)
833                 request = urllib2.Request(url, None, headers)
834
835                 # Establish possible resume length
836                 if os.path.isfile(tmpfilename):
837                         resume_len = os.path.getsize(tmpfilename)
838                 else:
839                         resume_len = 0
840
841                 # Request parameters in case of being able to resume
842                 if self.params.get('continuedl', False) and resume_len != 0:
843                         self.report_resuming_byte(resume_len)
844                         request.add_header('Range','bytes=%d-' % resume_len)
845                         open_mode = 'ab'
846
847                 count = 0
848                 retries = self.params.get('retries', 0)
849                 while count <= retries:
850                         # Establish connection
851                         try:
852                                 data = urllib2.urlopen(request)
853                                 break
854                         except (urllib2.HTTPError, ), err:
855                                 if (err.code < 500 or err.code >= 600) and err.code != 416:
856                                         # Unexpected HTTP error
857                                         raise
858                                 elif err.code == 416:
859                                         # Unable to resume (requested range not satisfiable)
860                                         try:
861                                                 # Open the connection again without the range header
862                                                 data = urllib2.urlopen(basic_request)
863                                                 content_length = data.info()['Content-Length']
864                                         except (urllib2.HTTPError, ), err:
865                                                 if err.code < 500 or err.code >= 600:
866                                                         raise
867                                         else:
868                                                 # Examine the reported length
869                                                 if (content_length is not None and
870                                                     (resume_len - 100 < long(content_length) < resume_len + 100)):
871                                                         # The file had already been fully downloaded.
872                                                         # Explanation to the above condition: in issue #175 it was revealed that
873                                                         # YouTube sometimes adds or removes a few bytes from the end of the file,
874                                                         # changing the file size slightly and causing problems for some users. So
875                                                         # I decided to implement a suggested change and consider the file
876                                                         # completely downloaded if the file size differs less than 100 bytes from
877                                                         # the one in the hard drive.
878                                                         self.report_file_already_downloaded(filename)
879                                                         self.try_rename(tmpfilename, filename)
880                                                         return True
881                                                 else:
882                                                         # The length does not match, we start the download over
883                                                         self.report_unable_to_resume()
884                                                         open_mode = 'wb'
885                                                         break
886                         # Retry
887                         count += 1
888                         if count <= retries:
889                                 self.report_retry(count, retries)
890
891                 if count > retries:
892                         self.trouble(u'ERROR: giving up after %s retries' % retries)
893                         return False
894
895                 data_len = data.info().get('Content-length', None)
896                 if data_len is not None:
897                         data_len = long(data_len) + resume_len
898                 data_len_str = self.format_bytes(data_len)
899                 byte_counter = 0 + resume_len
900                 block_size = 1024
901                 start = time.time()
902                 while True:
903                         # Download and write
904                         before = time.time()
905                         data_block = data.read(block_size)
906                         after = time.time()
907                         if len(data_block) == 0:
908                                 break
909                         byte_counter += len(data_block)
910
911                         # Open file just in time
912                         if stream is None:
913                                 try:
914                                         (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
915                                         filename = self.undo_temp_name(tmpfilename)
916                                         self.report_destination(filename)
917                                 except (OSError, IOError), err:
918                                         self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
919                                         return False
920                         try:
921                                 stream.write(data_block)
922                         except (IOError, OSError), err:
923                                 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
924                                 return False
925                         block_size = self.best_block_size(after - before, len(data_block))
926
927                         # Progress message
928                         percent_str = self.calc_percent(byte_counter, data_len)
929                         eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
930                         speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
931                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
932
933                         # Apply rate limit
934                         self.slow_down(start, byte_counter - resume_len)
935
936                 stream.close()
937                 self.report_finish()
938                 if data_len is not None and byte_counter != data_len:
939                         raise ContentTooShortError(byte_counter, long(data_len))
940                 self.try_rename(tmpfilename, filename)
941
942                 # Update file modification time
943                 if self.params.get('updatetime', True):
944                         self.try_utime(filename, data.info().get('last-modified', None))
945
946                 return True
947
class InfoExtractor(object):
	"""Information Extractor class.

	An information extractor takes a URL and pulls out the data needed
	to download the video (or videos) it refers to: the real video URL,
	the title, a simplified title, the uploader and so on. The result
	is a dictionary that is handed to the FileDownloader, which may
	then download the video file, print information, etc. Each
	dictionary must contain the following fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	The following fields are optional and mainly support using
	youtube-dl as the backend of a video search function (such as the
	one in youtube2mp3); they are only read by the corresponding forced
	printing options:

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Subclasses should override _real_initialize(), _real_extract() and
	the suitable() static method, and are normally instantiated and
	registered with the main downloader.
	"""

	# Set to True by initialize() after the one-time setup has run.
	_ready = False
	_downloader = None

	def __init__(self, downloader=None):
		"""Create the extractor, optionally attaching a downloader."""
		self._ready = False
		self.set_downloader(downloader)

	@staticmethod
	def suitable(url):
		"""Return True if this extractor can handle the given URL."""
		return False

	def initialize(self):
		"""Perform one-time setup (authentication, etc); idempotent."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extract URL information and return it as a list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Attach the downloader this extractor reports to."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
1018
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Matches watch pages, embeds, youtu.be short links and bare video ids;
	# group 2 captures the video id.
	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
	# Visiting this URL switches the site language to English so pages can
	# be scraped reliably.
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	# Form posted to confirm age for age-restricted videos.
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	# Machine name looked up in ~/.netrc for stored credentials.
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
	# Maps itag format codes to file extensions; formats missing here
	# default to flv elsewhere.
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'45': 'webm',
	}
1039
1040         @staticmethod
1041         def suitable(url):
1042                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
1043
1044         def report_lang(self):
1045                 """Report attempt to set language."""
1046                 self._downloader.to_screen(u'[youtube] Setting language')
1047
1048         def report_login(self):
1049                 """Report attempt to log in."""
1050                 self._downloader.to_screen(u'[youtube] Logging in')
1051
1052         def report_age_confirmation(self):
1053                 """Report attempt to confirm age."""
1054                 self._downloader.to_screen(u'[youtube] Confirming age')
1055
1056         def report_video_webpage_download(self, video_id):
1057                 """Report attempt to download video webpage."""
1058                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1059
1060         def report_video_info_webpage_download(self, video_id):
1061                 """Report attempt to download video info webpage."""
1062                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1063
1064         def report_information_extraction(self, video_id):
1065                 """Report attempt to extract video information."""
1066                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1067
1068         def report_unavailable_format(self, video_id, format):
1069                 """Report extracted video URL."""
1070                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1071
1072         def report_rtmp_download(self):
1073                 """Indicate the download will use the RTMP protocol."""
1074                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1075
	def _real_initialize(self):
		"""Prepare the extractor: set site language and optionally log in.

		Reads credentials from the downloader parameters or from
		~/.netrc, switches youtube.com to English, logs in when
		credentials are available and confirms age. Most failures are
		reported as warnings only; age confirmation failure is an error.
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':         '/',
				'action_login': 'Log In',
				'username':     username,
				'password':     password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the login form is still present, the credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':             '/',
				'action_confirm':       'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return
1144
1145         def _real_extract(self, url):
1146                 # Extract video id from URL
1147                 mobj = re.match(self._VALID_URL, url)
1148                 if mobj is None:
1149                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1150                         return
1151                 video_id = mobj.group(2)
1152
1153                 # Get video webpage
1154                 self.report_video_webpage_download(video_id)
1155                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id)
1156                 try:
1157                         video_webpage = urllib2.urlopen(request).read()
1158                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1159                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1160                         return
1161
1162                 # Attempt to extract SWF player URL
1163                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1164                 if mobj is not None:
1165                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1166                 else:
1167                         player_url = None
1168
1169                 # Get video info
1170                 self.report_video_info_webpage_download(video_id)
1171                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1172                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1173                                            % (video_id, el_type))
1174                         request = urllib2.Request(video_info_url)
1175                         try:
1176                                 video_info_webpage = urllib2.urlopen(request).read()
1177                                 video_info = parse_qs(video_info_webpage)
1178                                 if 'token' in video_info:
1179                                         break
1180                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1181                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1182                                 return
1183                 if 'token' not in video_info:
1184                         if 'reason' in video_info:
1185                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1186                         else:
1187                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1188                         return
1189
1190                 # Start extracting information
1191                 self.report_information_extraction(video_id)
1192
1193                 # uploader
1194                 if 'author' not in video_info:
1195                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1196                         return
1197                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1198
1199                 # title
1200                 if 'title' not in video_info:
1201                         self._downloader.trouble(u'ERROR: unable to extract video title')
1202                         return
1203                 video_title = urllib.unquote_plus(video_info['title'][0])
1204                 video_title = video_title.decode('utf-8')
1205                 video_title = sanitize_title(video_title)
1206
1207                 # simplified title
1208                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1209                 simple_title = simple_title.strip(ur'_')
1210
1211                 # thumbnail image
1212                 if 'thumbnail_url' not in video_info:
1213                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1214                         video_thumbnail = ''
1215                 else:   # don't panic if we can't find it
1216                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1217
1218                 # upload date
1219                 upload_date = u'NA'
1220                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1221                 if mobj is not None:
1222                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1223                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1224                         for expression in format_expressions:
1225                                 try:
1226                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1227                                 except:
1228                                         pass
1229
1230                 # description
1231                 try:
1232                         lxml.etree
1233                 except NameError:
1234                         video_description = u'No description available.'
1235                         if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1236                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1237                                 if mobj is not None:
1238                                         video_description = mobj.group(1).decode('utf-8')
1239                 else:
1240                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1241                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1242                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1243                         # TODO use another parser
1244
1245                 # token
1246                 video_token = urllib.unquote_plus(video_info['token'][0])
1247
1248                 # Decide which formats to download
1249                 req_format = self._downloader.params.get('format', None)
1250
1251                 if 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1252                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1253                         url_data = [dict(pairStr.split('=') for pairStr in uds.split('&')) for uds in url_data_strs]
1254                         url_map = dict((ud['itag'], urllib.unquote(ud['url'])) for ud in url_data)
1255                         
1256                         format_limit = self._downloader.params.get('format_limit', None)
1257                         if format_limit is not None and format_limit in self._available_formats:
1258                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1259                         else:
1260                                 format_list = self._available_formats
1261                         existing_formats = [x for x in format_list if x in url_map]
1262                         if len(existing_formats) == 0:
1263                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1264                                 return
1265                         if req_format is None:
1266                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1267                         elif req_format == '-1':
1268                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1269                         else:
1270                                 # Specific format
1271                                 if req_format not in url_map:
1272                                         self._downloader.trouble(u'ERROR: requested format not available')
1273                                         return
1274                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
1275
1276                 elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1277                         self.report_rtmp_download()
1278                         video_url_list = [(None, video_info['conn'][0])]
1279
1280                 else:
1281                         self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
1282                         return
1283
1284                 for format_param, video_real_url in video_url_list:
1285                         # At this point we have a new video
1286                         self._downloader.increment_downloads()
1287
1288                         # Extension
1289                         video_extension = self._video_extensions.get(format_param, 'flv')
1290
1291                         # Find the video URL in fmt_url_map or conn paramters
1292                         try:
1293                                 # Process video information
1294                                 self._downloader.process_info({
1295                                         'id':           video_id.decode('utf-8'),
1296                                         'url':          video_real_url.decode('utf-8'),
1297                                         'uploader':     video_uploader.decode('utf-8'),
1298                                         'upload_date':  upload_date,
1299                                         'title':        video_title,
1300                                         'stitle':       simple_title,
1301                                         'ext':          video_extension.decode('utf-8'),
1302                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1303                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1304                                         'description':  video_description,
1305                                         'player_url':   player_url,
1306                                 })
1307                         except UnavailableVideoError, err:
1308                                 self._downloader.trouble(u'\nERROR: unable to download video')
1309
1310
1311 class MetacafeIE(InfoExtractor):
1312         """Information Extractor for metacafe.com."""
1313
1314         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1315         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1316         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1317         _youtube_ie = None
1318
1319         def __init__(self, youtube_ie, downloader=None):
1320                 InfoExtractor.__init__(self, downloader)
1321                 self._youtube_ie = youtube_ie
1322
1323         @staticmethod
1324         def suitable(url):
1325                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1326
1327         def report_disclaimer(self):
1328                 """Report disclaimer retrieval."""
1329                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1330
1331         def report_age_confirmation(self):
1332                 """Report attempt to confirm age."""
1333                 self._downloader.to_screen(u'[metacafe] Confirming age')
1334
1335         def report_download_webpage(self, video_id):
1336                 """Report webpage download."""
1337                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1338
1339         def report_extraction(self, video_id):
1340                 """Report information extraction."""
1341                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1342
1343         def _real_initialize(self):
1344                 # Retrieve disclaimer
1345                 request = urllib2.Request(self._DISCLAIMER)
1346                 try:
1347                         self.report_disclaimer()
1348                         disclaimer = urllib2.urlopen(request).read()
1349                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1350                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1351                         return
1352
1353                 # Confirm age
1354                 disclaimer_form = {
1355                         'filters': '0',
1356                         'submit': "Continue - I'm over 18",
1357                         }
1358                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1359                 try:
1360                         self.report_age_confirmation()
1361                         disclaimer = urllib2.urlopen(request).read()
1362                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1363                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1364                         return
1365
1366         def _real_extract(self, url):
1367                 # Extract id and simplified title from URL
1368                 mobj = re.match(self._VALID_URL, url)
1369                 if mobj is None:
1370                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1371                         return
1372
1373                 video_id = mobj.group(1)
1374
1375                 # Check if video comes from YouTube
1376                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1377                 if mobj2 is not None:
1378                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1379                         return
1380
1381                 # At this point we have a new video
1382                 self._downloader.increment_downloads()
1383
1384                 simple_title = mobj.group(2).decode('utf-8')
1385
1386                 # Retrieve video webpage to extract further information
1387                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1388                 try:
1389                         self.report_download_webpage(video_id)
1390                         webpage = urllib2.urlopen(request).read()
1391                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1392                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1393                         return
1394
1395                 # Extract URL, uploader and title from webpage
1396                 self.report_extraction(video_id)
1397                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1398                 if mobj is not None:
1399                         mediaURL = urllib.unquote(mobj.group(1))
1400                         video_extension = mediaURL[-3:]
1401
1402                         # Extract gdaKey if available
1403                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1404                         if mobj is None:
1405                                 video_url = mediaURL
1406                         else:
1407                                 gdaKey = mobj.group(1)
1408                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1409                 else:
1410                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1411                         if mobj is None:
1412                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1413                                 return
1414                         vardict = parse_qs(mobj.group(1))
1415                         if 'mediaData' not in vardict:
1416                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1417                                 return
1418                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1419                         if mobj is None:
1420                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1421                                 return
1422                         mediaURL = mobj.group(1).replace('\\/', '/')
1423                         video_extension = mediaURL[-3:]
1424                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1425
1426                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1427                 if mobj is None:
1428                         self._downloader.trouble(u'ERROR: unable to extract title')
1429                         return
1430                 video_title = mobj.group(1).decode('utf-8')
1431                 video_title = sanitize_title(video_title)
1432
1433                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1434                 if mobj is None:
1435                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1436                         return
1437                 video_uploader = mobj.group(1)
1438
1439                 try:
1440                         # Process video information
1441                         self._downloader.process_info({
1442                                 'id':           video_id.decode('utf-8'),
1443                                 'url':          video_url.decode('utf-8'),
1444                                 'uploader':     video_uploader.decode('utf-8'),
1445                                 'upload_date':  u'NA',
1446                                 'title':        video_title,
1447                                 'stitle':       simple_title,
1448                                 'ext':          video_extension.decode('utf-8'),
1449                                 'format':       u'NA',
1450                                 'player_url':   None,
1451                         })
1452                 except UnavailableVideoError:
1453                         self._downloader.trouble(u'\nERROR: unable to download video')
1454
1455
1456 class DailymotionIE(InfoExtractor):
1457         """Information Extractor for Dailymotion"""
1458
1459         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1460
1461         def __init__(self, downloader=None):
1462                 InfoExtractor.__init__(self, downloader)
1463
1464         @staticmethod
1465         def suitable(url):
1466                 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1467
1468         def report_download_webpage(self, video_id):
1469                 """Report webpage download."""
1470                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1471
1472         def report_extraction(self, video_id):
1473                 """Report information extraction."""
1474                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1475
1476         def _real_initialize(self):
1477                 return
1478
1479         def _real_extract(self, url):
1480                 # Extract id and simplified title from URL
1481                 mobj = re.match(self._VALID_URL, url)
1482                 if mobj is None:
1483                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1484                         return
1485
1486                 # At this point we have a new video
1487                 self._downloader.increment_downloads()
1488                 video_id = mobj.group(1)
1489
1490                 simple_title = mobj.group(2).decode('utf-8')
1491                 video_extension = 'flv'
1492
1493                 # Retrieve video webpage to extract further information
1494                 request = urllib2.Request(url)
1495                 try:
1496                         self.report_download_webpage(video_id)
1497                         webpage = urllib2.urlopen(request).read()
1498                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1499                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1500                         return
1501
1502                 # Extract URL, uploader and title from webpage
1503                 self.report_extraction(video_id)
1504                 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1505                 if mobj is None:
1506                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1507                         return
1508                 mediaURL = urllib.unquote(mobj.group(1))
1509
1510                 # if needed add http://www.dailymotion.com/ if relative URL
1511
1512                 video_url = mediaURL
1513
1514                 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1515                 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1516                 if mobj is None:
1517                         self._downloader.trouble(u'ERROR: unable to extract title')
1518                         return
1519                 video_title = mobj.group(1).decode('utf-8')
1520                 video_title = sanitize_title(video_title)
1521
1522                 mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
1523                 if mobj is None:
1524                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1525                         return
1526                 video_uploader = mobj.group(1)
1527
1528                 try:
1529                         # Process video information
1530                         self._downloader.process_info({
1531                                 'id':           video_id.decode('utf-8'),
1532                                 'url':          video_url.decode('utf-8'),
1533                                 'uploader':     video_uploader.decode('utf-8'),
1534                                 'upload_date':  u'NA',
1535                                 'title':        video_title,
1536                                 'stitle':       simple_title,
1537                                 'ext':          video_extension.decode('utf-8'),
1538                                 'format':       u'NA',
1539                                 'player_url':   None,
1540                         })
1541                 except UnavailableVideoError:
1542                         self._downloader.trouble(u'\nERROR: unable to download video')
1543
class GoogleIE(InfoExtractor):
	"""Information extractor for video.google.com."""

	_VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	@staticmethod
	def suitable(url):
		# True when the URL is a Google Video /videoplay page (any of the
		# national domains matched by _VALID_URL).
		return (re.match(GoogleIE._VALID_URL, url) is not None)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

	def _real_initialize(self):
		# Google Video needs no login or cookie setup.
		return

	def _real_extract(self, url):
		"""Scrape the video page for the media URL, title and description,
		then pass the collected information to the downloader."""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		# Default: mp4 download link; downgraded to flv below if absent.
		video_extension = 'mp4'

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r"download_url:'([^']+)'", webpage)
		if mobj is None:
			# No mp4 download link; fall back to the embedded flv stream
			# URL, which appears with \xNN-escaped '=' and '&' characters.
			video_extension = 'flv'
			mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1))
		# Undo the JavaScript hex escapes: '\x3d' -> '=', '\x26' -> '&'.
		mediaURL = mediaURL.replace('\\x3d', '\x3d')
		mediaURL = mediaURL.replace('\\x26', '\x26')

		video_url = mediaURL

		mobj = re.search(r'<title>(.*)</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		# Filesystem-friendly title: collapse runs of non-alphanumerics to '_'.
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

		# Extract video description
		mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video description')
			return
		video_description = mobj.group(1).decode('utf-8')
		if not video_description:
			video_description = 'No description available.'

		# Extract video thumbnail
		if self._downloader.params.get('forcethumbnail', False):
			# The thumbnail only appears on a search-results page, so an
			# extra request is issued when the caller asks for it.
			request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
			try:
				webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
				return
			mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
				return
			video_thumbnail = mobj.group(1)
		else:	# we need something to pass to process_info
			video_thumbnail = ''

		# NOTE(review): video_description and video_thumbnail are extracted
		# above but not included in the process_info dict below — confirm
		# whether 'description'/'thumbnail' keys were meant to be passed.
		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	u'NA',
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1653
1654
1655 class PhotobucketIE(InfoExtractor):
1656         """Information extractor for photobucket.com."""
1657
1658         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1659
1660         def __init__(self, downloader=None):
1661                 InfoExtractor.__init__(self, downloader)
1662
1663         @staticmethod
1664         def suitable(url):
1665                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1666
1667         def report_download_webpage(self, video_id):
1668                 """Report webpage download."""
1669                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1670
1671         def report_extraction(self, video_id):
1672                 """Report information extraction."""
1673                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1674
1675         def _real_initialize(self):
1676                 return
1677
1678         def _real_extract(self, url):
1679                 # Extract id from URL
1680                 mobj = re.match(self._VALID_URL, url)
1681                 if mobj is None:
1682                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1683                         return
1684
1685                 # At this point we have a new video
1686                 self._downloader.increment_downloads()
1687                 video_id = mobj.group(1)
1688
1689                 video_extension = 'flv'
1690
1691                 # Retrieve video webpage to extract further information
1692                 request = urllib2.Request(url)
1693                 try:
1694                         self.report_download_webpage(video_id)
1695                         webpage = urllib2.urlopen(request).read()
1696                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1697                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1698                         return
1699
1700                 # Extract URL, uploader, and title from webpage
1701                 self.report_extraction(video_id)
1702                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1703                 if mobj is None:
1704                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1705                         return
1706                 mediaURL = urllib.unquote(mobj.group(1))
1707
1708                 video_url = mediaURL
1709
1710                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1711                 if mobj is None:
1712                         self._downloader.trouble(u'ERROR: unable to extract title')
1713                         return
1714                 video_title = mobj.group(1).decode('utf-8')
1715                 video_title = sanitize_title(video_title)
1716                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1717
1718                 video_uploader = mobj.group(2).decode('utf-8')
1719
1720                 try:
1721                         # Process video information
1722                         self._downloader.process_info({
1723                                 'id':           video_id.decode('utf-8'),
1724                                 'url':          video_url.decode('utf-8'),
1725                                 'uploader':     video_uploader,
1726                                 'upload_date':  u'NA',
1727                                 'title':        video_title,
1728                                 'stitle':       simple_title,
1729                                 'ext':          video_extension.decode('utf-8'),
1730                                 'format':       u'NA',
1731                                 'player_url':   None,
1732                         })
1733                 except UnavailableVideoError:
1734                         self._downloader.trouble(u'\nERROR: unable to download video')
1735
1736
1737 class YahooIE(InfoExtractor):
1738         """Information extractor for video.yahoo.com."""
1739
1740         # _VALID_URL matches all Yahoo! Video URLs
1741         # _VPAGE_URL matches only the extractable '/watch/' URLs
1742         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1743         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1744
1745         def __init__(self, downloader=None):
1746                 InfoExtractor.__init__(self, downloader)
1747
1748         @staticmethod
1749         def suitable(url):
1750                 return (re.match(YahooIE._VALID_URL, url) is not None)
1751
1752         def report_download_webpage(self, video_id):
1753                 """Report webpage download."""
1754                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1755
1756         def report_extraction(self, video_id):
1757                 """Report information extraction."""
1758                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1759
1760         def _real_initialize(self):
1761                 return
1762
1763         def _real_extract(self, url, new_video=True):
1764                 # Extract ID from URL
1765                 mobj = re.match(self._VALID_URL, url)
1766                 if mobj is None:
1767                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1768                         return
1769
1770                 # At this point we have a new video
1771                 self._downloader.increment_downloads()
1772                 video_id = mobj.group(2)
1773                 video_extension = 'flv'
1774
1775                 # Rewrite valid but non-extractable URLs as
1776                 # extractable English language /watch/ URLs
1777                 if re.match(self._VPAGE_URL, url) is None:
1778                         request = urllib2.Request(url)
1779                         try:
1780                                 webpage = urllib2.urlopen(request).read()
1781                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1782                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1783                                 return
1784
1785                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1786                         if mobj is None:
1787                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1788                                 return
1789                         yahoo_id = mobj.group(1)
1790
1791                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1792                         if mobj is None:
1793                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1794                                 return
1795                         yahoo_vid = mobj.group(1)
1796
1797                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1798                         return self._real_extract(url, new_video=False)
1799
1800                 # Retrieve video webpage to extract further information
1801                 request = urllib2.Request(url)
1802                 try:
1803                         self.report_download_webpage(video_id)
1804                         webpage = urllib2.urlopen(request).read()
1805                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1806                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1807                         return
1808
1809                 # Extract uploader and title from webpage
1810                 self.report_extraction(video_id)
1811                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1812                 if mobj is None:
1813                         self._downloader.trouble(u'ERROR: unable to extract video title')
1814                         return
1815                 video_title = mobj.group(1).decode('utf-8')
1816                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1817
1818                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1819                 if mobj is None:
1820                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1821                         return
1822                 video_uploader = mobj.group(1).decode('utf-8')
1823
1824                 # Extract video thumbnail
1825                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1826                 if mobj is None:
1827                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1828                         return
1829                 video_thumbnail = mobj.group(1).decode('utf-8')
1830
1831                 # Extract video description
1832                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1833                 if mobj is None:
1834                         self._downloader.trouble(u'ERROR: unable to extract video description')
1835                         return
1836                 video_description = mobj.group(1).decode('utf-8')
1837                 if not video_description: video_description = 'No description available.'
1838
1839                 # Extract video height and width
1840                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1841                 if mobj is None:
1842                         self._downloader.trouble(u'ERROR: unable to extract video height')
1843                         return
1844                 yv_video_height = mobj.group(1)
1845
1846                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1847                 if mobj is None:
1848                         self._downloader.trouble(u'ERROR: unable to extract video width')
1849                         return
1850                 yv_video_width = mobj.group(1)
1851
1852                 # Retrieve video playlist to extract media URL
1853                 # I'm not completely sure what all these options are, but we
1854                 # seem to need most of them, otherwise the server sends a 401.
1855                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1856                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1857                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1858                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1859                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1860                 try:
1861                         self.report_download_webpage(video_id)
1862                         webpage = urllib2.urlopen(request).read()
1863                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1864                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1865                         return
1866
1867                 # Extract media URL from playlist XML
1868                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1869                 if mobj is None:
1870                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1871                         return
1872                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1873                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1874
1875                 try:
1876                         # Process video information
1877                         self._downloader.process_info({
1878                                 'id':           video_id.decode('utf-8'),
1879                                 'url':          video_url,
1880                                 'uploader':     video_uploader,
1881                                 'upload_date':  u'NA',
1882                                 'title':        video_title,
1883                                 'stitle':       simple_title,
1884                                 'ext':          video_extension.decode('utf-8'),
1885                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1886                                 'description':  video_description,
1887                                 'thumbnail':    video_thumbnail,
1888                                 'description':  video_description,
1889                                 'player_url':   None,
1890                         })
1891                 except UnavailableVideoError:
1892                         self._downloader.trouble(u'\nERROR: unable to download video')
1893
1894
1895 class GenericIE(InfoExtractor):
1896         """Generic last-resort information extractor."""
1897
1898         def __init__(self, downloader=None):
1899                 InfoExtractor.__init__(self, downloader)
1900
1901         @staticmethod
1902         def suitable(url):
1903                 return True
1904
1905         def report_download_webpage(self, video_id):
1906                 """Report webpage download."""
1907                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1908                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1909
1910         def report_extraction(self, video_id):
1911                 """Report information extraction."""
1912                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1913
1914         def _real_initialize(self):
1915                 return
1916
1917         def _real_extract(self, url):
1918                 # At this point we have a new video
1919                 self._downloader.increment_downloads()
1920
1921                 video_id = url.split('/')[-1]
1922                 request = urllib2.Request(url)
1923                 try:
1924                         self.report_download_webpage(video_id)
1925                         webpage = urllib2.urlopen(request).read()
1926                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1927                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1928                         return
1929                 except ValueError, err:
1930                         # since this is the last-resort InfoExtractor, if
1931                         # this error is thrown, it'll be thrown here
1932                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1933                         return
1934
1935                 self.report_extraction(video_id)
1936                 # Start with something easy: JW Player in SWFObject
1937                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1938                 if mobj is None:
1939                         # Broaden the search a little bit
1940                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1941                 if mobj is None:
1942                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1943                         return
1944
1945                 # It's possible that one of the regexes
1946                 # matched, but returned an empty group:
1947                 if mobj.group(1) is None:
1948                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1949                         return
1950
1951                 video_url = urllib.unquote(mobj.group(1))
1952                 video_id  = os.path.basename(video_url)
1953
1954                 # here's a fun little line of code for you:
1955                 video_extension = os.path.splitext(video_id)[1][1:]
1956                 video_id        = os.path.splitext(video_id)[0]
1957
1958                 # it's tempting to parse this further, but you would
1959                 # have to take into account all the variations like
1960                 #   Video Title - Site Name
1961                 #   Site Name | Video Title
1962                 #   Video Title - Tagline | Site Name
1963                 # and so on and so forth; it's just not practical
1964                 mobj = re.search(r'<title>(.*)</title>', webpage)
1965                 if mobj is None:
1966                         self._downloader.trouble(u'ERROR: unable to extract title')
1967                         return
1968                 video_title = mobj.group(1).decode('utf-8')
1969                 video_title = sanitize_title(video_title)
1970                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1971
1972                 # video uploader is domain name
1973                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1974                 if mobj is None:
1975                         self._downloader.trouble(u'ERROR: unable to extract title')
1976                         return
1977                 video_uploader = mobj.group(1).decode('utf-8')
1978
1979                 try:
1980                         # Process video information
1981                         self._downloader.process_info({
1982                                 'id':           video_id.decode('utf-8'),
1983                                 'url':          video_url.decode('utf-8'),
1984                                 'uploader':     video_uploader,
1985                                 'upload_date':  u'NA',
1986                                 'title':        video_title,
1987                                 'stitle':       simple_title,
1988                                 'ext':          video_extension.decode('utf-8'),
1989                                 'format':       u'NA',
1990                                 'player_url':   None,
1991                         })
1992                 except UnavailableVideoError, err:
1993                         self._downloader.trouble(u'\nERROR: unable to download video')
1994
1995
1996 class YoutubeSearchIE(InfoExtractor):
1997         """Information Extractor for YouTube search queries."""
1998         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1999         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2000         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2001         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2002         _youtube_ie = None
2003         _max_youtube_results = 1000
2004
2005         def __init__(self, youtube_ie, downloader=None):
2006                 InfoExtractor.__init__(self, downloader)
2007                 self._youtube_ie = youtube_ie
2008
2009         @staticmethod
2010         def suitable(url):
2011                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
2012
2013         def report_download_page(self, query, pagenum):
2014                 """Report attempt to download playlist page with given number."""
2015                 query = query.decode(preferredencoding())
2016                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2017
2018         def _real_initialize(self):
2019                 self._youtube_ie.initialize()
2020
2021         def _real_extract(self, query):
2022                 mobj = re.match(self._VALID_QUERY, query)
2023                 if mobj is None:
2024                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2025                         return
2026
2027                 prefix, query = query.split(':')
2028                 prefix = prefix[8:]
2029                 query  = query.encode('utf-8')
2030                 if prefix == '':
2031                         self._download_n_results(query, 1)
2032                         return
2033                 elif prefix == 'all':
2034                         self._download_n_results(query, self._max_youtube_results)
2035                         return
2036                 else:
2037                         try:
2038                                 n = long(prefix)
2039                                 if n <= 0:
2040                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2041                                         return
2042                                 elif n > self._max_youtube_results:
2043                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
2044                                         n = self._max_youtube_results
2045                                 self._download_n_results(query, n)
2046                                 return
2047                         except ValueError: # parsing prefix as integer fails
2048                                 self._download_n_results(query, 1)
2049                                 return
2050
2051         def _download_n_results(self, query, n):
2052                 """Downloads a specified number of results for a query"""
2053
2054                 video_ids = []
2055                 already_seen = set()
2056                 pagenum = 1
2057
2058                 while True:
2059                         self.report_download_page(query, pagenum)
2060                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2061                         request = urllib2.Request(result_url)
2062                         try:
2063                                 page = urllib2.urlopen(request).read()
2064                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2065                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2066                                 return
2067
2068                         # Extract video identifiers
2069                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2070                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2071                                 if video_id not in already_seen:
2072                                         video_ids.append(video_id)
2073                                         already_seen.add(video_id)
2074                                         if len(video_ids) == n:
2075                                                 # Specified n videos reached
2076                                                 for id in video_ids:
2077                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2078                                                 return
2079
2080                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2081                                 for id in video_ids:
2082                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2083                                 return
2084
2085                         pagenum = pagenum + 1
2086
2087 class GoogleSearchIE(InfoExtractor):
2088         """Information Extractor for Google Video search queries."""
2089         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
2090         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2091         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2092         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2093         _google_ie = None
2094         _max_google_results = 1000
2095
2096         def __init__(self, google_ie, downloader=None):
2097                 InfoExtractor.__init__(self, downloader)
2098                 self._google_ie = google_ie
2099
2100         @staticmethod
2101         def suitable(url):
2102                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
2103
2104         def report_download_page(self, query, pagenum):
2105                 """Report attempt to download playlist page with given number."""
2106                 query = query.decode(preferredencoding())
2107                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2108
2109         def _real_initialize(self):
2110                 self._google_ie.initialize()
2111
2112         def _real_extract(self, query):
2113                 mobj = re.match(self._VALID_QUERY, query)
2114                 if mobj is None:
2115                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2116                         return
2117
2118                 prefix, query = query.split(':')
2119                 prefix = prefix[8:]
2120                 query  = query.encode('utf-8')
2121                 if prefix == '':
2122                         self._download_n_results(query, 1)
2123                         return
2124                 elif prefix == 'all':
2125                         self._download_n_results(query, self._max_google_results)
2126                         return
2127                 else:
2128                         try:
2129                                 n = long(prefix)
2130                                 if n <= 0:
2131                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2132                                         return
2133                                 elif n > self._max_google_results:
2134                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
2135                                         n = self._max_google_results
2136                                 self._download_n_results(query, n)
2137                                 return
2138                         except ValueError: # parsing prefix as integer fails
2139                                 self._download_n_results(query, 1)
2140                                 return
2141
2142         def _download_n_results(self, query, n):
2143                 """Downloads a specified number of results for a query"""
2144
2145                 video_ids = []
2146                 already_seen = set()
2147                 pagenum = 1
2148
2149                 while True:
2150                         self.report_download_page(query, pagenum)
2151                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2152                         request = urllib2.Request(result_url)
2153                         try:
2154                                 page = urllib2.urlopen(request).read()
2155                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2156                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2157                                 return
2158
2159                         # Extract video identifiers
2160                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2161                                 video_id = mobj.group(1)
2162                                 if video_id not in already_seen:
2163                                         video_ids.append(video_id)
2164                                         already_seen.add(video_id)
2165                                         if len(video_ids) == n:
2166                                                 # Specified n videos reached
2167                                                 for id in video_ids:
2168                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2169                                                 return
2170
2171                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2172                                 for id in video_ids:
2173                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2174                                 return
2175
2176                         pagenum = pagenum + 1
2177
2178 class YahooSearchIE(InfoExtractor):
2179         """Information Extractor for Yahoo! Video search queries."""
2180         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
2181         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2182         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2183         _MORE_PAGES_INDICATOR = r'\s*Next'
2184         _yahoo_ie = None
2185         _max_yahoo_results = 1000
2186
2187         def __init__(self, yahoo_ie, downloader=None):
2188                 InfoExtractor.__init__(self, downloader)
2189                 self._yahoo_ie = yahoo_ie
2190
2191         @staticmethod
2192         def suitable(url):
2193                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2194
2195         def report_download_page(self, query, pagenum):
2196                 """Report attempt to download playlist page with given number."""
2197                 query = query.decode(preferredencoding())
2198                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2199
2200         def _real_initialize(self):
2201                 self._yahoo_ie.initialize()
2202
2203         def _real_extract(self, query):
2204                 mobj = re.match(self._VALID_QUERY, query)
2205                 if mobj is None:
2206                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2207                         return
2208
2209                 prefix, query = query.split(':')
2210                 prefix = prefix[8:]
2211                 query  = query.encode('utf-8')
2212                 if prefix == '':
2213                         self._download_n_results(query, 1)
2214                         return
2215                 elif prefix == 'all':
2216                         self._download_n_results(query, self._max_yahoo_results)
2217                         return
2218                 else:
2219                         try:
2220                                 n = long(prefix)
2221                                 if n <= 0:
2222                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2223                                         return
2224                                 elif n > self._max_yahoo_results:
2225                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
2226                                         n = self._max_yahoo_results
2227                                 self._download_n_results(query, n)
2228                                 return
2229                         except ValueError: # parsing prefix as integer fails
2230                                 self._download_n_results(query, 1)
2231                                 return
2232
2233         def _download_n_results(self, query, n):
2234                 """Downloads a specified number of results for a query"""
2235
2236                 video_ids = []
2237                 already_seen = set()
2238                 pagenum = 1
2239
2240                 while True:
2241                         self.report_download_page(query, pagenum)
2242                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2243                         request = urllib2.Request(result_url)
2244                         try:
2245                                 page = urllib2.urlopen(request).read()
2246                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2247                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2248                                 return
2249
2250                         # Extract video identifiers
2251                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2252                                 video_id = mobj.group(1)
2253                                 if video_id not in already_seen:
2254                                         video_ids.append(video_id)
2255                                         already_seen.add(video_id)
2256                                         if len(video_ids) == n:
2257                                                 # Specified n videos reached
2258                                                 for id in video_ids:
2259                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2260                                                 return
2261
2262                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2263                                 for id in video_ids:
2264                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2265                                 return
2266
2267                         pagenum = pagenum + 1
2268
2269 class YoutubePlaylistIE(InfoExtractor):
2270         """Information Extractor for YouTube playlists."""
2271
2272         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist)\?.*?(p|a)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2273         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2274         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2275         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2276         _youtube_ie = None
2277
2278         def __init__(self, youtube_ie, downloader=None):
2279                 InfoExtractor.__init__(self, downloader)
2280                 self._youtube_ie = youtube_ie
2281
2282         @staticmethod
2283         def suitable(url):
2284                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2285
2286         def report_download_page(self, playlist_id, pagenum):
2287                 """Report attempt to download playlist page with given number."""
2288                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2289
2290         def _real_initialize(self):
2291                 self._youtube_ie.initialize()
2292
2293         def _real_extract(self, url):
2294                 # Extract playlist id
2295                 mobj = re.match(self._VALID_URL, url)
2296                 if mobj is None:
2297                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2298                         return
2299
2300                 # Single video case
2301                 if mobj.group(3) is not None:
2302                         self._youtube_ie.extract(mobj.group(3))
2303                         return
2304
2305                 # Download playlist pages
2306                 # prefix is 'p' as default for playlists but there are other types that need extra care
2307                 playlist_prefix = mobj.group(1)
2308                 if playlist_prefix == 'a':
2309                         playlist_access = 'artist'
2310                 else:
2311                         playlist_prefix = 'p'
2312                         playlist_access = 'view_play_list'
2313                 playlist_id = mobj.group(2)
2314                 video_ids = []
2315                 pagenum = 1
2316
2317                 while True:
2318                         self.report_download_page(playlist_id, pagenum)
2319                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2320                         try:
2321                                 page = urllib2.urlopen(request).read()
2322                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2323                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2324                                 return
2325
2326                         # Extract video identifiers
2327                         ids_in_page = []
2328                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2329                                 if mobj.group(1) not in ids_in_page:
2330                                         ids_in_page.append(mobj.group(1))
2331                         video_ids.extend(ids_in_page)
2332
2333                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2334                                 break
2335                         pagenum = pagenum + 1
2336
2337                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2338                 playlistend = self._downloader.params.get('playlistend', -1)
2339                 video_ids = video_ids[playliststart:playlistend]
2340
2341                 for id in video_ids:
2342                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2343                 return
2344
2345 class YoutubeUserIE(InfoExtractor):
2346         """Information Extractor for YouTube users."""
2347
2348         _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2349         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2350         _GDATA_PAGE_SIZE = 50
2351         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2352         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2353         _youtube_ie = None
2354
2355         def __init__(self, youtube_ie, downloader=None):
2356                 InfoExtractor.__init__(self, downloader)
2357                 self._youtube_ie = youtube_ie
2358
2359         @staticmethod
2360         def suitable(url):
2361                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2362
2363         def report_download_page(self, username, start_index):
2364                 """Report attempt to download user page."""
2365                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2366                                            (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2367
2368         def _real_initialize(self):
2369                 self._youtube_ie.initialize()
2370
	def _real_extract(self, url):
		"""Collect all upload ids for a user via the GData feed, apply the
		playliststart/playlistend window, and hand each watch URL to the
		wrapped YoutubeIE."""
		# Extract username
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
			return

		username = mobj.group(1)

		# Download video ids using YouTube Data API. Result size per
		# query is limited (currently to 50 videos) so we need to query
		# page by page until there are no video ids - it means we got
		# all of them.

		video_ids = []
		pagenum = 0

		while True:
			# GData's start-index parameter is 1-based.
			start_index = pagenum * self._GDATA_PAGE_SIZE + 1
			self.report_download_page(username, start_index)

			request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

			try:
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
				return

			# Extract video identifiers (deduplicated within the page)
			ids_in_page = []

			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				if mobj.group(1) not in ids_in_page:
					ids_in_page.append(mobj.group(1))

			video_ids.extend(ids_in_page)

			# A little optimization - if current page is not
			# "full", ie. does not contain PAGE_SIZE video ids then
			# we can assume that this page is the last one - there
			# are no more ids on further pages - no need to query
			# again.

			if len(ids_in_page) < self._GDATA_PAGE_SIZE:
				break

			pagenum += 1

		all_ids_count = len(video_ids)
		# playliststart is 1-based on the command line; convert to 0-based.
		playliststart = self._downloader.params.get('playliststart', 1) - 1
		# playlistend of -1 means "until the last collected video".
		playlistend = self._downloader.params.get('playlistend', -1)

		if playlistend == -1:
			video_ids = video_ids[playliststart:]
		else:
			video_ids = video_ids[playliststart:playlistend]

		self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
					   (username, all_ids_count, len(video_ids)))

		# Delegate the actual extraction/download of each video.
		for video_id in video_ids:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2434
2435
2436 class DepositFilesIE(InfoExtractor):
2437         """Information extractor for depositfiles.com"""
2438
2439         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2440
2441         def __init__(self, downloader=None):
2442                 InfoExtractor.__init__(self, downloader)
2443
2444         @staticmethod
2445         def suitable(url):
2446                 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2447
2448         def report_download_webpage(self, file_id):
2449                 """Report webpage download."""
2450                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2451
2452         def report_extraction(self, file_id):
2453                 """Report information extraction."""
2454                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2455
2456         def _real_initialize(self):
2457                 return
2458
2459         def _real_extract(self, url):
2460                 # At this point we have a new file
2461                 self._downloader.increment_downloads()
2462
2463                 file_id = url.split('/')[-1]
2464                 # Rebuild url in english locale
2465                 url = 'http://depositfiles.com/en/files/' + file_id
2466
2467                 # Retrieve file webpage with 'Free download' button pressed
2468                 free_download_indication = { 'gateway_result' : '1' }
2469                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2470                 try:
2471                         self.report_download_webpage(file_id)
2472                         webpage = urllib2.urlopen(request).read()
2473                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2474                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2475                         return
2476
2477                 # Search for the real file URL
2478                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2479                 if (mobj is None) or (mobj.group(1) is None):
2480                         # Try to figure out reason of the error.
2481                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2482                         if (mobj is not None) and (mobj.group(1) is not None):
2483                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2484                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2485                         else:
2486                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2487                         return
2488
2489                 file_url = mobj.group(1)
2490                 file_extension = os.path.splitext(file_url)[1][1:]
2491
2492                 # Search for file title
2493                 mobj = re.search(r'<b title="(.*?)">', webpage)
2494                 if mobj is None:
2495                         self._downloader.trouble(u'ERROR: unable to extract title')
2496                         return
2497                 file_title = mobj.group(1).decode('utf-8')
2498
2499                 try:
2500                         # Process file information
2501                         self._downloader.process_info({
2502                                 'id':           file_id.decode('utf-8'),
2503                                 'url':          file_url.decode('utf-8'),
2504                                 'uploader':     u'NA',
2505                                 'upload_date':  u'NA',
2506                                 'title':        file_title,
2507                                 'stitle':       file_title,
2508                                 'ext':          file_extension.decode('utf-8'),
2509                                 'format':       u'NA',
2510                                 'player_url':   None,
2511                         })
2512                 except UnavailableVideoError, err:
2513                         self._downloader.trouble(u'ERROR: unable to download file')
2514
2515 class FacebookIE(InfoExtractor):
2516         """Information Extractor for Facebook"""
2517
2518         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2519         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2520         _NETRC_MACHINE = 'facebook'
2521         _available_formats = ['highqual', 'lowqual']
2522         _video_extensions = {
2523                 'highqual': 'mp4',
2524                 'lowqual': 'mp4',
2525         }
2526
2527         def __init__(self, downloader=None):
2528                 InfoExtractor.__init__(self, downloader)
2529
2530         @staticmethod
2531         def suitable(url):
2532                 return (re.match(FacebookIE._VALID_URL, url) is not None)
2533
2534         def _reporter(self, message):
2535                 """Add header and report message."""
2536                 self._downloader.to_screen(u'[facebook] %s' % message)
2537
2538         def report_login(self):
2539                 """Report attempt to log in."""
2540                 self._reporter(u'Logging in')
2541
2542         def report_video_webpage_download(self, video_id):
2543                 """Report attempt to download video webpage."""
2544                 self._reporter(u'%s: Downloading video webpage' % video_id)
2545
2546         def report_information_extraction(self, video_id):
2547                 """Report attempt to extract video information."""
2548                 self._reporter(u'%s: Extracting video information' % video_id)
2549
2550         def _parse_page(self, video_webpage):
2551                 """Extract video information from page"""
2552                 # General data
2553                 data = {'title': r'class="video_title datawrap">(.*?)</',
2554                         'description': r'<div class="datawrap">(.*?)</div>',
2555                         'owner': r'\("video_owner_name", "(.*?)"\)',
2556                         'upload_date': r'data-date="(.*?)"',
2557                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2558                         }
2559                 video_info = {}
2560                 for piece in data.keys():
2561                         mobj = re.search(data[piece], video_webpage)
2562                         if mobj is not None:
2563                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2564
2565                 # Video urls
2566                 video_urls = {}
2567                 for fmt in self._available_formats:
2568                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2569                         if mobj is not None:
2570                                 # URL is in a Javascript segment inside an escaped Unicode format within
2571                                 # the generally utf-8 page
2572                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2573                 video_info['video_urls'] = video_urls
2574
2575                 return video_info
2576
2577         def _real_initialize(self):
2578                 if self._downloader is None:
2579                         return
2580
2581                 useremail = None
2582                 password = None
2583                 downloader_params = self._downloader.params
2584
2585                 # Attempt to use provided username and password or .netrc data
2586                 if downloader_params.get('username', None) is not None:
2587                         useremail = downloader_params['username']
2588                         password = downloader_params['password']
2589                 elif downloader_params.get('usenetrc', False):
2590                         try:
2591                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2592                                 if info is not None:
2593                                         useremail = info[0]
2594                                         password = info[2]
2595                                 else:
2596                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2597                         except (IOError, netrc.NetrcParseError), err:
2598                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2599                                 return
2600
2601                 if useremail is None:
2602                         return
2603
2604                 # Log in
2605                 login_form = {
2606                         'email': useremail,
2607                         'pass': password,
2608                         'login': 'Log+In'
2609                         }
2610                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2611                 try:
2612                         self.report_login()
2613                         login_results = urllib2.urlopen(request).read()
2614                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2615                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2616                                 return
2617                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2618                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2619                         return
2620
2621         def _real_extract(self, url):
2622                 mobj = re.match(self._VALID_URL, url)
2623                 if mobj is None:
2624                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2625                         return
2626                 video_id = mobj.group('ID')
2627
2628                 # Get video webpage
2629                 self.report_video_webpage_download(video_id)
2630                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2631                 try:
2632                         page = urllib2.urlopen(request)
2633                         video_webpage = page.read()
2634                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2635                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2636                         return
2637
2638                 # Start extracting information
2639                 self.report_information_extraction(video_id)
2640
2641                 # Extract information
2642                 video_info = self._parse_page(video_webpage)
2643
2644                 # uploader
2645                 if 'owner' not in video_info:
2646                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2647                         return
2648                 video_uploader = video_info['owner']
2649
2650                 # title
2651                 if 'title' not in video_info:
2652                         self._downloader.trouble(u'ERROR: unable to extract video title')
2653                         return
2654                 video_title = video_info['title']
2655                 video_title = video_title.decode('utf-8')
2656                 video_title = sanitize_title(video_title)
2657
2658                 # simplified title
2659                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2660                 simple_title = simple_title.strip(ur'_')
2661
2662                 # thumbnail image
2663                 if 'thumbnail' not in video_info:
2664                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2665                         video_thumbnail = ''
2666                 else:
2667                         video_thumbnail = video_info['thumbnail']
2668
2669                 # upload date
2670                 upload_date = u'NA'
2671                 if 'upload_date' in video_info:
2672                         upload_time = video_info['upload_date']
2673                         timetuple = email.utils.parsedate_tz(upload_time)
2674                         if timetuple is not None:
2675                                 try:
2676                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2677                                 except:
2678                                         pass
2679
2680                 # description
2681                 video_description = video_info.get('description', 'No description available.')
2682
2683                 url_map = video_info['video_urls']
2684                 if len(url_map.keys()) > 0:
2685                         # Decide which formats to download
2686                         req_format = self._downloader.params.get('format', None)
2687                         format_limit = self._downloader.params.get('format_limit', None)
2688
2689                         if format_limit is not None and format_limit in self._available_formats:
2690                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2691                         else:
2692                                 format_list = self._available_formats
2693                         existing_formats = [x for x in format_list if x in url_map]
2694                         if len(existing_formats) == 0:
2695                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2696                                 return
2697                         if req_format is None:
2698                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2699                         elif req_format == '-1':
2700                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2701                         else:
2702                                 # Specific format
2703                                 if req_format not in url_map:
2704                                         self._downloader.trouble(u'ERROR: requested format not available')
2705                                         return
2706                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2707
2708                 for format_param, video_real_url in video_url_list:
2709
2710                         # At this point we have a new video
2711                         self._downloader.increment_downloads()
2712
2713                         # Extension
2714                         video_extension = self._video_extensions.get(format_param, 'mp4')
2715
2716                         # Find the video URL in fmt_url_map or conn paramters
2717                         try:
2718                                 # Process video information
2719                                 self._downloader.process_info({
2720                                         'id':           video_id.decode('utf-8'),
2721                                         'url':          video_real_url.decode('utf-8'),
2722                                         'uploader':     video_uploader.decode('utf-8'),
2723                                         'upload_date':  upload_date,
2724                                         'title':        video_title,
2725                                         'stitle':       simple_title,
2726                                         'ext':          video_extension.decode('utf-8'),
2727                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2728                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2729                                         'description':  video_description.decode('utf-8'),
2730                                         'player_url':   None,
2731                                 })
2732                         except UnavailableVideoError, err:
2733                                 self._downloader.trouble(u'\nERROR: unable to download video')
2734
2735 class BlipTVIE(InfoExtractor):
2736         """Information extractor for blip.tv"""
2737
2738         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip.tv(/.+)$'
2739         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2740
2741         @staticmethod
2742         def suitable(url):
2743                 return (re.match(BlipTVIE._VALID_URL, url) is not None)
2744
2745         def report_extraction(self, file_id):
2746                 """Report information extraction."""
2747                 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2748
2749         def _simplify_title(self, title):
2750                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2751                 res = res.strip(ur'_')
2752                 return res
2753
2754         def _real_extract(self, url):
2755                 mobj = re.match(self._VALID_URL, url)
2756                 if mobj is None:
2757                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2758                         return
2759
2760                 json_url = url + ('&' if '?' in url else '?') + 'skin=json&version=2&no_wrap=1'
2761                 request = urllib2.Request(json_url)
2762                 self.report_extraction(mobj.group(1))
2763                 try:
2764                         json_code = urllib2.urlopen(request).read()
2765                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2766                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2767                         return
2768                 try:
2769                         json_data = json.loads(json_code)
2770                         data = json_data['Post'] if 'Post' in json_data else json_data
2771
2772                         upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2773                         video_url = data['media']['url']
2774                         umobj = re.match(self._URL_EXT, video_url)
2775                         if umobj is None:
2776                                 raise ValueError('Can not determine filename extension')
2777                         ext = umobj.group(1)
2778
2779                         self._downloader.increment_downloads()
2780
2781                         info = {
2782                                 'id': data['item_id'],
2783                                 'url': video_url,
2784                                 'uploader': data['display_name'],
2785                                 'upload_date': upload_date,
2786                                 'title': data['title'],
2787                                 'stitle': self._simplify_title(data['title']),
2788                                 'ext': ext,
2789                                 'format': data['media']['mimeType'],
2790                                 'thumbnail': data['thumbnailUrl'],
2791                                 'description': data['description'],
2792                                 'player_url': data['embedUrl']
2793                         }
2794                 except (ValueError,KeyError), err:
2795                         self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2796                         return
2797
2798                 try:
2799                         self._downloader.process_info(info)
2800                 except UnavailableVideoError, err:
2801                         self._downloader.trouble(u'\nERROR: unable to download video')
2802
2803
class PostProcessor(object):
	"""Base class for post-download processing steps.

	Instances are registered on a downloader via its
	add_post_processor() method. After a successful download the
	downloader walks its chain of PostProcessors, calling run() on each
	one: the first receives an initial info dictionary, each subsequent
	one receives the value returned by its predecessor.

	A return value of None stops the chain; otherwise processing
	continues with the returned dictionary until the end of the chain.

	Like InfoExtractor objects, PostProcessors take part in a "mutual
	registration" scheme with their downloader.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach the downloader this PP belongs to."""
		self._downloader = downloader

	def run(self, information):
		"""Process one finished download.

		"information" is a dictionary shaped like the ones produced by
		InfoExtractors, with one extra key, "filepath", naming the
		downloaded file.

		Returning None halts the postprocessing chain; returning a
		(possibly modified) information dictionary passes it on to the
		next PostProcessor in the chain. A PostProcessingError may be
		raised to signal failure to the calling downloader.
		"""
		# Default implementation: pass the data through untouched.
		return information
2849
class FFmpegExtractAudioPP(PostProcessor):
	"""Post processor that extracts the audio track of a downloaded video
	with ffmpeg/ffprobe and removes the original file on success."""

	def __init__(self, downloader=None, preferredcodec=None):
		PostProcessor.__init__(self, downloader)
		# 'best' means: keep the source codec losslessly when it is
		# already mp3/aac, otherwise transcode to mp3.
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec

	@staticmethod
	def get_audio_codec(path):
		"""Return the audio codec name of the file at path via ffprobe,
		or None if it cannot be determined."""
		try:
			cmd = ['ffprobe', '-show_streams', '--', path]
			handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
			output = handle.communicate()[0]
			if handle.wait() != 0:
				return None
		except (IOError, OSError):
			return None
		# ffprobe prints key=value lines per stream; codec_name appears
		# before codec_type within a stream block.
		audio_codec = None
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Transcode path into out_path with ffmpeg; return True on success."""
		try:
			cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
			ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
			return (ret == 0)
		except (IOError, OSError):
			return False

	def run(self, information):
		"""Extract audio from information['filepath'].

		On success, deletes the original file, updates 'filepath' to the
		new audio file and returns the information dictionary; returns
		None (stopping the chain) on any failure.
		"""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
			if filecodec == 'aac' or filecodec == 'mp3':
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					more_opts = ['-f', 'adts']
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = ['-ab', '128k']
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = ['-ab', '128k']
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']

		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension
		if new_path == path:
			# If the target name equals the source name, ffmpeg would
			# read and write the same file and os.remove() below would
			# then delete the converted result. Keep the file as-is.
			self._downloader.to_stderr(u'WARNING: file is already in the target format, not converting')
			return None
		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		status = self.run_ffmpeg(path, new_path, acodec, more_opts)

		if not status:
			self._downloader.to_stderr(u'WARNING: error running ffmpeg')
			return None

		try:
			os.remove(path)
		except (IOError, OSError):
			self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
			return None

		information['filepath'] = new_path
		return information
2931
2932 ### MAIN PROGRAM ###
2933 if __name__ == '__main__':
2934         try:
2935                 # Modules needed only when running the main program
2936                 import getpass
2937                 import optparse
2938
2939                 # Function to update the program file with the latest version from the repository.
		def update_self(downloader, filename):
			"""Overwrite the running script (filename) with the latest
			released youtube-dl; downloader is only used for output."""
			# Note: downloader only used for options
			if not os.access(filename, os.W_OK):
				sys.exit('ERROR: no write permissions on %s' % filename)

			downloader.to_screen('Updating to latest stable version...')
			try:
				# LATEST_VERSION holds the tag name of the newest release;
				# the program itself is then fetched from that tag.
				latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
				latest_version = urllib.urlopen(latest_url).read().strip()
				prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
				newcontent = urllib.urlopen(prog_url).read()
			except (IOError, OSError), err:
				sys.exit('ERROR: unable to download latest version')
			try:
				stream = open(filename, 'w')
				stream.write(newcontent)
				stream.close()
			except (IOError, OSError), err:
				sys.exit('ERROR: unable to overwrite current version')
			downloader.to_screen('Updated to version %s' % latest_version)
2960
		# Parse command line.
		# conflict_handler='resolve' lets the explicit -h/-v options defined
		# just below replace optparse's built-in --help/--version handlers
		# instead of raising an OptionConflictError.
		parser = optparse.OptionParser(
			usage='Usage: %prog [options] url...',
			version='2011.07.09-phihag',
			conflict_handler='resolve',
		)

		parser.add_option('-h', '--help',
				action='help', help='print this help text and exit')
		parser.add_option('-v', '--version',
				action='version', help='print program version and exit')
		parser.add_option('-U', '--update',
				action='store_true', dest='update_self', help='update this program to latest stable version')
		parser.add_option('-i', '--ignore-errors',
				action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
		parser.add_option('-r', '--rate-limit',
				dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
		# NOTE: --retries, --playlist-start and --playlist-end arrive as
		# strings when given on the command line; they are converted and
		# range-checked after parse_args() further below.
		parser.add_option('-R', '--retries',
				dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
		parser.add_option('--playlist-start',
				dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
		parser.add_option('--playlist-end',
				dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
		parser.add_option('--dump-user-agent',
				action='store_true', dest='dump_user_agent',
				help='display the current browser identification', default=False)

		# Account credentials, for extractors that support logging in.
		authentication = optparse.OptionGroup(parser, 'Authentication Options')
		authentication.add_option('-u', '--username',
				dest='username', metavar='USERNAME', help='account username')
		authentication.add_option('-p', '--password',
				dest='password', metavar='PASSWORD', help='account password')
		authentication.add_option('-n', '--netrc',
				action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
		parser.add_option_group(authentication)

		# Video format selection; --all-formats is shorthand for -f -1.
		video_format = optparse.OptionGroup(parser, 'Video Format Options')
		video_format.add_option('-f', '--format',
				action='store', dest='format', metavar='FORMAT', help='video format code')
		video_format.add_option('--all-formats',
				action='store_const', dest='format', help='download all available video formats', const='-1')
		video_format.add_option('--max-quality',
				action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
		parser.add_option_group(video_format)

		# Verbosity and the get-* family; every get-* option implies quiet
		# simulation (see the 'quiet'/'simulate' FileDownloader parameters
		# further below).
		verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
		verbosity.add_option('-q', '--quiet',
				action='store_true', dest='quiet', help='activates quiet mode', default=False)
		verbosity.add_option('-s', '--simulate',
				action='store_true', dest='simulate', help='do not download video', default=False)
		verbosity.add_option('-g', '--get-url',
				action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
		verbosity.add_option('-e', '--get-title',
				action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
		verbosity.add_option('--get-thumbnail',
				action='store_true', dest='getthumbnail',
				help='simulate, quiet but print thumbnail URL', default=False)
		verbosity.add_option('--get-description',
				action='store_true', dest='getdescription',
				help='simulate, quiet but print video description', default=False)
		verbosity.add_option('--get-filename',
				action='store_true', dest='getfilename',
				help='simulate, quiet but print output filename', default=False)
		verbosity.add_option('--no-progress',
				action='store_true', dest='noprogress', help='do not print progress bar', default=False)
		verbosity.add_option('--console-title',
				action='store_true', dest='consoletitle',
				help='display progress in console titlebar', default=False)
		parser.add_option_group(verbosity)

		# Output file naming and filesystem behaviour.
		filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
		filesystem.add_option('-t', '--title',
				action='store_true', dest='usetitle', help='use title in file name', default=False)
		filesystem.add_option('-l', '--literal',
				action='store_true', dest='useliteral', help='use literal title in file name', default=False)
		filesystem.add_option('-A', '--auto-number',
				action='store_true', dest='autonumber',
				help='number downloaded files starting from 00000', default=False)
		filesystem.add_option('-o', '--output',
				dest='outtmpl', metavar='TEMPLATE', help='output filename template')
		filesystem.add_option('-a', '--batch-file',
				dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
		filesystem.add_option('-w', '--no-overwrites',
				action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
		filesystem.add_option('-c', '--continue',
				action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
		filesystem.add_option('--cookies',
				dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
		filesystem.add_option('--no-part',
				action='store_true', dest='nopart', help='do not use .part files', default=False)
		filesystem.add_option('--no-mtime',
				action='store_false', dest='updatetime',
				help='do not use the Last-modified header to set the file modification time', default=True)
		filesystem.add_option('--write-description',
				action='store_true', dest='writedescription',
				help='write video description to a .description file', default=False)
		filesystem.add_option('--write-info-json',
				action='store_true', dest='writeinfojson',
				help='write video metadata to a .info.json file', default=False)
		parser.add_option_group(filesystem)

		# Post-download conversion (requires external ffmpeg/ffprobe tools).
		postproc = optparse.OptionGroup(parser, 'Post-processing Options')
		postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
				help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
		postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
				help='"best", "aac" or "mp3"; best by default')
		parser.add_option_group(postproc)
3068
		(opts, args) = parser.parse_args()

		# Open appropriate CookieJar: a throwaway in-memory jar by default,
		# or a Mozilla/Netscape-format file-backed jar when --cookies was
		# given.
		if opts.cookiefile is None:
			jar = cookielib.CookieJar()
		else:
			try:
				jar = cookielib.MozillaCookieJar(opts.cookiefile)
				# Only load existing, readable files; a not-yet-created
				# cookie file is not an error (it is saved before exit,
				# see the end of this section).
				if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
					jar.load()
			except (IOError, OSError), err:
				sys.exit(u'ERROR: unable to open cookie file')

		# Dump user agent and quit immediately if requested.
		if opts.dump_user_agent:
			print std_headers['User-Agent']
			sys.exit(0)

		# General configuration: route all urllib2 traffic through a proxy
		# handler, the cookie jar above, and YoutubeDLHandler (defined
		# earlier in this file).
		cookie_processor = urllib2.HTTPCookieProcessor(jar)
		urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
		socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

		# Batch file verification: read additional URLs from --batch-file
		# ('-' means stdin), skipping blank lines and comment lines that
		# start with '#', '/' or ';'.
		batchurls = []
		if opts.batchfile is not None:
			try:
				if opts.batchfile == '-':
					batchfd = sys.stdin
				else:
					batchfd = open(opts.batchfile, 'r')
				batchurls = batchfd.readlines()
				batchurls = [x.strip() for x in batchurls]
				batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
			except IOError:
				sys.exit(u'ERROR: batch file could not be read')
		all_urls = batchurls + args
3106
		# Conflicting, missing and erroneous options: each problem aborts
		# with a usage error via parser.error().
		if opts.usenetrc and (opts.username is not None or opts.password is not None):
			parser.error(u'using .netrc conflicts with giving username/password')
		if opts.password is not None and opts.username is None:
			parser.error(u'account username missing')
		if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
			parser.error(u'using output template conflicts with using title, literal title or auto number')
		if opts.usetitle and opts.useliteral:
			parser.error(u'using title conflicts with using literal title')
		# Username without password: prompt interactively instead of failing.
		if opts.username is not None and opts.password is None:
			opts.password = getpass.getpass(u'Type account password and press return:')
		if opts.ratelimit is not None:
			# parse_bytes accepts suffixed values such as 50k or 44.6m and
			# returns None for input it cannot parse.
			numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
			if numeric_limit is None:
				parser.error(u'invalid rate limit specified')
			opts.ratelimit = numeric_limit
		# --retries comes in as a string from the command line; coerce it.
		if opts.retries is not None:
			try:
				opts.retries = long(opts.retries)
			except (TypeError, ValueError), err:
				parser.error(u'invalid retry count specified')
		# Playlist bounds: start must be a positive index.
		try:
			opts.playliststart = long(opts.playliststart)
			if opts.playliststart <= 0:
				raise ValueError
		except (TypeError, ValueError), err:
			parser.error(u'invalid playlist start number specified')
		# End index: -1 is the sentinel for "up to the last entry"; any
		# other value must be positive and must not precede the start.
		try:
			opts.playlistend = long(opts.playlistend)
			if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
				raise ValueError
		except (TypeError, ValueError), err:
			parser.error(u'invalid playlist end number specified')
		if opts.extractaudio:
			if opts.audioformat not in ['best', 'aac', 'mp3']:
				parser.error(u'invalid audio format specified')
3143
		# Information extractors: one instance per supported site. The
		# YouTube playlist/user/search extractors and MetacafeIE delegate
		# individual videos to the shared youtube_ie instance; likewise
		# GoogleSearchIE and YahooSearchIE wrap their respective video IEs.
		youtube_ie = YoutubeIE()
		metacafe_ie = MetacafeIE(youtube_ie)
		dailymotion_ie = DailymotionIE()
		youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
		youtube_user_ie = YoutubeUserIE(youtube_ie)
		youtube_search_ie = YoutubeSearchIE(youtube_ie)
		google_ie = GoogleIE()
		google_search_ie = GoogleSearchIE(google_ie)
		photobucket_ie = PhotobucketIE()
		yahoo_ie = YahooIE()
		yahoo_search_ie = YahooSearchIE(yahoo_ie)
		deposit_files_ie = DepositFilesIE()
		facebook_ie = FacebookIE()
		bliptv_ie = BlipTVIE()
		generic_ie = GenericIE()

		# File downloader, configured from the parsed options.
		fd = FileDownloader({
			'usenetrc': opts.usenetrc,
			'username': opts.username,
			'password': opts.password,
			# Any of the get-* options implies quiet simulation: print only
			# the requested piece of information, download nothing.
			'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
			'forceurl': opts.geturl,
			'forcetitle': opts.gettitle,
			'forcethumbnail': opts.getthumbnail,
			'forcedescription': opts.getdescription,
			'forcefilename': opts.getfilename,
			'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
			'format': opts.format,
			'format_limit': opts.format_limit,
			# Output template: this or-chain short-circuits on the first
			# applicable choice. An explicit -o template (decoded from the
			# locale's preferred encoding) always wins; otherwise the
			# template depends on --all-formats (-f -1), -t/-l and -A.
			'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
				or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
				or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
				or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
				or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
				or u'%(id)s.%(ext)s'),
			'ignoreerrors': opts.ignoreerrors,
			'ratelimit': opts.ratelimit,
			'nooverwrites': opts.nooverwrites,
			'retries': opts.retries,
			'continuedl': opts.continue_dl,
			'noprogress': opts.noprogress,
			'playliststart': opts.playliststart,
			'playlistend': opts.playlistend,
			# '-o -' sends the video to stdout, so status output must go
			# to stderr instead.
			'logtostderr': opts.outtmpl == '-',
			'consoletitle': opts.consoletitle,
			'nopart': opts.nopart,
			'updatetime': opts.updatetime,
			'writedescription': opts.writedescription,
			'writeinfojson': opts.writeinfojson,
			})
		# Registration order is significant (see the fallback note below):
		# the specific playlist/user/search extractors are registered before
		# the plain video extractors they build on.
		fd.add_info_extractor(youtube_search_ie)
		fd.add_info_extractor(youtube_pl_ie)
		fd.add_info_extractor(youtube_user_ie)
		fd.add_info_extractor(metacafe_ie)
		fd.add_info_extractor(dailymotion_ie)
		fd.add_info_extractor(youtube_ie)
		fd.add_info_extractor(google_ie)
		fd.add_info_extractor(google_search_ie)
		fd.add_info_extractor(photobucket_ie)
		fd.add_info_extractor(yahoo_ie)
		fd.add_info_extractor(yahoo_search_ie)
		fd.add_info_extractor(deposit_files_ie)
		fd.add_info_extractor(facebook_ie)
		fd.add_info_extractor(bliptv_ie)

		# This must come last since it's the
		# fallback if none of the others work
		fd.add_info_extractor(generic_ie)

		# PostProcessors
		if opts.extractaudio:
			fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))

		# Update version (done after fd exists so update_self can report
		# progress through the downloader).
		if opts.update_self:
			update_self(fd, sys.argv[0])

		# Maybe do nothing: -U alone, without URLs, is a valid invocation.
		if len(all_urls) < 1:
			if not opts.update_self:
				parser.error(u'you must provide at least one URL')
			else:
				sys.exit()
		retcode = fd.download(all_urls)

		# Dump cookie jar if requested
		if opts.cookiefile is not None:
			try:
				jar.save()
			except (IOError, OSError), err:
				sys.exit(u'ERROR: unable to save cookie jar')

		# The downloader's return code becomes the process exit status.
		sys.exit(retcode)
3243
	# Handlers for the program-level try block (opened before this excerpt).
	# NOTE(review): DownloadError exits with a bare status 1 — presumably the
	# downloader has already printed a message by the time the exception
	# propagates here; confirm against FileDownloader's error reporting.
	except DownloadError:
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')