# (gitweb extraction artifact — commit subject: "+ --write-info-json";
#  repository path: [youtube-dl.git] / youtube-dl)
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # Author: Vasyl' Vavrychuk
7 # Author: Witold Baryluk
8 # Author: PaweÅ‚ Paprota
9 # Author: Gergely Imreh
10 # Author: Philipp Hagemeister <phihag@phihag.de>
11 # License: Public domain code
12 from __future__ import with_statement
13 import contextlib
14 import cookielib
15 import ctypes
16 import datetime
17 import email.utils
18 import gzip
19 import htmlentitydefs
20 import httplib
21 import locale
22 import math
23 import netrc
24 import os
25 import os.path
26 import re
27 import socket
28 import string
29 import subprocess
30 import sys
31 import time
32 import urllib
33 import urllib2
34 import warnings
35 import zlib
36
37 try:
38         import cStringIO as StringIO
39 except ImportError:
40         import StringIO
41
42 # parse_qs was moved from the cgi module to the urlparse module recently.
43 try:
44         from urlparse import parse_qs
45 except ImportError:
46         from cgi import parse_qs
47
48 try:
49         import lxml.etree
50 except ImportError: # Python < 2.6
51         pass # Handled below
52
# Default HTTP headers sent with every request, imitating a desktop
# Firefox 4 browser on Linux (some sites vary content by User-Agent).
# Compression is accepted here and transparently undone by YoutubeDLHandler.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:2.0b11) Gecko/20100101 Firefox/4.0b11',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	'Accept-Encoding': 'gzip, deflate',
	'Accept-Language': 'en-us,en;q=0.5',
}

# ASCII letters and digits as a unicode string (Python 2 str.decode);
# presumably the allowed character set for simplified titles — name suggests.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
62
try:
	import json
except ImportError: # Python <2.5, use trivialjson (https://github.com/phihag/trivialjson):
	import re
	class json(object):
		"""Minimal stand-in for the stdlib json module (decoding only)."""
		@staticmethod
		def loads(s):
			"""Parse a UTF-8 encoded JSON document and return the Python value."""
			s = s.decode('UTF-8')
			def raiseError(msg, i):
				# Abort parsing with a ValueError pointing at position i.
				raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
			def skipSpace(i, expectMore=True):
				# Advance i past JSON whitespace; optionally fail at end of input.
				while i < len(s) and s[i] in ' \t\r\n':
					i += 1
				if expectMore:
					if i >= len(s):
						raiseError('Premature end', i)
				return i
			def decodeEscape(match):
				# Translate one backslash escape (incl. \uXXXX and surrogate pairs).
				esc = match.group(1)
				_STATIC = {
					'"': '"',
					'\\': '\\',
					'/': '/',
					'b': unichr(0x8),
					'f': unichr(0xc),
					'n': '\n',
					'r': '\r',
					't': '\t',
				}
				if esc in _STATIC:
					return _STATIC[esc]
				if esc[0] == 'u':
					if len(esc) == 1+4:
						return unichr(int(esc[1:5], 16))
					if len(esc) == 5+6 and esc[5:7] == '\\u':
						# Surrogate pair: combine high and low halves.
						hi = int(esc[1:5], 16)
						low = int(esc[7:11], 16)
						return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
				raise ValueError('Unknown escape ' + str(esc))
			def parseString(i):
				# Parse a double-quoted string at s[i]; return (next_index, value).
				i += 1
				e = i
				while True:
					e = s.index('"', e)
					# Count preceding backslashes: an odd number means this
					# quote is escaped, so keep searching.
					bslashes = 0
					while s[e-bslashes-1] == '\\':
						bslashes += 1
					if bslashes % 2 == 1:
						e += 1
						continue
					break
				rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
				stri = rexp.sub(decodeEscape, s[i:e])
				return (e+1,stri)
			def parseObj(i):
				# Parse a JSON object starting at the '{' at s[i].
				i += 1
				res = {}
				i = skipSpace(i)
				if s[i] == '}': # Empty dictionary
					return (i+1,res)
				while True:
					if s[i] != '"':
						raiseError('Expected a string object key', i)
					i,key = parseString(i)
					i = skipSpace(i)
					if i >= len(s) or s[i] != ':':
						raiseError('Expected a colon', i)
					i,val = parse(i+1)
					res[key] = val
					i = skipSpace(i)
					if s[i] == '}':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected comma or closing curly brace', i)
					i = skipSpace(i+1)
			def parseArray(i):
				# Parse a JSON array starting at the '[' at s[i].
				res = []
				i = skipSpace(i+1)
				if s[i] == ']': # Empty array
					return (i+1,res)
				while True:
					i,val = parse(i)
					res.append(val)
					i = skipSpace(i) # Raise exception if premature end
					if s[i] == ']':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected a comma or closing bracket', i)
					i = skipSpace(i+1)
			def parseDiscrete(i):
				# Parse one of the literals true / false / null.
				for k,v in {'true': True, 'false': False, 'null': None}.items():
					if s.startswith(k, i):
						return (i+len(k), v)
				raiseError('Not a boolean (or null)', i)
			def parseNumber(i):
				# Parse an int or float per the JSON number grammar.
				mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
				if mobj is None:
					raiseError('Not a number', i)
				nums = mobj.group(1)
				if '.' in nums or 'e' in nums or 'E' in nums:
					return (i+len(nums), float(nums))
				return (i+len(nums), int(nums))
			# Dispatch on the first character of a value; numbers are the fallback.
			CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
			def parse(i):
				i = skipSpace(i)
				i,res = CHARMAP.get(s[i], parseNumber)(i)
				i = skipSpace(i, False)
				return (i,res)
			i,res = parse(0)
			if i < len(s):
				raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
			return res
175
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	# The original version wrapped this logic in a one-shot generator and
	# called .next() on it — needless indirection with identical behavior.
	try:
		pref = locale.getpreferredencoding()
		# Verify the reported encoding is actually usable; some platforms
		# report names the codec machinery cannot handle.
		u'TEST'.encode(pref)
	except Exception:
		# Fall back to UTF-8 on locale.Error, LookupError, etc.
		pref = 'UTF-8'
	return pref
191
192 def htmlentity_transform(matchobj):
193         """Transforms an HTML entity to a Unicode character.
194
195         This function receives a match object and is intended to be used with
196         the re.sub() function.
197         """
198         entity = matchobj.group(1)
199
200         # Known non-numeric HTML entity
201         if entity in htmlentitydefs.name2codepoint:
202                 return unichr(htmlentitydefs.name2codepoint[entity])
203
204         # Unicode character
205         mobj = re.match(ur'(?u)#(x?\d+)', entity)
206         if mobj is not None:
207                 numstr = mobj.group(1)
208                 if numstr.startswith(u'x'):
209                         base = 16
210                         numstr = u'0%s' % numstr
211                 else:
212                         base = 10
213                 return unichr(long(numstr, base))
214
215         # Unknown entity in name, return its literal representation
216         return (u'&%s;' % entity)
217
def sanitize_title(utitle):
	"""Sanitizes a video title so it could be used as part of a filename."""
	# Decode HTML entities, then replace the path separator so the title
	# cannot introduce extra directory components in the output path.
	utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
	return utitle.replace(unicode(os.sep), u'%')
222
def sanitize_open(filename, open_mode):
	"""Try to open the given filename, and slightly tweak it if this fails.

	Attempts to open the given filename. If this fails, it tries to change
	the filename slightly, step by step, until it's either able to open it
	or it fails and raises a final exception, like the standard open()
	function.

	It returns the tuple (stream, definitive_file_name).
	"""
	try:
		if filename == u'-':
			# '-' means stdout; on Windows switch it to binary mode so the
			# video data is not corrupted by newline translation.
			if sys.platform == 'win32':
				import msvcrt
				msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
			return (sys.stdout, filename)
		stream = open(filename, open_mode)
		return (stream, filename)
	except (IOError, OSError), err:
		# In case of error, try to remove win32 forbidden chars
		filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

		# An exception here should be caught in the caller
		stream = open(filename, open_mode)
		return (stream, filename)
248
def timeconvert(timestr):
	"""Convert RFC 2822 defined time string into system timestamp.

	Returns None when the string cannot be parsed as an RFC 2822 date.
	"""
	parsed = email.utils.parsedate_tz(timestr)
	if parsed is None:
		return None
	return email.utils.mktime_tz(parsed)
256
class DownloadError(Exception):
	"""Download Error exception.

	This exception may be thrown by FileDownloader objects if they are not
	configured to continue on errors. They will contain the appropriate
	error message.
	"""
	# Raised from FileDownloader.trouble() when 'ignoreerrors' is not set.
	pass
265
class SameFileError(Exception):
	"""Same File exception.

	This exception will be thrown by FileDownloader objects if they detect
	multiple files would have to be downloaded to the same file on disk.
	"""
	# Carries no extra state; the message (if any) is set by the raiser.
	pass
273
class PostProcessingError(Exception):
	"""Post Processing exception.

	This exception may be raised by PostProcessor's .run() method to
	indicate an error in the postprocessing task.
	"""
	# Carries no extra state; the message (if any) is set by the raiser.
	pass
281
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	This exception will be thrown when a video is requested
	in a format that is not available for that video.
	"""
	# Carries no extra state; the message (if any) is set by the raiser.
	pass
289
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	This exception may be raised by FileDownloader objects when a file they
	download is too small for what the server announced first, indicating
	the connection was probably interrupted.
	"""
	# Both in bytes
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		# downloaded: bytes actually received; expected: bytes announced
		# by the server (e.g. via Content-Length).
		self.downloaded = downloaded
		self.expected = expected
304
class YoutubeDLHandler(urllib2.HTTPHandler):
	"""Handler for HTTP requests and responses.

	This class, when installed with an OpenerDirector, automatically adds
	the standard headers to every HTTP request and handles gzipped and
	deflated responses from web servers. If compression is to be avoided in
	a particular request, the original request in the program code only has
	to include the HTTP header "Youtubedl-No-Compression", which will be
	removed before making the real request.
	
	Part of this code was copied from:

	  http://techknack.net/python-urllib2-handlers/
	  
	Andrew Rowls, the author of that code, agreed to release it to the
	public domain.
	"""

	@staticmethod
	def deflate(data):
		"""Decompress a 'deflate' body, with or without the zlib header."""
		try:
			# Raw deflate stream (no zlib header), as some servers send.
			return zlib.decompress(data, -zlib.MAX_WBITS)
		except zlib.error:
			# Standard zlib-wrapped stream.
			return zlib.decompress(data)
	
	@staticmethod
	def addinfourl_wrapper(stream, headers, url, code):
		"""Build an addinfourl carrying the HTTP status code, even on old
		Python versions whose addinfourl lacks the code argument."""
		if hasattr(urllib2.addinfourl, 'getcode'):
			return urllib2.addinfourl(stream, headers, url, code)
		ret = urllib2.addinfourl(stream, headers, url)
		ret.code = code
		return ret
	
	def http_request(self, req):
		# Overwrite any caller-set header that collides with std_headers,
		# then honor the Youtubedl-no-compression marker by stripping
		# Accept-encoding (and the marker itself) before sending.
		for h in std_headers:
			if h in req.headers:
				del req.headers[h]
			req.add_header(h, std_headers[h])
		if 'Youtubedl-no-compression' in req.headers:
			if 'Accept-encoding' in req.headers:
				del req.headers['Accept-encoding']
			del req.headers['Youtubedl-no-compression']
		return req

	def http_response(self, req, resp):
		# Transparently decompress the body so callers always read plain
		# bytes regardless of the Content-encoding used by the server.
		old_resp = resp
		# gzip
		if resp.headers.get('Content-encoding', '') == 'gzip':
			gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		# deflate
		if resp.headers.get('Content-encoding', '') == 'deflate':
			gz = StringIO.StringIO(self.deflate(resp.read()))
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		return resp
362
363 class FileDownloader(object):
364         """File Downloader class.
365
366         File downloader objects are the ones responsible of downloading the
367         actual video file and writing it to disk if the user has requested
368         it, among some other tasks. In most cases there should be one per
369         program. As, given a video URL, the downloader doesn't know how to
370         extract all the needed information, task that InfoExtractors do, it
371         has to pass the URL to one of them.
372
373         For this, file downloader objects have a method that allows
374         InfoExtractors to be registered in a given order. When it is passed
375         a URL, the file downloader handles it to the first InfoExtractor it
376         finds that reports being able to handle it. The InfoExtractor extracts
377         all the information about the video or videos the URL refers to, and
378         asks the FileDownloader to process the video information, possibly
379         downloading the video.
380
381         File downloaders accept a lot of parameters. In order not to saturate
382         the object constructor with arguments, it receives a dictionary of
383         options instead. These options are available through the params
384         attribute for the InfoExtractors to use. The FileDownloader also
385         registers itself as the downloader in charge for the InfoExtractors
386         that are added to it, so this is a "mutual registration".
387
388         Available options:
389
390         username:         Username for authentication purposes.
391         password:         Password for authentication purposes.
392         usenetrc:         Use netrc for authentication instead.
393         quiet:            Do not print messages to stdout.
394         forceurl:         Force printing final URL.
395         forcetitle:       Force printing title.
396         forcethumbnail:   Force printing thumbnail URL.
397         forcedescription: Force printing description.
398         forcefilename:    Force printing final filename.
399         simulate:         Do not download the video files.
400         format:           Video format code.
401         format_limit:     Highest quality format to try.
402         outtmpl:          Template for output names.
403         ignoreerrors:     Do not stop on download errors.
404         ratelimit:        Download speed limit, in bytes/sec.
405         nooverwrites:     Prevent overwriting files.
406         retries:          Number of times to retry for HTTP error 5xx
407         continuedl:       Try to continue downloads if possible.
408         noprogress:       Do not print the progress bar.
409         playliststart:    Playlist item to start at.
410         playlistend:      Playlist item to end at.
411         logtostderr:      Log messages to stderr instead of stdout.
412         consoletitle:     Display progress in console window's titlebar.
413         nopart:           Do not use temporary .part files.
414         updatetime:       Use the Last-modified header to set output file timestamps.
415         writedescription: Write the video description to a .description file
416         writeinfojson:    Write the video description to a .info.json file
417         """
418
	# Class-level declarations; all are re-initialised per instance in __init__.
	params = None			# options dictionary (see class docstring)
	_ies = []			# registered InfoExtractors
	_pps = []			# registered PostProcessors
	_download_retcode = None	# process return code (set to 1 by trouble())
	_num_downloads = None		# ordinal counter used by %(autonumber)s
	_screen_file = None		# stream used by to_screen (stdout or stderr)
425
	def __init__(self, params):
		"""Create a FileDownloader object with the given options."""
		self._ies = []
		self._pps = []
		self._download_retcode = 0
		self._num_downloads = 0
		# The boolean indexes the list: False -> stdout, True -> stderr.
		self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
		self.params = params
434
435         @staticmethod
436         def pmkdir(filename):
437                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
438                 components = filename.split(os.sep)
439                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
440                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
441                 for dir in aggregate:
442                         if not os.path.exists(dir):
443                                 os.mkdir(dir)
444
445         @staticmethod
446         def format_bytes(bytes):
447                 if bytes is None:
448                         return 'N/A'
449                 if type(bytes) is str:
450                         bytes = float(bytes)
451                 if bytes == 0.0:
452                         exponent = 0
453                 else:
454                         exponent = long(math.log(bytes, 1024.0))
455                 suffix = 'bkMGTPEZY'[exponent]
456                 converted = float(bytes) / float(1024**exponent)
457                 return '%.2f%s' % (converted, suffix)
458
459         @staticmethod
460         def calc_percent(byte_counter, data_len):
461                 if data_len is None:
462                         return '---.-%'
463                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
464
465         @staticmethod
466         def calc_eta(start, now, total, current):
467                 if total is None:
468                         return '--:--'
469                 dif = now - start
470                 if current == 0 or dif < 0.001: # One millisecond
471                         return '--:--'
472                 rate = float(current) / dif
473                 eta = long((float(total) - float(current)) / rate)
474                 (eta_mins, eta_secs) = divmod(eta, 60)
475                 if eta_mins > 99:
476                         return '--:--'
477                 return '%02d:%02d' % (eta_mins, eta_secs)
478
479         @staticmethod
480         def calc_speed(start, now, bytes):
481                 dif = now - start
482                 if bytes == 0 or dif < 0.001: # One millisecond
483                         return '%10s' % '---b/s'
484                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
485
486         @staticmethod
487         def best_block_size(elapsed_time, bytes):
488                 new_min = max(bytes / 2.0, 1.0)
489                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
490                 if elapsed_time < 0.001:
491                         return long(new_max)
492                 rate = bytes / elapsed_time
493                 if rate > new_max:
494                         return long(new_max)
495                 if rate < new_min:
496                         return long(new_min)
497                 return long(rate)
498
499         @staticmethod
500         def parse_bytes(bytestr):
501                 """Parse a string indicating a byte quantity into a long integer."""
502                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
503                 if matchobj is None:
504                         return None
505                 number = float(matchobj.group(1))
506                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
507                 return long(round(number * multiplier))
508
	def add_info_extractor(self, ie):
		"""Add an InfoExtractor object to the end of the list."""
		self._ies.append(ie)
		# "Mutual registration": the IE also learns about this downloader.
		ie.set_downloader(self)
513
	def add_post_processor(self, pp):
		"""Add a PostProcessor object to the end of the chain."""
		self._pps.append(pp)
		# "Mutual registration": the PP also learns about this downloader.
		pp.set_downloader(self)
518
	def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode."""
		try:
			if not self.params.get('quiet', False):
				# The trailing comma suppresses print's own newline; the
				# terminator controls it instead (empty when skip_eol).
				terminator = [u'\n', u''][skip_eol]
				print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
			self._screen_file.flush()
		except (UnicodeEncodeError), err:
			if not ignore_encoding_errors:
				raise
529
	def to_stderr(self, message):
		"""Print message to stderr, encoded for the current locale."""
		print >>sys.stderr, message.encode(preferredencoding())
533
	def to_cons_title(self, message):
		"""Set console/terminal window title to message."""
		if not self.params.get('consoletitle', False):
			return
		if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
			# c_wchar_p() might not be necessary if `message` is
			# already of type unicode()
			ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
		elif 'TERM' in os.environ:
			# xterm-style escape sequence: ESC ] 0 ; <title> BEL
			sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
544
	def fixed_template(self):
		"""Checks if the output template is fixed.

		True when outtmpl contains no %(...)s placeholder, i.e. every
		download would be written to the same file.
		"""
		return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
548
	def trouble(self, message=None):
		"""Determine action to take when a download problem appears.

		Depending on if the downloader has been configured to ignore
		download errors or not, this method may throw an exception or
		not when errors are found, after printing the message.
		"""
		if message is not None:
			self.to_stderr(message)
		if not self.params.get('ignoreerrors', False):
			raise DownloadError(message)
		# When errors are ignored, still record a non-zero return code.
		self._download_retcode = 1
561
	def slow_down(self, start_time, byte_counter):
		"""Sleep if the download speed is over the rate limit."""
		rate_limit = self.params.get('ratelimit', None)
		if rate_limit is None or byte_counter == 0:
			return
		now = time.time()
		elapsed = now - start_time
		if elapsed <= 0.0:
			return
		speed = float(byte_counter) / elapsed
		if speed > rate_limit:
			# Sleep just long enough for the average to drop back to the limit.
			time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
574
575         def temp_name(self, filename):
576                 """Returns a temporary filename for the given filename."""
577                 if self.params.get('nopart', False) or filename == u'-' or \
578                                 (os.path.exists(filename) and not os.path.isfile(filename)):
579                         return filename
580                 return filename + u'.part'
581
582         def undo_temp_name(self, filename):
583                 if filename.endswith(u'.part'):
584                         return filename[:-len(u'.part')]
585                 return filename
586
	def try_rename(self, old_filename, new_filename):
		"""Rename old_filename to new_filename, reporting (not raising) failures."""
		try:
			if old_filename == new_filename:
				return
			os.rename(old_filename, new_filename)
		except (IOError, OSError), err:
			self.trouble(u'ERROR: unable to rename file')
594         
595         def try_utime(self, filename, last_modified_hdr):
596                 """Try to set the last-modified time of the given file."""
597                 if last_modified_hdr is None:
598                         return
599                 if not os.path.isfile(filename):
600                         return
601                 timestr = last_modified_hdr
602                 if timestr is None:
603                         return
604                 filetime = timeconvert(timestr)
605                 if filetime is None:
606                         return
607                 try:
608                         os.utime(filename,(time.time(), filetime))
609                 except:
610                         pass
611
	def report_writedescription(self, descfn):
		""" Report that the description file is being written """
		self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
615
616         def report_writeinfojson(self, infofn):
617                 """ Report that the metadata file has been written """
618                 self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
619
	def report_destination(self, filename):
		"""Report destination filename."""
		self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
623
	def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
		"""Report download progress."""
		if self.params.get('noprogress', False):
			return
		# \r rewrites the progress line in place; skip_eol keeps the cursor on it.
		self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
				(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
		self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
				(percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
632
	def report_resuming_byte(self, resume_len):
		"""Report attempt to resume at given byte."""
		self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
636
	def report_retry(self, count, retries):
		"""Report retry in case of HTTP error 5xx"""
		self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
640
	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		try:
			self.to_screen(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			# Fall back to a filename-free message if the name cannot be
			# encoded for the console.
			self.to_screen(u'[download] The file has already been downloaded')
647
	def report_unable_to_resume(self):
		"""Report it was impossible to resume download."""
		self.to_screen(u'[download] Unable to resume')
651
	def report_finish(self):
		"""Report download finished."""
		if self.params.get('noprogress', False):
			self.to_screen(u'[download] Download completed')
		else:
			# With a progress bar, just terminate the \r-rewritten line.
			self.to_screen(u'')
658
659         def increment_downloads(self):
660                 """Increment the ordinal that assigns a number to each file."""
661                 self._num_downloads += 1
662
	def prepare_filename(self, info_dict):
		"""Generate the output filename.

		Expands the 'outtmpl' template with the video's info dictionary
		plus two synthetic fields: 'epoch' (current Unix time) and
		'autonumber' (zero-padded ordinal of this download).  Returns the
		filename, or None (after reporting an error) if template expansion
		fails.
		"""
		try:
			template_dict = dict(info_dict)
			template_dict['epoch'] = unicode(long(time.time()))
			template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
			filename = self.params['outtmpl'] % template_dict
			return filename
		except (ValueError, KeyError), err:
			self.trouble(u'ERROR: invalid system charset or erroneous output template')
			return None
674
675         def process_info(self, info_dict):
676                 """Process a single dictionary returned by an InfoExtractor."""
677                 filename = self.prepare_filename(info_dict)
678                 # Do nothing else if in simulate mode
679                 if self.params.get('simulate', False):
680                         # Forced printings
681                         if self.params.get('forcetitle', False):
682                                 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
683                         if self.params.get('forceurl', False):
684                                 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
685                         if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
686                                 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
687                         if self.params.get('forcedescription', False) and 'description' in info_dict:
688                                 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
689                         if self.params.get('forcefilename', False) and filename is not None:
690                                 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
691
692                         return
693
694                 if filename is None:
695                         return
696                 if self.params.get('nooverwrites', False) and os.path.exists(filename):
697                         self.to_stderr(u'WARNING: file exists and will be skipped')
698                         return
699
700                 try:
701                         self.pmkdir(filename)
702                 except (OSError, IOError), err:
703                         self.trouble(u'ERROR: unable to create directories: %s' % str(err))
704                         return
705
706                 if self.params.get('writedescription', False):
707                         try:
708                                 descfn = filename + '.description'
709                                 self.report_writedescription(descfn)
710                                 with contextlib.closing(open(descfn, 'wb')) as descfile:
711                                         descfile.write(info_dict['description'].encode('utf-8'))
712                         except (OSError, IOError):
713                                 self.trouble(u'ERROR: Cannot write description file: %s' % str(descfn))
714                                 return
715
716                 print(repr(self.params))
717                 if self.params.get('writeinfojson', False):
718                         infofn = filename + '.info.json'
719                         self.report_writeinfojson(infofn)
720                         try:
721                                 json.dump
722                         except (NameError,AttributeError):
723                                 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
724                                 return
725                         try:
726                                 with contextlib.closing(open(infofn, 'wb')) as infof:
727                                         json.dump(info_dict, infof)
728                         except (OSError, IOError):
729                                 self.trouble(u'ERROR: Cannot write metadata to JSON file: %s' % str(infofn))
730                                 return
731
732                 try:
733                         success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
734                 except (OSError, IOError), err:
735                         raise UnavailableVideoError
736                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
737                         self.trouble(u'ERROR: unable to download video data: %s' % str(err))
738                         return
739                 except (ContentTooShortError, ), err:
740                         self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
741                         return
742
743                 if success:
744                         try:
745                                 self.post_process(filename, info_dict)
746                         except (PostProcessingError), err:
747                                 self.trouble(u'ERROR: postprocessing: %s' % str(err))
748                                 return
749
750         def download(self, url_list):
751                 """Download a given list of URLs."""
752                 if len(url_list) > 1 and self.fixed_template():
753                         raise SameFileError(self.params['outtmpl'])
754
755                 for url in url_list:
756                         suitable_found = False
757                         for ie in self._ies:
758                                 # Go to next InfoExtractor if not suitable
759                                 if not ie.suitable(url):
760                                         continue
761
762                                 # Suitable InfoExtractor found
763                                 suitable_found = True
764
765                                 # Extract information from URL and process it
766                                 ie.extract(url)
767
768                                 # Suitable InfoExtractor had been found; go to next URL
769                                 break
770
771                         if not suitable_found:
772                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
773
774                 return self._download_retcode
775
776         def post_process(self, filename, ie_info):
777                 """Run the postprocessing chain on the given file."""
778                 info = dict(ie_info)
779                 info['filepath'] = filename
780                 for pp in self._pps:
781                         info = pp.run(info)
782                         if info is None:
783                                 break
784
	def _download_with_rtmpdump(self, filename, url, player_url):
		"""Download an rtmp:// URL by shelling out to the rtmpdump tool.

		Writes to a temporary file, retrying/resuming on rtmpdump exit
		codes 1 and 2, then renames into place.  Returns True on success,
		False on failure.
		"""
		self.report_destination(filename)
		tmpfilename = self.temp_name(filename)

		# Check for rtmpdump first
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrupted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		# The [[], [...]][bool] idiom selects extra arguments when the
		# condition holds (player URL present / resuming enabled).
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(tmpfilename)
			self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(tmpfilename)
			if prevsize == cursize and retval == 1:
				# No bytes were gained by the last retry; give up resuming.
				break
		if retval == 0:
			self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
			self.try_rename(tmpfilename, filename)
			return True
		else:
			self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
			return False
816
	def _do_download(self, filename, url, player_url):
		"""Download url into filename over HTTP (rtmp URLs are delegated
		to _download_with_rtmpdump).

		Writes to a temporary ".part" file, supports resuming a partial
		download via a Range request, retries on HTTP 5xx, adapts the
		block size, applies the rate limit, and finally renames the
		temporary file into place.  Returns True on success, False on
		failure; raises ContentTooShortError if fewer bytes arrive than
		the server announced.
		"""
		# Check file already present
		if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
			self.report_file_already_downloaded(filename)
			return True

		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url, player_url)

		tmpfilename = self.temp_name(filename)
		stream = None
		open_mode = 'wb'

		# Do not include the Accept-Encoding header
		headers = {'Youtubedl-no-compression': 'True'}
		# basic_request is kept without the Range header for the 416
		# fallback below.
		basic_request = urllib2.Request(url, None, headers)
		request = urllib2.Request(url, None, headers)

		# Establish possible resume length
		if os.path.isfile(tmpfilename):
			resume_len = os.path.getsize(tmpfilename)
		else:
			resume_len = 0

		# Request parameters in case of being able to resume
		if self.params.get('continuedl', False) and resume_len != 0:
			self.report_resuming_byte(resume_len)
			request.add_header('Range','bytes=%d-' % resume_len)
			open_mode = 'ab'

		count = 0
		retries = self.params.get('retries', 0)
		while count <= retries:
			# Establish connection
			try:
				data = urllib2.urlopen(request)
				break
			except (urllib2.HTTPError, ), err:
				if (err.code < 500 or err.code >= 600) and err.code != 416:
					# Unexpected HTTP error
					raise
				elif err.code == 416:
					# Unable to resume (requested range not satisfiable)
					try:
						# Open the connection again without the range header
						data = urllib2.urlopen(basic_request)
						content_length = data.info()['Content-Length']
					except (urllib2.HTTPError, ), err:
						if err.code < 500 or err.code >= 600:
							raise
					else:
						# Examine the reported length
						if (content_length is not None and
						    (resume_len - 100 < long(content_length) < resume_len + 100)):
							# The file had already been fully downloaded.
							# Explanation to the above condition: in issue #175 it was revealed that
							# YouTube sometimes adds or removes a few bytes from the end of the file,
							# changing the file size slightly and causing problems for some users. So
							# I decided to implement a suggested change and consider the file
							# completely downloaded if the file size differs less than 100 bytes from
							# the one in the hard drive.
							self.report_file_already_downloaded(filename)
							self.try_rename(tmpfilename, filename)
							return True
						else:
							# The length does not match, we start the download over
							self.report_unable_to_resume()
							open_mode = 'wb'
							break
			# Retry
			count += 1
			if count <= retries:
				self.report_retry(count, retries)

		if count > retries:
			self.trouble(u'ERROR: giving up after %s retries' % retries)
			return False

		# data_len stays None when the server sends no Content-length.
		data_len = data.info().get('Content-length', None)
		if data_len is not None:
			data_len = long(data_len) + resume_len
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0 + resume_len
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			if len(data_block) == 0:
				break
			byte_counter += len(data_block)

			# Open file just in time
			if stream is None:
				try:
					(stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
					filename = self.undo_temp_name(tmpfilename)
					self.report_destination(filename)
				except (OSError, IOError), err:
					self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
					return False
			try:
				stream.write(data_block)
			except (IOError, OSError), err:
				self.trouble(u'\nERROR: unable to write data: %s' % str(err))
				return False
			# Adapt the block size to the observed throughput.
			block_size = self.best_block_size(after - before, len(data_block))

			# Progress message
			# NOTE(review): if the server sent no Content-length, data_len
			# is None here and "data_len - resume_len" below would raise a
			# TypeError — confirm whether that case can occur in practice.
			percent_str = self.calc_percent(byte_counter, data_len)
			eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
			speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
			self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter - resume_len)

		stream.close()
		self.report_finish()
		if data_len is not None and byte_counter != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		self.try_rename(tmpfilename, filename)

		# Update file modification time
		if self.params.get('updatetime', True):
			self.try_utime(filename, data.info().get('last-modified', None))

		return True
948
class InfoExtractor(object):
	"""Base class for information extractors.

	An information extractor takes a URL and produces, for each video the
	URL refers to, a dictionary of metadata that is handed to the
	FileDownloader (which may then download the video, print information,
	and so on).  Every dictionary must carry these fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	These fields are optional and only consulted by the corresponding
	forced-printing options (useful when youtube-dl acts as a backend for
	a video search frontend such as youtube2mp3):

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Concrete extractors override _real_initialize(), _real_extract() and
	the suitable() static method, and are typically instantiated and
	registered with the main downloader.
	"""

	# Whether _real_initialize() has already run for this instance.
	_ready = False
	# The FileDownloader this extractor reports to (may be None).
	_downloader = None

	def __init__(self, downloader=None):
		"""Create the extractor and attach the optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	@staticmethod
	def suitable(url):
		"""Return True when this extractor can handle the given URL."""
		return False

	def initialize(self):
		"""Run one-time initialization (authentication, etc.) lazily."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Initialize if needed, then extract and return the info dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Attach the FileDownloader used for reporting and options."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Actual initialization process; redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Actual extraction process; redefine in subclasses."""
		pass
1019
1020 class YoutubeIE(InfoExtractor):
1021         """Information extractor for youtube.com."""
1022
1023         _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
1024         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1025         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1026         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1027         _NETRC_MACHINE = 'youtube'
1028         # Listed in order of quality
1029         _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
1030         _video_extensions = {
1031                 '13': '3gp',
1032                 '17': 'mp4',
1033                 '18': 'mp4',
1034                 '22': 'mp4',
1035                 '37': 'mp4',
1036                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1037                 '43': 'webm',
1038                 '45': 'webm',
1039         }
1040
1041         @staticmethod
1042         def suitable(url):
1043                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
1044
1045         def report_lang(self):
1046                 """Report attempt to set language."""
1047                 self._downloader.to_screen(u'[youtube] Setting language')
1048
1049         def report_login(self):
1050                 """Report attempt to log in."""
1051                 self._downloader.to_screen(u'[youtube] Logging in')
1052
1053         def report_age_confirmation(self):
1054                 """Report attempt to confirm age."""
1055                 self._downloader.to_screen(u'[youtube] Confirming age')
1056
1057         def report_video_webpage_download(self, video_id):
1058                 """Report attempt to download video webpage."""
1059                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1060
1061         def report_video_info_webpage_download(self, video_id):
1062                 """Report attempt to download video info webpage."""
1063                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1064
1065         def report_information_extraction(self, video_id):
1066                 """Report attempt to extract video information."""
1067                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1068
1069         def report_unavailable_format(self, video_id, format):
1070                 """Report extracted video URL."""
1071                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1072
1073         def report_rtmp_download(self):
1074                 """Indicate the download will use the RTMP protocol."""
1075                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1076
	def _real_initialize(self):
		"""Prepare the YouTube session: set language, log in, confirm age.

		Credentials come from the --username/--password options or, with
		--netrc, from the 'youtube' machine entry in ~/.netrc.  Language
		and login failures are reported as non-fatal warnings; only age
		confirmation failure is treated as an error.
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':         '/',
				'action_login': 'Log In',
				'username':     username,
				'password':     password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the response still contains the login form, the
			# credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':             '/',
				'action_confirm':       'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return
1145
1146         def _real_extract(self, url):
1147                 # Extract video id from URL
1148                 mobj = re.match(self._VALID_URL, url)
1149                 if mobj is None:
1150                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1151                         return
1152                 video_id = mobj.group(2)
1153
1154                 # Get video webpage
1155                 self.report_video_webpage_download(video_id)
1156                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id)
1157                 try:
1158                         video_webpage = urllib2.urlopen(request).read()
1159                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1160                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1161                         return
1162
1163                 # Attempt to extract SWF player URL
1164                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1165                 if mobj is not None:
1166                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1167                 else:
1168                         player_url = None
1169
1170                 # Get video info
1171                 self.report_video_info_webpage_download(video_id)
1172                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1173                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1174                                            % (video_id, el_type))
1175                         request = urllib2.Request(video_info_url)
1176                         try:
1177                                 video_info_webpage = urllib2.urlopen(request).read()
1178                                 video_info = parse_qs(video_info_webpage)
1179                                 if 'token' in video_info:
1180                                         break
1181                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1182                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1183                                 return
1184                 if 'token' not in video_info:
1185                         if 'reason' in video_info:
1186                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1187                         else:
1188                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1189                         return
1190
1191                 # Start extracting information
1192                 self.report_information_extraction(video_id)
1193
1194                 # uploader
1195                 if 'author' not in video_info:
1196                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1197                         return
1198                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1199
1200                 # title
1201                 if 'title' not in video_info:
1202                         self._downloader.trouble(u'ERROR: unable to extract video title')
1203                         return
1204                 video_title = urllib.unquote_plus(video_info['title'][0])
1205                 video_title = video_title.decode('utf-8')
1206                 video_title = sanitize_title(video_title)
1207
1208                 # simplified title
1209                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1210                 simple_title = simple_title.strip(ur'_')
1211
1212                 # thumbnail image
1213                 if 'thumbnail_url' not in video_info:
1214                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1215                         video_thumbnail = ''
1216                 else:   # don't panic if we can't find it
1217                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1218
1219                 # upload date
1220                 upload_date = u'NA'
1221                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1222                 if mobj is not None:
1223                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1224                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1225                         for expression in format_expressions:
1226                                 try:
1227                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1228                                 except:
1229                                         pass
1230
1231                 # description
1232                 try:
1233                         lxml.etree
1234                 except NameError:
1235                         video_description = u'No description available.'
1236                         if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1237                                 warnings.warn(u'You are using an old Python version, install Python 2.6+ or lxml. Falling back to old video description extractor.')
1238                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1239                                 if mobj is not None:
1240                                         video_description = mobj.group(1).decode('utf-8')
1241                 else:
1242                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1243                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1244                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1245
1246                 # token
1247                 video_token = urllib.unquote_plus(video_info['token'][0])
1248
1249                 # Decide which formats to download
1250                 req_format = self._downloader.params.get('format', None)
1251
1252                 if 'fmt_url_map' in video_info and len(video_info['fmt_url_map']) >= 1 and ',' in video_info['fmt_url_map'][0]:
1253                         url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
1254                         format_limit = self._downloader.params.get('format_limit', None)
1255                         if format_limit is not None and format_limit in self._available_formats:
1256                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1257                         else:
1258                                 format_list = self._available_formats
1259                         existing_formats = [x for x in format_list if x in url_map]
1260                         if len(existing_formats) == 0:
1261                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1262                                 return
1263                         if req_format is None:
1264                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1265                         elif req_format == '-1':
1266                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1267                         else:
1268                                 # Specific format
1269                                 if req_format not in url_map:
1270                                         self._downloader.trouble(u'ERROR: requested format not available')
1271                                         return
1272                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
1273
1274                 elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1275                         self.report_rtmp_download()
1276                         video_url_list = [(None, video_info['conn'][0])]
1277
1278                 else:
1279                         self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
1280                         return
1281
1282                 for format_param, video_real_url in video_url_list:
1283                         # At this point we have a new video
1284                         self._downloader.increment_downloads()
1285
1286                         # Extension
1287                         video_extension = self._video_extensions.get(format_param, 'flv')
1288
1289                         # Find the video URL in fmt_url_map or conn paramters
1290                         try:
1291                                 # Process video information
1292                                 self._downloader.process_info({
1293                                         'id':           video_id.decode('utf-8'),
1294                                         'url':          video_real_url.decode('utf-8'),
1295                                         'uploader':     video_uploader.decode('utf-8'),
1296                                         'upload_date':  upload_date,
1297                                         'title':        video_title,
1298                                         'stitle':       simple_title,
1299                                         'ext':          video_extension.decode('utf-8'),
1300                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1301                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1302                                         'description':  video_description,
1303                                         'player_url':   player_url,
1304                                 })
1305                         except UnavailableVideoError, err:
1306                                 self._downloader.trouble(u'\nERROR: unable to download video')
1307
1308
1309 class MetacafeIE(InfoExtractor):
1310         """Information Extractor for metacafe.com."""
1311
1312         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1313         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1314         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1315         _youtube_ie = None
1316
1317         def __init__(self, youtube_ie, downloader=None):
1318                 InfoExtractor.__init__(self, downloader)
1319                 self._youtube_ie = youtube_ie
1320
1321         @staticmethod
1322         def suitable(url):
1323                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1324
1325         def report_disclaimer(self):
1326                 """Report disclaimer retrieval."""
1327                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1328
1329         def report_age_confirmation(self):
1330                 """Report attempt to confirm age."""
1331                 self._downloader.to_screen(u'[metacafe] Confirming age')
1332
1333         def report_download_webpage(self, video_id):
1334                 """Report webpage download."""
1335                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1336
1337         def report_extraction(self, video_id):
1338                 """Report information extraction."""
1339                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1340
1341         def _real_initialize(self):
1342                 # Retrieve disclaimer
1343                 request = urllib2.Request(self._DISCLAIMER)
1344                 try:
1345                         self.report_disclaimer()
1346                         disclaimer = urllib2.urlopen(request).read()
1347                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1348                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1349                         return
1350
1351                 # Confirm age
1352                 disclaimer_form = {
1353                         'filters': '0',
1354                         'submit': "Continue - I'm over 18",
1355                         }
1356                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1357                 try:
1358                         self.report_age_confirmation()
1359                         disclaimer = urllib2.urlopen(request).read()
1360                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1361                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1362                         return
1363
1364         def _real_extract(self, url):
1365                 # Extract id and simplified title from URL
1366                 mobj = re.match(self._VALID_URL, url)
1367                 if mobj is None:
1368                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1369                         return
1370
1371                 video_id = mobj.group(1)
1372
1373                 # Check if video comes from YouTube
1374                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1375                 if mobj2 is not None:
1376                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1377                         return
1378
1379                 # At this point we have a new video
1380                 self._downloader.increment_downloads()
1381
1382                 simple_title = mobj.group(2).decode('utf-8')
1383
1384                 # Retrieve video webpage to extract further information
1385                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1386                 try:
1387                         self.report_download_webpage(video_id)
1388                         webpage = urllib2.urlopen(request).read()
1389                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1390                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1391                         return
1392
1393                 # Extract URL, uploader and title from webpage
1394                 self.report_extraction(video_id)
1395                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1396                 if mobj is not None:
1397                         mediaURL = urllib.unquote(mobj.group(1))
1398                         video_extension = mediaURL[-3:]
1399
1400                         # Extract gdaKey if available
1401                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1402                         if mobj is None:
1403                                 video_url = mediaURL
1404                         else:
1405                                 gdaKey = mobj.group(1)
1406                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1407                 else:
1408                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1409                         if mobj is None:
1410                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1411                                 return
1412                         vardict = parse_qs(mobj.group(1))
1413                         if 'mediaData' not in vardict:
1414                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1415                                 return
1416                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1417                         if mobj is None:
1418                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1419                                 return
1420                         mediaURL = mobj.group(1).replace('\\/', '/')
1421                         video_extension = mediaURL[-3:]
1422                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1423
1424                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1425                 if mobj is None:
1426                         self._downloader.trouble(u'ERROR: unable to extract title')
1427                         return
1428                 video_title = mobj.group(1).decode('utf-8')
1429                 video_title = sanitize_title(video_title)
1430
1431                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1432                 if mobj is None:
1433                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1434                         return
1435                 video_uploader = mobj.group(1)
1436
1437                 try:
1438                         # Process video information
1439                         self._downloader.process_info({
1440                                 'id':           video_id.decode('utf-8'),
1441                                 'url':          video_url.decode('utf-8'),
1442                                 'uploader':     video_uploader.decode('utf-8'),
1443                                 'upload_date':  u'NA',
1444                                 'title':        video_title,
1445                                 'stitle':       simple_title,
1446                                 'ext':          video_extension.decode('utf-8'),
1447                                 'format':       u'NA',
1448                                 'player_url':   None,
1449                         })
1450                 except UnavailableVideoError:
1451                         self._downloader.trouble(u'\nERROR: unable to download video')
1452
1453
1454 class DailymotionIE(InfoExtractor):
1455         """Information Extractor for Dailymotion"""
1456
1457         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1458
1459         def __init__(self, downloader=None):
1460                 InfoExtractor.__init__(self, downloader)
1461
1462         @staticmethod
1463         def suitable(url):
1464                 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1465
1466         def report_download_webpage(self, video_id):
1467                 """Report webpage download."""
1468                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1469
1470         def report_extraction(self, video_id):
1471                 """Report information extraction."""
1472                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1473
1474         def _real_initialize(self):
1475                 return
1476
1477         def _real_extract(self, url):
1478                 # Extract id and simplified title from URL
1479                 mobj = re.match(self._VALID_URL, url)
1480                 if mobj is None:
1481                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1482                         return
1483
1484                 # At this point we have a new video
1485                 self._downloader.increment_downloads()
1486                 video_id = mobj.group(1)
1487
1488                 simple_title = mobj.group(2).decode('utf-8')
1489                 video_extension = 'flv'
1490
1491                 # Retrieve video webpage to extract further information
1492                 request = urllib2.Request(url)
1493                 try:
1494                         self.report_download_webpage(video_id)
1495                         webpage = urllib2.urlopen(request).read()
1496                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1497                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1498                         return
1499
1500                 # Extract URL, uploader and title from webpage
1501                 self.report_extraction(video_id)
1502                 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1503                 if mobj is None:
1504                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1505                         return
1506                 mediaURL = urllib.unquote(mobj.group(1))
1507
1508                 # if needed add http://www.dailymotion.com/ if relative URL
1509
1510                 video_url = mediaURL
1511
1512                 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1513                 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1514                 if mobj is None:
1515                         self._downloader.trouble(u'ERROR: unable to extract title')
1516                         return
1517                 video_title = mobj.group(1).decode('utf-8')
1518                 video_title = sanitize_title(video_title)
1519
1520                 mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
1521                 if mobj is None:
1522                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1523                         return
1524                 video_uploader = mobj.group(1)
1525
1526                 try:
1527                         # Process video information
1528                         self._downloader.process_info({
1529                                 'id':           video_id.decode('utf-8'),
1530                                 'url':          video_url.decode('utf-8'),
1531                                 'uploader':     video_uploader.decode('utf-8'),
1532                                 'upload_date':  u'NA',
1533                                 'title':        video_title,
1534                                 'stitle':       simple_title,
1535                                 'ext':          video_extension.decode('utf-8'),
1536                                 'format':       u'NA',
1537                                 'player_url':   None,
1538                         })
1539                 except UnavailableVideoError:
1540                         self._downloader.trouble(u'\nERROR: unable to download video')
1541
1542 class GoogleIE(InfoExtractor):
1543         """Information extractor for video.google.com."""
1544
1545         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1546
1547         def __init__(self, downloader=None):
1548                 InfoExtractor.__init__(self, downloader)
1549
1550         @staticmethod
1551         def suitable(url):
1552                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1553
1554         def report_download_webpage(self, video_id):
1555                 """Report webpage download."""
1556                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1557
1558         def report_extraction(self, video_id):
1559                 """Report information extraction."""
1560                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1561
1562         def _real_initialize(self):
1563                 return
1564
1565         def _real_extract(self, url):
1566                 # Extract id from URL
1567                 mobj = re.match(self._VALID_URL, url)
1568                 if mobj is None:
1569                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1570                         return
1571
1572                 # At this point we have a new video
1573                 self._downloader.increment_downloads()
1574                 video_id = mobj.group(1)
1575
1576                 video_extension = 'mp4'
1577
1578                 # Retrieve video webpage to extract further information
1579                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1580                 try:
1581                         self.report_download_webpage(video_id)
1582                         webpage = urllib2.urlopen(request).read()
1583                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1584                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1585                         return
1586
1587                 # Extract URL, uploader, and title from webpage
1588                 self.report_extraction(video_id)
1589                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1590                 if mobj is None:
1591                         video_extension = 'flv'
1592                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1593                 if mobj is None:
1594                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1595                         return
1596                 mediaURL = urllib.unquote(mobj.group(1))
1597                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1598                 mediaURL = mediaURL.replace('\\x26', '\x26')
1599
1600                 video_url = mediaURL
1601
1602                 mobj = re.search(r'<title>(.*)</title>', webpage)
1603                 if mobj is None:
1604                         self._downloader.trouble(u'ERROR: unable to extract title')
1605                         return
1606                 video_title = mobj.group(1).decode('utf-8')
1607                 video_title = sanitize_title(video_title)
1608                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1609
1610                 # Extract video description
1611                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1612                 if mobj is None:
1613                         self._downloader.trouble(u'ERROR: unable to extract video description')
1614                         return
1615                 video_description = mobj.group(1).decode('utf-8')
1616                 if not video_description:
1617                         video_description = 'No description available.'
1618
1619                 # Extract video thumbnail
1620                 if self._downloader.params.get('forcethumbnail', False):
1621                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1622                         try:
1623                                 webpage = urllib2.urlopen(request).read()
1624                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1625                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1626                                 return
1627                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1628                         if mobj is None:
1629                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1630                                 return
1631                         video_thumbnail = mobj.group(1)
1632                 else:   # we need something to pass to process_info
1633                         video_thumbnail = ''
1634
1635
1636                 try:
1637                         # Process video information
1638                         self._downloader.process_info({
1639                                 'id':           video_id.decode('utf-8'),
1640                                 'url':          video_url.decode('utf-8'),
1641                                 'uploader':     u'NA',
1642                                 'upload_date':  u'NA',
1643                                 'title':        video_title,
1644                                 'stitle':       simple_title,
1645                                 'ext':          video_extension.decode('utf-8'),
1646                                 'format':       u'NA',
1647                                 'player_url':   None,
1648                         })
1649                 except UnavailableVideoError:
1650                         self._downloader.trouble(u'\nERROR: unable to download video')
1651
1652
1653 class PhotobucketIE(InfoExtractor):
1654         """Information extractor for photobucket.com."""
1655
1656         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1657
1658         def __init__(self, downloader=None):
1659                 InfoExtractor.__init__(self, downloader)
1660
1661         @staticmethod
1662         def suitable(url):
1663                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1664
1665         def report_download_webpage(self, video_id):
1666                 """Report webpage download."""
1667                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1668
1669         def report_extraction(self, video_id):
1670                 """Report information extraction."""
1671                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1672
1673         def _real_initialize(self):
1674                 return
1675
1676         def _real_extract(self, url):
1677                 # Extract id from URL
1678                 mobj = re.match(self._VALID_URL, url)
1679                 if mobj is None:
1680                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1681                         return
1682
1683                 # At this point we have a new video
1684                 self._downloader.increment_downloads()
1685                 video_id = mobj.group(1)
1686
1687                 video_extension = 'flv'
1688
1689                 # Retrieve video webpage to extract further information
1690                 request = urllib2.Request(url)
1691                 try:
1692                         self.report_download_webpage(video_id)
1693                         webpage = urllib2.urlopen(request).read()
1694                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1695                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1696                         return
1697
1698                 # Extract URL, uploader, and title from webpage
1699                 self.report_extraction(video_id)
1700                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1701                 if mobj is None:
1702                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1703                         return
1704                 mediaURL = urllib.unquote(mobj.group(1))
1705
1706                 video_url = mediaURL
1707
1708                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1709                 if mobj is None:
1710                         self._downloader.trouble(u'ERROR: unable to extract title')
1711                         return
1712                 video_title = mobj.group(1).decode('utf-8')
1713                 video_title = sanitize_title(video_title)
1714                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1715
1716                 video_uploader = mobj.group(2).decode('utf-8')
1717
1718                 try:
1719                         # Process video information
1720                         self._downloader.process_info({
1721                                 'id':           video_id.decode('utf-8'),
1722                                 'url':          video_url.decode('utf-8'),
1723                                 'uploader':     video_uploader,
1724                                 'upload_date':  u'NA',
1725                                 'title':        video_title,
1726                                 'stitle':       simple_title,
1727                                 'ext':          video_extension.decode('utf-8'),
1728                                 'format':       u'NA',
1729                                 'player_url':   None,
1730                         })
1731                 except UnavailableVideoError:
1732                         self._downloader.trouble(u'\nERROR: unable to download video')
1733
1734
1735 class YahooIE(InfoExtractor):
1736         """Information extractor for video.yahoo.com."""
1737
1738         # _VALID_URL matches all Yahoo! Video URLs
1739         # _VPAGE_URL matches only the extractable '/watch/' URLs
1740         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1741         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1742
1743         def __init__(self, downloader=None):
1744                 InfoExtractor.__init__(self, downloader)
1745
1746         @staticmethod
1747         def suitable(url):
1748                 return (re.match(YahooIE._VALID_URL, url) is not None)
1749
1750         def report_download_webpage(self, video_id):
1751                 """Report webpage download."""
1752                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1753
1754         def report_extraction(self, video_id):
1755                 """Report information extraction."""
1756                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1757
1758         def _real_initialize(self):
1759                 return
1760
1761         def _real_extract(self, url, new_video=True):
1762                 # Extract ID from URL
1763                 mobj = re.match(self._VALID_URL, url)
1764                 if mobj is None:
1765                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1766                         return
1767
1768                 # At this point we have a new video
1769                 self._downloader.increment_downloads()
1770                 video_id = mobj.group(2)
1771                 video_extension = 'flv'
1772
1773                 # Rewrite valid but non-extractable URLs as
1774                 # extractable English language /watch/ URLs
1775                 if re.match(self._VPAGE_URL, url) is None:
1776                         request = urllib2.Request(url)
1777                         try:
1778                                 webpage = urllib2.urlopen(request).read()
1779                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1780                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1781                                 return
1782
1783                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1784                         if mobj is None:
1785                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1786                                 return
1787                         yahoo_id = mobj.group(1)
1788
1789                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1790                         if mobj is None:
1791                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1792                                 return
1793                         yahoo_vid = mobj.group(1)
1794
1795                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1796                         return self._real_extract(url, new_video=False)
1797
1798                 # Retrieve video webpage to extract further information
1799                 request = urllib2.Request(url)
1800                 try:
1801                         self.report_download_webpage(video_id)
1802                         webpage = urllib2.urlopen(request).read()
1803                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1804                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1805                         return
1806
1807                 # Extract uploader and title from webpage
1808                 self.report_extraction(video_id)
1809                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1810                 if mobj is None:
1811                         self._downloader.trouble(u'ERROR: unable to extract video title')
1812                         return
1813                 video_title = mobj.group(1).decode('utf-8')
1814                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1815
1816                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1817                 if mobj is None:
1818                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1819                         return
1820                 video_uploader = mobj.group(1).decode('utf-8')
1821
1822                 # Extract video thumbnail
1823                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1824                 if mobj is None:
1825                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1826                         return
1827                 video_thumbnail = mobj.group(1).decode('utf-8')
1828
1829                 # Extract video description
1830                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1831                 if mobj is None:
1832                         self._downloader.trouble(u'ERROR: unable to extract video description')
1833                         return
1834                 video_description = mobj.group(1).decode('utf-8')
1835                 if not video_description: video_description = 'No description available.'
1836
1837                 # Extract video height and width
1838                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1839                 if mobj is None:
1840                         self._downloader.trouble(u'ERROR: unable to extract video height')
1841                         return
1842                 yv_video_height = mobj.group(1)
1843
1844                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1845                 if mobj is None:
1846                         self._downloader.trouble(u'ERROR: unable to extract video width')
1847                         return
1848                 yv_video_width = mobj.group(1)
1849
1850                 # Retrieve video playlist to extract media URL
1851                 # I'm not completely sure what all these options are, but we
1852                 # seem to need most of them, otherwise the server sends a 401.
1853                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1854                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1855                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1856                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1857                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1858                 try:
1859                         self.report_download_webpage(video_id)
1860                         webpage = urllib2.urlopen(request).read()
1861                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1862                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1863                         return
1864
1865                 # Extract media URL from playlist XML
1866                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1867                 if mobj is None:
1868                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1869                         return
1870                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1871                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1872
1873                 try:
1874                         # Process video information
1875                         self._downloader.process_info({
1876                                 'id':           video_id.decode('utf-8'),
1877                                 'url':          video_url,
1878                                 'uploader':     video_uploader,
1879                                 'upload_date':  u'NA',
1880                                 'title':        video_title,
1881                                 'stitle':       simple_title,
1882                                 'ext':          video_extension.decode('utf-8'),
1883                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1884                                 'description':  video_description,
1885                                 'thumbnail':    video_thumbnail,
1886                                 'description':  video_description,
1887                                 'player_url':   None,
1888                         })
1889                 except UnavailableVideoError:
1890                         self._downloader.trouble(u'\nERROR: unable to download video')
1891
1892
1893 class GenericIE(InfoExtractor):
1894         """Generic last-resort information extractor."""
1895
1896         def __init__(self, downloader=None):
1897                 InfoExtractor.__init__(self, downloader)
1898
1899         @staticmethod
1900         def suitable(url):
1901                 return True
1902
1903         def report_download_webpage(self, video_id):
1904                 """Report webpage download."""
1905                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1906                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1907
1908         def report_extraction(self, video_id):
1909                 """Report information extraction."""
1910                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1911
1912         def _real_initialize(self):
1913                 return
1914
1915         def _real_extract(self, url):
1916                 # At this point we have a new video
1917                 self._downloader.increment_downloads()
1918
1919                 video_id = url.split('/')[-1]
1920                 request = urllib2.Request(url)
1921                 try:
1922                         self.report_download_webpage(video_id)
1923                         webpage = urllib2.urlopen(request).read()
1924                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1925                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1926                         return
1927                 except ValueError, err:
1928                         # since this is the last-resort InfoExtractor, if
1929                         # this error is thrown, it'll be thrown here
1930                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1931                         return
1932
1933                 self.report_extraction(video_id)
1934                 # Start with something easy: JW Player in SWFObject
1935                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1936                 if mobj is None:
1937                         # Broaden the search a little bit
1938                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1939                 if mobj is None:
1940                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1941                         return
1942
1943                 # It's possible that one of the regexes
1944                 # matched, but returned an empty group:
1945                 if mobj.group(1) is None:
1946                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1947                         return
1948
1949                 video_url = urllib.unquote(mobj.group(1))
1950                 video_id  = os.path.basename(video_url)
1951
1952                 # here's a fun little line of code for you:
1953                 video_extension = os.path.splitext(video_id)[1][1:]
1954                 video_id        = os.path.splitext(video_id)[0]
1955
1956                 # it's tempting to parse this further, but you would
1957                 # have to take into account all the variations like
1958                 #   Video Title - Site Name
1959                 #   Site Name | Video Title
1960                 #   Video Title - Tagline | Site Name
1961                 # and so on and so forth; it's just not practical
1962                 mobj = re.search(r'<title>(.*)</title>', webpage)
1963                 if mobj is None:
1964                         self._downloader.trouble(u'ERROR: unable to extract title')
1965                         return
1966                 video_title = mobj.group(1).decode('utf-8')
1967                 video_title = sanitize_title(video_title)
1968                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1969
1970                 # video uploader is domain name
1971                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1972                 if mobj is None:
1973                         self._downloader.trouble(u'ERROR: unable to extract title')
1974                         return
1975                 video_uploader = mobj.group(1).decode('utf-8')
1976
1977                 try:
1978                         # Process video information
1979                         self._downloader.process_info({
1980                                 'id':           video_id.decode('utf-8'),
1981                                 'url':          video_url.decode('utf-8'),
1982                                 'uploader':     video_uploader,
1983                                 'upload_date':  u'NA',
1984                                 'title':        video_title,
1985                                 'stitle':       simple_title,
1986                                 'ext':          video_extension.decode('utf-8'),
1987                                 'format':       u'NA',
1988                                 'player_url':   None,
1989                         })
1990                 except UnavailableVideoError, err:
1991                         self._downloader.trouble(u'\nERROR: unable to download video')
1992
1993
1994 class YoutubeSearchIE(InfoExtractor):
1995         """Information Extractor for YouTube search queries."""
1996         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1997         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1998         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1999         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2000         _youtube_ie = None
2001         _max_youtube_results = 1000
2002
2003         def __init__(self, youtube_ie, downloader=None):
2004                 InfoExtractor.__init__(self, downloader)
2005                 self._youtube_ie = youtube_ie
2006
2007         @staticmethod
2008         def suitable(url):
2009                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
2010
2011         def report_download_page(self, query, pagenum):
2012                 """Report attempt to download playlist page with given number."""
2013                 query = query.decode(preferredencoding())
2014                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2015
2016         def _real_initialize(self):
2017                 self._youtube_ie.initialize()
2018
2019         def _real_extract(self, query):
2020                 mobj = re.match(self._VALID_QUERY, query)
2021                 if mobj is None:
2022                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2023                         return
2024
2025                 prefix, query = query.split(':')
2026                 prefix = prefix[8:]
2027                 query  = query.encode('utf-8')
2028                 if prefix == '':
2029                         self._download_n_results(query, 1)
2030                         return
2031                 elif prefix == 'all':
2032                         self._download_n_results(query, self._max_youtube_results)
2033                         return
2034                 else:
2035                         try:
2036                                 n = long(prefix)
2037                                 if n <= 0:
2038                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2039                                         return
2040                                 elif n > self._max_youtube_results:
2041                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
2042                                         n = self._max_youtube_results
2043                                 self._download_n_results(query, n)
2044                                 return
2045                         except ValueError: # parsing prefix as integer fails
2046                                 self._download_n_results(query, 1)
2047                                 return
2048
2049         def _download_n_results(self, query, n):
2050                 """Downloads a specified number of results for a query"""
2051
2052                 video_ids = []
2053                 already_seen = set()
2054                 pagenum = 1
2055
2056                 while True:
2057                         self.report_download_page(query, pagenum)
2058                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2059                         request = urllib2.Request(result_url)
2060                         try:
2061                                 page = urllib2.urlopen(request).read()
2062                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2063                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2064                                 return
2065
2066                         # Extract video identifiers
2067                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2068                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2069                                 if video_id not in already_seen:
2070                                         video_ids.append(video_id)
2071                                         already_seen.add(video_id)
2072                                         if len(video_ids) == n:
2073                                                 # Specified n videos reached
2074                                                 for id in video_ids:
2075                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2076                                                 return
2077
2078                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2079                                 for id in video_ids:
2080                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2081                                 return
2082
2083                         pagenum = pagenum + 1
2084
2085 class GoogleSearchIE(InfoExtractor):
2086         """Information Extractor for Google Video search queries."""
2087         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
2088         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2089         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2090         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2091         _google_ie = None
2092         _max_google_results = 1000
2093
2094         def __init__(self, google_ie, downloader=None):
2095                 InfoExtractor.__init__(self, downloader)
2096                 self._google_ie = google_ie
2097
2098         @staticmethod
2099         def suitable(url):
2100                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
2101
2102         def report_download_page(self, query, pagenum):
2103                 """Report attempt to download playlist page with given number."""
2104                 query = query.decode(preferredencoding())
2105                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2106
2107         def _real_initialize(self):
2108                 self._google_ie.initialize()
2109
2110         def _real_extract(self, query):
2111                 mobj = re.match(self._VALID_QUERY, query)
2112                 if mobj is None:
2113                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2114                         return
2115
2116                 prefix, query = query.split(':')
2117                 prefix = prefix[8:]
2118                 query  = query.encode('utf-8')
2119                 if prefix == '':
2120                         self._download_n_results(query, 1)
2121                         return
2122                 elif prefix == 'all':
2123                         self._download_n_results(query, self._max_google_results)
2124                         return
2125                 else:
2126                         try:
2127                                 n = long(prefix)
2128                                 if n <= 0:
2129                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2130                                         return
2131                                 elif n > self._max_google_results:
2132                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
2133                                         n = self._max_google_results
2134                                 self._download_n_results(query, n)
2135                                 return
2136                         except ValueError: # parsing prefix as integer fails
2137                                 self._download_n_results(query, 1)
2138                                 return
2139
2140         def _download_n_results(self, query, n):
2141                 """Downloads a specified number of results for a query"""
2142
2143                 video_ids = []
2144                 already_seen = set()
2145                 pagenum = 1
2146
2147                 while True:
2148                         self.report_download_page(query, pagenum)
2149                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2150                         request = urllib2.Request(result_url)
2151                         try:
2152                                 page = urllib2.urlopen(request).read()
2153                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2154                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2155                                 return
2156
2157                         # Extract video identifiers
2158                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2159                                 video_id = mobj.group(1)
2160                                 if video_id not in already_seen:
2161                                         video_ids.append(video_id)
2162                                         already_seen.add(video_id)
2163                                         if len(video_ids) == n:
2164                                                 # Specified n videos reached
2165                                                 for id in video_ids:
2166                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2167                                                 return
2168
2169                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2170                                 for id in video_ids:
2171                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2172                                 return
2173
2174                         pagenum = pagenum + 1
2175
2176 class YahooSearchIE(InfoExtractor):
2177         """Information Extractor for Yahoo! Video search queries."""
2178         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
2179         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2180         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2181         _MORE_PAGES_INDICATOR = r'\s*Next'
2182         _yahoo_ie = None
2183         _max_yahoo_results = 1000
2184
2185         def __init__(self, yahoo_ie, downloader=None):
2186                 InfoExtractor.__init__(self, downloader)
2187                 self._yahoo_ie = yahoo_ie
2188
2189         @staticmethod
2190         def suitable(url):
2191                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2192
2193         def report_download_page(self, query, pagenum):
2194                 """Report attempt to download playlist page with given number."""
2195                 query = query.decode(preferredencoding())
2196                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2197
2198         def _real_initialize(self):
2199                 self._yahoo_ie.initialize()
2200
2201         def _real_extract(self, query):
2202                 mobj = re.match(self._VALID_QUERY, query)
2203                 if mobj is None:
2204                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2205                         return
2206
2207                 prefix, query = query.split(':')
2208                 prefix = prefix[8:]
2209                 query  = query.encode('utf-8')
2210                 if prefix == '':
2211                         self._download_n_results(query, 1)
2212                         return
2213                 elif prefix == 'all':
2214                         self._download_n_results(query, self._max_yahoo_results)
2215                         return
2216                 else:
2217                         try:
2218                                 n = long(prefix)
2219                                 if n <= 0:
2220                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2221                                         return
2222                                 elif n > self._max_yahoo_results:
2223                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
2224                                         n = self._max_yahoo_results
2225                                 self._download_n_results(query, n)
2226                                 return
2227                         except ValueError: # parsing prefix as integer fails
2228                                 self._download_n_results(query, 1)
2229                                 return
2230
2231         def _download_n_results(self, query, n):
2232                 """Downloads a specified number of results for a query"""
2233
2234                 video_ids = []
2235                 already_seen = set()
2236                 pagenum = 1
2237
2238                 while True:
2239                         self.report_download_page(query, pagenum)
2240                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2241                         request = urllib2.Request(result_url)
2242                         try:
2243                                 page = urllib2.urlopen(request).read()
2244                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2245                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2246                                 return
2247
2248                         # Extract video identifiers
2249                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2250                                 video_id = mobj.group(1)
2251                                 if video_id not in already_seen:
2252                                         video_ids.append(video_id)
2253                                         already_seen.add(video_id)
2254                                         if len(video_ids) == n:
2255                                                 # Specified n videos reached
2256                                                 for id in video_ids:
2257                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2258                                                 return
2259
2260                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2261                                 for id in video_ids:
2262                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2263                                 return
2264
2265                         pagenum = pagenum + 1
2266
2267 class YoutubePlaylistIE(InfoExtractor):
2268         """Information Extractor for YouTube playlists."""
2269
2270         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist)\?.*?(p|a)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2271         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2272         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2273         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2274         _youtube_ie = None
2275
2276         def __init__(self, youtube_ie, downloader=None):
2277                 InfoExtractor.__init__(self, downloader)
2278                 self._youtube_ie = youtube_ie
2279
2280         @staticmethod
2281         def suitable(url):
2282                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2283
2284         def report_download_page(self, playlist_id, pagenum):
2285                 """Report attempt to download playlist page with given number."""
2286                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2287
2288         def _real_initialize(self):
2289                 self._youtube_ie.initialize()
2290
2291         def _real_extract(self, url):
2292                 # Extract playlist id
2293                 mobj = re.match(self._VALID_URL, url)
2294                 if mobj is None:
2295                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2296                         return
2297
2298                 # Single video case
2299                 if mobj.group(3) is not None:
2300                         self._youtube_ie.extract(mobj.group(3))
2301                         return
2302
2303                 # Download playlist pages
2304                 # prefix is 'p' as default for playlists but there are other types that need extra care
2305                 playlist_prefix = mobj.group(1)
2306                 if playlist_prefix == 'a':
2307                         playlist_access = 'artist'
2308                 else:
2309                         playlist_prefix = 'p'
2310                         playlist_access = 'view_play_list'
2311                 playlist_id = mobj.group(2)
2312                 video_ids = []
2313                 pagenum = 1
2314
2315                 while True:
2316                         self.report_download_page(playlist_id, pagenum)
2317                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2318                         try:
2319                                 page = urllib2.urlopen(request).read()
2320                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2321                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2322                                 return
2323
2324                         # Extract video identifiers
2325                         ids_in_page = []
2326                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2327                                 if mobj.group(1) not in ids_in_page:
2328                                         ids_in_page.append(mobj.group(1))
2329                         video_ids.extend(ids_in_page)
2330
2331                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2332                                 break
2333                         pagenum = pagenum + 1
2334
2335                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2336                 playlistend = self._downloader.params.get('playlistend', -1)
2337                 video_ids = video_ids[playliststart:playlistend]
2338
2339                 for id in video_ids:
2340                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2341                 return
2342
2343 class YoutubeUserIE(InfoExtractor):
2344         """Information Extractor for YouTube users."""
2345
2346         _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2347         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2348         _GDATA_PAGE_SIZE = 50
2349         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2350         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2351         _youtube_ie = None
2352
	def __init__(self, youtube_ie, downloader=None):
		"""Initialize base extractor state and store the YoutubeIE
		instance used to download each of the user's videos."""
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie
2356
2357         @staticmethod
2358         def suitable(url):
2359                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2360
2361         def report_download_page(self, username, start_index):
2362                 """Report attempt to download user page."""
2363                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2364                                            (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2365
	def _real_initialize(self):
		# Delegate initialization (e.g. YouTube login) to the wrapped YoutubeIE.
		self._youtube_ie.initialize()
2368
2369         def _real_extract(self, url):
2370                 # Extract username
2371                 mobj = re.match(self._VALID_URL, url)
2372                 if mobj is None:
2373                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2374                         return
2375
2376                 username = mobj.group(1)
2377
2378                 # Download video ids using YouTube Data API. Result size per
2379                 # query is limited (currently to 50 videos) so we need to query
2380                 # page by page until there are no video ids - it means we got
2381                 # all of them.
2382
2383                 video_ids = []
2384                 pagenum = 0
2385
2386                 while True:
2387                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2388                         self.report_download_page(username, start_index)
2389
2390                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2391
2392                         try:
2393                                 page = urllib2.urlopen(request).read()
2394                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2395                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2396                                 return
2397
2398                         # Extract video identifiers
2399                         ids_in_page = []
2400
2401                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2402                                 if mobj.group(1) not in ids_in_page:
2403                                         ids_in_page.append(mobj.group(1))
2404
2405                         video_ids.extend(ids_in_page)
2406
2407                         # A little optimization - if current page is not
2408                         # "full", ie. does not contain PAGE_SIZE video ids then
2409                         # we can assume that this page is the last one - there
2410                         # are no more ids on further pages - no need to query
2411                         # again.
2412
2413                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2414                                 break
2415
2416                         pagenum += 1
2417
2418                 all_ids_count = len(video_ids)
2419                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2420                 playlistend = self._downloader.params.get('playlistend', -1)
2421
2422                 if playlistend == -1:
2423                         video_ids = video_ids[playliststart:]
2424                 else:
2425                         video_ids = video_ids[playliststart:playlistend]
2426                         
2427                 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2428                                            (username, all_ids_count, len(video_ids)))
2429
2430                 for video_id in video_ids:
2431                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2432
2433
2434 class DepositFilesIE(InfoExtractor):
2435         """Information extractor for depositfiles.com"""
2436
2437         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2438
2439         def __init__(self, downloader=None):
2440                 InfoExtractor.__init__(self, downloader)
2441
2442         @staticmethod
2443         def suitable(url):
2444                 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2445
2446         def report_download_webpage(self, file_id):
2447                 """Report webpage download."""
2448                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2449
2450         def report_extraction(self, file_id):
2451                 """Report information extraction."""
2452                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2453
2454         def _real_initialize(self):
2455                 return
2456
2457         def _real_extract(self, url):
2458                 # At this point we have a new file
2459                 self._downloader.increment_downloads()
2460
2461                 file_id = url.split('/')[-1]
2462                 # Rebuild url in english locale
2463                 url = 'http://depositfiles.com/en/files/' + file_id
2464
2465                 # Retrieve file webpage with 'Free download' button pressed
2466                 free_download_indication = { 'gateway_result' : '1' }
2467                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2468                 try:
2469                         self.report_download_webpage(file_id)
2470                         webpage = urllib2.urlopen(request).read()
2471                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2472                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2473                         return
2474
2475                 # Search for the real file URL
2476                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2477                 if (mobj is None) or (mobj.group(1) is None):
2478                         # Try to figure out reason of the error.
2479                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2480                         if (mobj is not None) and (mobj.group(1) is not None):
2481                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2482                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2483                         else:
2484                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2485                         return
2486
2487                 file_url = mobj.group(1)
2488                 file_extension = os.path.splitext(file_url)[1][1:]
2489
2490                 # Search for file title
2491                 mobj = re.search(r'<b title="(.*?)">', webpage)
2492                 if mobj is None:
2493                         self._downloader.trouble(u'ERROR: unable to extract title')
2494                         return
2495                 file_title = mobj.group(1).decode('utf-8')
2496
2497                 try:
2498                         # Process file information
2499                         self._downloader.process_info({
2500                                 'id':           file_id.decode('utf-8'),
2501                                 'url':          file_url.decode('utf-8'),
2502                                 'uploader':     u'NA',
2503                                 'upload_date':  u'NA',
2504                                 'title':        file_title,
2505                                 'stitle':       file_title,
2506                                 'ext':          file_extension.decode('utf-8'),
2507                                 'format':       u'NA',
2508                                 'player_url':   None,
2509                         })
2510                 except UnavailableVideoError, err:
2511                         self._downloader.trouble(u'ERROR: unable to download file')
2512
2513 class FacebookIE(InfoExtractor):
2514         """Information Extractor for Facebook"""
2515
2516         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2517         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2518         _NETRC_MACHINE = 'facebook'
2519         _available_formats = ['highqual', 'lowqual']
2520         _video_extensions = {
2521                 'highqual': 'mp4',
2522                 'lowqual': 'mp4',
2523         }
2524
2525         def __init__(self, downloader=None):
2526                 InfoExtractor.__init__(self, downloader)
2527
2528         @staticmethod
2529         def suitable(url):
2530                 return (re.match(FacebookIE._VALID_URL, url) is not None)
2531
2532         def _reporter(self, message):
2533                 """Add header and report message."""
2534                 self._downloader.to_screen(u'[facebook] %s' % message)
2535
2536         def report_login(self):
2537                 """Report attempt to log in."""
2538                 self._reporter(u'Logging in')
2539
2540         def report_video_webpage_download(self, video_id):
2541                 """Report attempt to download video webpage."""
2542                 self._reporter(u'%s: Downloading video webpage' % video_id)
2543
2544         def report_information_extraction(self, video_id):
2545                 """Report attempt to extract video information."""
2546                 self._reporter(u'%s: Extracting video information' % video_id)
2547
2548         def _parse_page(self, video_webpage):
2549                 """Extract video information from page"""
2550                 # General data
2551                 data = {'title': r'class="video_title datawrap">(.*?)</',
2552                         'description': r'<div class="datawrap">(.*?)</div>',
2553                         'owner': r'\("video_owner_name", "(.*?)"\)',
2554                         'upload_date': r'data-date="(.*?)"',
2555                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2556                         }
2557                 video_info = {}
2558                 for piece in data.keys():
2559                         mobj = re.search(data[piece], video_webpage)
2560                         if mobj is not None:
2561                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2562
2563                 # Video urls
2564                 video_urls = {}
2565                 for fmt in self._available_formats:
2566                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2567                         if mobj is not None:
2568                                 # URL is in a Javascript segment inside an escaped Unicode format within
2569                                 # the generally utf-8 page
2570                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2571                 video_info['video_urls'] = video_urls
2572
2573                 return video_info
2574
2575         def _real_initialize(self):
2576                 if self._downloader is None:
2577                         return
2578
2579                 useremail = None
2580                 password = None
2581                 downloader_params = self._downloader.params
2582
2583                 # Attempt to use provided username and password or .netrc data
2584                 if downloader_params.get('username', None) is not None:
2585                         useremail = downloader_params['username']
2586                         password = downloader_params['password']
2587                 elif downloader_params.get('usenetrc', False):
2588                         try:
2589                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2590                                 if info is not None:
2591                                         useremail = info[0]
2592                                         password = info[2]
2593                                 else:
2594                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2595                         except (IOError, netrc.NetrcParseError), err:
2596                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2597                                 return
2598
2599                 if useremail is None:
2600                         return
2601
2602                 # Log in
2603                 login_form = {
2604                         'email': useremail,
2605                         'pass': password,
2606                         'login': 'Log+In'
2607                         }
2608                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2609                 try:
2610                         self.report_login()
2611                         login_results = urllib2.urlopen(request).read()
2612                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2613                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2614                                 return
2615                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2616                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2617                         return
2618
2619         def _real_extract(self, url):
2620                 mobj = re.match(self._VALID_URL, url)
2621                 if mobj is None:
2622                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2623                         return
2624                 video_id = mobj.group('ID')
2625
2626                 # Get video webpage
2627                 self.report_video_webpage_download(video_id)
2628                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2629                 try:
2630                         page = urllib2.urlopen(request)
2631                         video_webpage = page.read()
2632                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2633                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2634                         return
2635
2636                 # Start extracting information
2637                 self.report_information_extraction(video_id)
2638
2639                 # Extract information
2640                 video_info = self._parse_page(video_webpage)
2641
2642                 # uploader
2643                 if 'owner' not in video_info:
2644                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2645                         return
2646                 video_uploader = video_info['owner']
2647
2648                 # title
2649                 if 'title' not in video_info:
2650                         self._downloader.trouble(u'ERROR: unable to extract video title')
2651                         return
2652                 video_title = video_info['title']
2653                 video_title = video_title.decode('utf-8')
2654                 video_title = sanitize_title(video_title)
2655
2656                 # simplified title
2657                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2658                 simple_title = simple_title.strip(ur'_')
2659
2660                 # thumbnail image
2661                 if 'thumbnail' not in video_info:
2662                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2663                         video_thumbnail = ''
2664                 else:
2665                         video_thumbnail = video_info['thumbnail']
2666
2667                 # upload date
2668                 upload_date = u'NA'
2669                 if 'upload_date' in video_info:
2670                         upload_time = video_info['upload_date']
2671                         timetuple = email.utils.parsedate_tz(upload_time)
2672                         if timetuple is not None:
2673                                 try:
2674                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2675                                 except:
2676                                         pass
2677
2678                 # description
2679                 video_description = video_info.get('description', 'No description available.')
2680
2681                 url_map = video_info['video_urls']
2682                 if len(url_map.keys()) > 0:
2683                         # Decide which formats to download
2684                         req_format = self._downloader.params.get('format', None)
2685                         format_limit = self._downloader.params.get('format_limit', None)
2686
2687                         if format_limit is not None and format_limit in self._available_formats:
2688                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2689                         else:
2690                                 format_list = self._available_formats
2691                         existing_formats = [x for x in format_list if x in url_map]
2692                         if len(existing_formats) == 0:
2693                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2694                                 return
2695                         if req_format is None:
2696                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2697                         elif req_format == '-1':
2698                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2699                         else:
2700                                 # Specific format
2701                                 if req_format not in url_map:
2702                                         self._downloader.trouble(u'ERROR: requested format not available')
2703                                         return
2704                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2705
2706                 for format_param, video_real_url in video_url_list:
2707
2708                         # At this point we have a new video
2709                         self._downloader.increment_downloads()
2710
2711                         # Extension
2712                         video_extension = self._video_extensions.get(format_param, 'mp4')
2713
2714                         # Find the video URL in fmt_url_map or conn paramters
2715                         try:
2716                                 # Process video information
2717                                 self._downloader.process_info({
2718                                         'id':           video_id.decode('utf-8'),
2719                                         'url':          video_real_url.decode('utf-8'),
2720                                         'uploader':     video_uploader.decode('utf-8'),
2721                                         'upload_date':  upload_date,
2722                                         'title':        video_title,
2723                                         'stitle':       simple_title,
2724                                         'ext':          video_extension.decode('utf-8'),
2725                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2726                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2727                                         'description':  video_description.decode('utf-8'),
2728                                         'player_url':   None,
2729                                 })
2730                         except UnavailableVideoError, err:
2731                                 self._downloader.trouble(u'\nERROR: unable to download video')
2732
2733 class BlipTVIE(InfoExtractor):
2734         """Information extractor for blip.tv"""
2735
2736         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip.tv(/.+)$'
2737         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2738
2739         @staticmethod
2740         def suitable(url):
2741                 return (re.match(BlipTVIE._VALID_URL, url) is not None)
2742
2743         def report_extraction(self, file_id):
2744                 """Report information extraction."""
2745                 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2746
2747         def _simplify_title(self, title):
2748                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2749                 res = res.strip(ur'_')
2750                 return res
2751
2752         def _real_extract(self, url):
2753                 mobj = re.match(self._VALID_URL, url)
2754                 if mobj is None:
2755                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2756                         return
2757
2758                 json_url = url + ('&' if '?' in url else '?') + 'skin=json&version=2&no_wrap=1'
2759                 request = urllib2.Request(json_url)
2760                 self.report_extraction(mobj.group(1))
2761                 try:
2762                         json_code = urllib2.urlopen(request).read()
2763                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2764                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2765                         return
2766                 try:
2767                         json_data = json.loads(json_code)
2768                         data = json_data['Post'] if 'Post' in json_data else json_data
2769
2770                         upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2771                         video_url = data['media']['url']
2772                         umobj = re.match(self._URL_EXT, video_url)
2773                         if umobj is None:
2774                                 raise ValueError('Can not determine filename extension')
2775                         ext = umobj.group(1)
2776
2777                         self._downloader.increment_downloads()
2778
2779                         info = {
2780                                 'id': data['item_id'],
2781                                 'url': video_url,
2782                                 'uploader': data['display_name'],
2783                                 'upload_date': upload_date,
2784                                 'title': data['title'],
2785                                 'stitle': self._simplify_title(data['title']),
2786                                 'ext': ext,
2787                                 'format': data['media']['mimeType'],
2788                                 'thumbnail': data['thumbnailUrl'],
2789                                 'description': data['description'],
2790                                 'player_url': data['embedUrl']
2791                         }
2792                 except (ValueError,KeyError), err:
2793                         self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2794                         return
2795
2796                 try:
2797                         self._downloader.process_info(info)
2798                 except UnavailableVideoError, err:
2799                         self._downloader.trouble(u'\nERROR: unable to download video')
2800
2801
class PostProcessor(object):
	"""Base class for post-download processing steps.

	Instances are registered on a downloader via its
	add_post_processor() method. After each successful download the
	downloader calls run() on every registered processor in order,
	feeding each one the dictionary returned by its predecessor.

	A processor that returns None from run() stops the chain;
	otherwise processing continues until the chain is exhausted.

	Like InfoExtractor, this class follows a "mutual registration"
	scheme with the downloader.
	"""

	_downloader = None  # downloader this PP is attached to

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach this post processor to the given downloader."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file.

		The information argument is an InfoExtractor-style dictionary
		extended with a 'filepath' key naming the downloaded file.

		Return None to stop the postprocessing chain, or an
		information dictionary (possibly with some fields changed) to
		pass to the next processor. May raise PostProcessingError to
		signal a failure to the calling downloader.
		"""
		return information  # default: pass the information through untouched
2847
class FFmpegExtractAudioPP(PostProcessor):
	"""Post processor that extracts the audio track of a downloaded
	video into a standalone audio file using ffprobe/ffmpeg."""

	def __init__(self, downloader=None, preferredcodec=None):
		"""preferredcodec may be 'best' (default), 'mp3' or 'aac'."""
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec

	@staticmethod
	def get_audio_codec(path):
		"""Return the audio codec name of the file at path, or None if
		it cannot be determined (ffprobe missing or failing)."""
		try:
			cmd = ['ffprobe', '-show_streams', '--', path]
			# 'with' ensures the devnull handle is closed; the previous
			# file() call leaked it.
			with open(os.path.devnull, 'w') as devnull:
				handle = subprocess.Popen(cmd, stderr=devnull, stdout=subprocess.PIPE)
				output = handle.communicate()[0]
				if handle.wait() != 0:
					return None
		except (IOError, OSError):
			return None
		audio_codec = None
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				# codec_name precedes codec_type in ffprobe output, so the
				# remembered name belongs to this audio stream.
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Transcode path to out_path with ffmpeg; return True on success."""
		try:
			cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
			# Close the devnull handle deterministically (was leaked before).
			with open(os.path.devnull, 'w') as devnull:
				ret = subprocess.call(cmd, stdout=devnull, stderr=subprocess.STDOUT)
			return (ret == 0)
		except (IOError, OSError):
			return False

	def run(self, information):
		"""Convert the downloaded file to the preferred audio format,
		delete the original, and return the updated information dict
		(or None on failure, stopping the chain)."""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
			if filecodec == 'aac' or filecodec == 'mp3':
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					# Raw AAC needs an ADTS container to be playable standalone.
					more_opts = ['-f', 'adts']
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = ['-ab', '128k']
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = ['-ab', '128k']
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']

		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		status = self.run_ffmpeg(path, new_path, acodec, more_opts)

		if not status:
			self._downloader.to_stderr(u'WARNING: error running ffmpeg')
			return None

		try:
			os.remove(path)
		except (IOError, OSError):
			self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
			return None

		information['filepath'] = new_path
		return information
2929
2930 ### MAIN PROGRAM ###
2931 if __name__ == '__main__':
2932         try:
2933                 # Modules needed only when running the main program
2934                 import getpass
2935                 import optparse
2936
2937                 # Function to update the program file with the latest version from the repository.
2938                 def update_self(downloader, filename):
2939                         # Note: downloader only used for options
2940                         if not os.access(filename, os.W_OK):
2941                                 sys.exit('ERROR: no write permissions on %s' % filename)
2942
2943                         downloader.to_screen('Updating to latest stable version...')
2944                         try:
2945                                 latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2946                                 latest_version = urllib.urlopen(latest_url).read().strip()
2947                                 prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2948                                 newcontent = urllib.urlopen(prog_url).read()
2949                         except (IOError, OSError), err:
2950                                 sys.exit('ERROR: unable to download latest version')
2951                         try:
2952                                 stream = open(filename, 'w')
2953                                 stream.write(newcontent)
2954                                 stream.close()
2955                         except (IOError, OSError), err:
2956                                 sys.exit('ERROR: unable to overwrite current version')
2957                         downloader.to_screen('Updated to version %s' % latest_version)
2958
2959                 # Parse command line
2960                 parser = optparse.OptionParser(
2961                         usage='Usage: %prog [options] url...',
2962                         version='2011.07.09-phihag',
2963                         conflict_handler='resolve',
2964                 )
2965
2966                 parser.add_option('-h', '--help',
2967                                 action='help', help='print this help text and exit')
2968                 parser.add_option('-v', '--version',
2969                                 action='version', help='print program version and exit')
2970                 parser.add_option('-U', '--update',
2971                                 action='store_true', dest='update_self', help='update this program to latest stable version')
2972                 parser.add_option('-i', '--ignore-errors',
2973                                 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2974                 parser.add_option('-r', '--rate-limit',
2975                                 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2976                 parser.add_option('-R', '--retries',
2977                                 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2978                 parser.add_option('--playlist-start',
2979                                 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2980                 parser.add_option('--playlist-end',
2981                                 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
2982                 parser.add_option('--dump-user-agent',
2983                                 action='store_true', dest='dump_user_agent',
2984                                 help='display the current browser identification', default=False)
2985
2986                 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2987                 authentication.add_option('-u', '--username',
2988                                 dest='username', metavar='USERNAME', help='account username')
2989                 authentication.add_option('-p', '--password',
2990                                 dest='password', metavar='PASSWORD', help='account password')
2991                 authentication.add_option('-n', '--netrc',
2992                                 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2993                 parser.add_option_group(authentication)
2994
2995                 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2996                 video_format.add_option('-f', '--format',
2997                                 action='store', dest='format', metavar='FORMAT', help='video format code')
2998                 video_format.add_option('--all-formats',
2999                                 action='store_const', dest='format', help='download all available video formats', const='-1')
3000                 video_format.add_option('--max-quality',
3001                                 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
3002                 parser.add_option_group(video_format)
3003
3004                 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
3005                 verbosity.add_option('-q', '--quiet',
3006                                 action='store_true', dest='quiet', help='activates quiet mode', default=False)
3007                 verbosity.add_option('-s', '--simulate',
3008                                 action='store_true', dest='simulate', help='do not download video', default=False)
3009                 verbosity.add_option('-g', '--get-url',
3010                                 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
3011                 verbosity.add_option('-e', '--get-title',
3012                                 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
3013                 verbosity.add_option('--get-thumbnail',
3014                                 action='store_true', dest='getthumbnail',
3015                                 help='simulate, quiet but print thumbnail URL', default=False)
3016                 verbosity.add_option('--get-description',
3017                                 action='store_true', dest='getdescription',
3018                                 help='simulate, quiet but print video description', default=False)
3019                 verbosity.add_option('--get-filename',
3020                                 action='store_true', dest='getfilename',
3021                                 help='simulate, quiet but print output filename', default=False)
3022                 verbosity.add_option('--no-progress',
3023                                 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
3024                 verbosity.add_option('--console-title',
3025                                 action='store_true', dest='consoletitle',
3026                                 help='display progress in console titlebar', default=False)
3027                 parser.add_option_group(verbosity)
3028
3029                 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
3030                 filesystem.add_option('-t', '--title',
3031                                 action='store_true', dest='usetitle', help='use title in file name', default=False)
3032                 filesystem.add_option('-l', '--literal',
3033                                 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
3034                 filesystem.add_option('-A', '--auto-number',
3035                                 action='store_true', dest='autonumber',
3036                                 help='number downloaded files starting from 00000', default=False)
3037                 filesystem.add_option('-o', '--output',
3038                                 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
3039                 filesystem.add_option('-a', '--batch-file',
3040                                 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
3041                 filesystem.add_option('-w', '--no-overwrites',
3042                                 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
3043                 filesystem.add_option('-c', '--continue',
3044                                 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
3045                 filesystem.add_option('--cookies',
3046                                 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
3047                 filesystem.add_option('--no-part',
3048                                 action='store_true', dest='nopart', help='do not use .part files', default=False)
3049                 filesystem.add_option('--no-mtime',
3050                                 action='store_false', dest='updatetime',
3051                                 help='do not use the Last-modified header to set the file modification time', default=True)
3052                 filesystem.add_option('--write-description',
3053                                 action='store_true', dest='writedescription',
3054                                 help='write video description to a .description file', default=False)
3055                 filesystem.add_option('--write-info-json',
3056                                 action='store_true', dest='writeinfojson',
3057                                 help='write video metadata to a .info.json file', default=False)
3058                 parser.add_option_group(filesystem)
3059
3060                 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
3061                 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
3062                                 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
3063                 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
3064                                 help='"best", "aac" or "mp3"; best by default')
3065                 parser.add_option_group(postproc)
3066
3067                 (opts, args) = parser.parse_args()
3068
3069                 # Open appropriate CookieJar
3070                 if opts.cookiefile is None:
3071                         jar = cookielib.CookieJar()
3072                 else:
3073                         try:
3074                                 jar = cookielib.MozillaCookieJar(opts.cookiefile)
3075                                 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3076                                         jar.load()
3077                         except (IOError, OSError), err:
3078                                 sys.exit(u'ERROR: unable to open cookie file')
3079
3080                 # Dump user agent
3081                 if opts.dump_user_agent:
3082                         print std_headers['User-Agent']
3083                         sys.exit(0)
3084
3085                 # General configuration
3086                 cookie_processor = urllib2.HTTPCookieProcessor(jar)
3087                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
3088                 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3089
3090                 # Batch file verification
3091                 batchurls = []
3092                 if opts.batchfile is not None:
3093                         try:
3094                                 if opts.batchfile == '-':
3095                                         batchfd = sys.stdin
3096                                 else:
3097                                         batchfd = open(opts.batchfile, 'r')
3098                                 batchurls = batchfd.readlines()
3099                                 batchurls = [x.strip() for x in batchurls]
3100                                 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3101                         except IOError:
3102                                 sys.exit(u'ERROR: batch file could not be read')
3103                 all_urls = batchurls + args
3104
3105                 # Conflicting, missing and erroneous options
3106                 if opts.usenetrc and (opts.username is not None or opts.password is not None):
3107                         parser.error(u'using .netrc conflicts with giving username/password')
3108                 if opts.password is not None and opts.username is None:
3109                         parser.error(u'account username missing')
3110                 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3111                         parser.error(u'using output template conflicts with using title, literal title or auto number')
3112                 if opts.usetitle and opts.useliteral:
3113                         parser.error(u'using title conflicts with using literal title')
3114                 if opts.username is not None and opts.password is None:
3115                         opts.password = getpass.getpass(u'Type account password and press return:')
3116                 if opts.ratelimit is not None:
3117                         numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3118                         if numeric_limit is None:
3119                                 parser.error(u'invalid rate limit specified')
3120                         opts.ratelimit = numeric_limit
3121                 if opts.retries is not None:
3122                         try:
3123                                 opts.retries = long(opts.retries)
3124                         except (TypeError, ValueError), err:
3125                                 parser.error(u'invalid retry count specified')
3126                 try:
3127                         opts.playliststart = long(opts.playliststart)
3128                         if opts.playliststart <= 0:
3129                                 raise ValueError
3130                 except (TypeError, ValueError), err:
3131                         parser.error(u'invalid playlist start number specified')
3132                 try:
3133                         opts.playlistend = long(opts.playlistend)
3134                         if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3135                                 raise ValueError
3136                 except (TypeError, ValueError), err:
3137                         parser.error(u'invalid playlist end number specified')
3138                 if opts.extractaudio:
3139                         if opts.audioformat not in ['best', 'aac', 'mp3']:
3140                                 parser.error(u'invalid audio format specified')
3141
		# Information extractors
		# One instance of each supported extractor.  Some extractors take
		# another extractor instance as a constructor argument (e.g. the
		# playlist/search/user extractors receive youtube_ie) -- presumably
		# to hand off individual videos to it; confirm in the IE classes.
		youtube_ie = YoutubeIE()
		metacafe_ie = MetacafeIE(youtube_ie)
		dailymotion_ie = DailymotionIE()
		youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
		youtube_user_ie = YoutubeUserIE(youtube_ie)
		youtube_search_ie = YoutubeSearchIE(youtube_ie)
		google_ie = GoogleIE()
		google_search_ie = GoogleSearchIE(google_ie)
		photobucket_ie = PhotobucketIE()
		yahoo_ie = YahooIE()
		yahoo_search_ie = YahooSearchIE(yahoo_ie)
		deposit_files_ie = DepositFilesIE()
		facebook_ie = FacebookIE()
		bliptv_ie = BlipTVIE()
		generic_ie = GenericIE()
3158
		# File downloader
		# Translate the parsed options into the FileDownloader configuration
		# dict.  Any of the "get X" modes implies both quiet and simulate.
		fd = FileDownloader({
			'usenetrc': opts.usenetrc,
			'username': opts.username,
			'password': opts.password,
			'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
			'forceurl': opts.geturl,
			'forcetitle': opts.gettitle,
			'forcethumbnail': opts.getthumbnail,
			'forcedescription': opts.getdescription,
			'forcefilename': opts.getfilename,
			'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
			'format': opts.format,
			'format_limit': opts.format_limit,
			# Output template: an explicit -o template wins; otherwise a
			# default is chosen from --all-formats (format == '-1'),
			# -t/--title, -l/--literal and -A/--auto-number.  Note this is a
			# truthiness chain: an explicitly supplied *empty* template
			# decodes to u'' (falsy) and falls through to the defaults.
			'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
				or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
				or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
				or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
				or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
				or u'%(id)s.%(ext)s'),
			'ignoreerrors': opts.ignoreerrors,
			'ratelimit': opts.ratelimit,
			'nooverwrites': opts.nooverwrites,
			'retries': opts.retries,
			'continuedl': opts.continue_dl,
			'noprogress': opts.noprogress,
			'playliststart': opts.playliststart,
			'playlistend': opts.playlistend,
			# '-o -' writes the video to stdout, so screen logging must go
			# to stderr instead.
			'logtostderr': opts.outtmpl == '-',
			'consoletitle': opts.consoletitle,
			'nopart': opts.nopart,
			'updatetime': opts.updatetime,
			'writedescription': opts.writedescription,
			'writeinfojson': opts.writeinfojson,
			})
3198                 fd.add_info_extractor(youtube_search_ie)
3199                 fd.add_info_extractor(youtube_pl_ie)
3200                 fd.add_info_extractor(youtube_user_ie)
3201                 fd.add_info_extractor(metacafe_ie)
3202                 fd.add_info_extractor(dailymotion_ie)
3203                 fd.add_info_extractor(youtube_ie)
3204                 fd.add_info_extractor(google_ie)
3205                 fd.add_info_extractor(google_search_ie)
3206                 fd.add_info_extractor(photobucket_ie)
3207                 fd.add_info_extractor(yahoo_ie)
3208                 fd.add_info_extractor(yahoo_search_ie)
3209                 fd.add_info_extractor(deposit_files_ie)
3210                 fd.add_info_extractor(facebook_ie)
3211                 fd.add_info_extractor(bliptv_ie)
3212
3213                 # This must come last since it's the
3214                 # fallback if none of the others work
3215                 fd.add_info_extractor(generic_ie)
3216
3217                 # PostProcessors
3218                 if opts.extractaudio:
3219                         fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
3220
3221                 # Update version
3222                 if opts.update_self:
3223                         update_self(fd, sys.argv[0])
3224
3225                 # Maybe do nothing
3226                 if len(all_urls) < 1:
3227                         if not opts.update_self:
3228                                 parser.error(u'you must provide at least one URL')
3229                         else:
3230                                 sys.exit()
3231                 retcode = fd.download(all_urls)
3232
3233                 # Dump cookie jar if requested
3234                 if opts.cookiefile is not None:
3235                         try:
3236                                 jar.save()
3237                         except (IOError, OSError), err:
3238                                 sys.exit(u'ERROR: unable to save cookie jar')
3239
3240                 sys.exit(retcode)
3241
3242         except DownloadError:
3243                 sys.exit(1)
3244         except SameFileError:
3245                 sys.exit(u'ERROR: fixed output name but more than one file to download')
3246         except KeyboardInterrupt:
3247                 sys.exit(u'\nERROR: Interrupted by user')