youtube-dl

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 # Author: Ricardo Garcia Gonzalez
   4 # Author: Danny Colligan
   5 # Author: Benjamin Johnson
   6 # Author: Vasyl' Vavrychuk
   7 # Author: Witold Baryluk
   8 # Author: Paweł Paprota
   9 # Author: Gergely Imreh
  10 # Author: Philipp Hagemeister <phihag@phihag.de>
  11 # License: Public domain code
  12 from __future__ import with_statement
  13 import contextlib
  14 import cookielib
  15 import datetime
  16 import gzip
  17 import htmlentitydefs
  18 import httplib
  19 import locale
  20 import math
  21 import netrc
  22 import os
  23 import os.path
  24 import re
  25 import socket
  26 import string
  27 import subprocess
  28 import sys
  29 import time
  30 import urllib
  31 import urllib2
  32 import warnings
  33 import zlib
  34
  35 if os.name == 'nt':
  36         import ctypes
  37
  38 try:
  39         import email.utils
  40 except ImportError: # Python 2.4
  41         import email.Utils
  42 try:
  43         import cStringIO as StringIO
  44 except ImportError:
  45         import StringIO
  46
  47 # parse_qs was moved from the cgi module to the urlparse module recently.
  48 try:
  49         from urlparse import parse_qs
  50 except ImportError:
  51         from cgi import parse_qs
  52
  53 try:
  54         import lxml.etree
  55 except ImportError: # Python < 2.6
  56         pass # Handled below
  57
  58 std_headers = {
  59         'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:2.0b11) Gecko/20100101 Firefox/4.0b11',
  60         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  61         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  62         'Accept-Encoding': 'gzip, deflate',
  63         'Accept-Language': 'en-us,en;q=0.5',
  64 }
  65
  66 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
  67
  68 try:
  69         import json
  70 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
  71         import re
  72         class json(object):
  73                 @staticmethod
  74                 def loads(s):
  75                         s = s.decode('UTF-8')
  76                         def raiseError(msg, i):
  77                                 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
  78                         def skipSpace(i, expectMore=True):
  79                                 while i < len(s) and s[i] in ' \t\r\n':
  80                                         i += 1
  81                                 if expectMore:
  82                                         if i >= len(s):
  83                                                 raiseError('Premature end', i)
  84                                 return i
  85                         def decodeEscape(match):
  86                                 esc = match.group(1)
  87                                 _STATIC = {
  88                                         '"': '"',
  89                                         '\\': '\\',
  90                                         '/': '/',
  91                                         'b': unichr(0x8),
  92                                         'f': unichr(0xc),
  93                                         'n': '\n',
  94                                         'r': '\r',
  95                                         't': '\t',
  96                                 }
  97                                 if esc in _STATIC:
  98                                         return _STATIC[esc]
  99                                 if esc[0] == 'u':
 100                                         if len(esc) == 1+4:
 101                                                 return unichr(int(esc[1:5], 16))
 102                                         if len(esc) == 5+6 and esc[5:7] == '\\u':
 103                                                 hi = int(esc[1:5], 16)
 104                                                 low = int(esc[7:11], 16)
 105                                                 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
 106                                 raise ValueError('Unknown escape ' + str(esc))
 107                         def parseString(i):
 108                                 i += 1
 109                                 e = i
 110                                 while True:
 111                                         e = s.index('"', e)
 112                                         bslashes = 0
 113                                         while s[e-bslashes-1] == '\\':
 114                                                 bslashes += 1
 115                                         if bslashes % 2 == 1:
 116                                                 e += 1
 117                                                 continue
 118                                         break
 119                                 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
 120                                 stri = rexp.sub(decodeEscape, s[i:e])
 121                                 return (e+1,stri)
 122                         def parseObj(i):
 123                                 i += 1
 124                                 res = {}
 125                                 i = skipSpace(i)
 126                                 if s[i] == '}': # Empty dictionary
 127                                         return (i+1,res)
 128                                 while True:
 129                                         if s[i] != '"':
 130                                                 raiseError('Expected a string object key', i)
 131                                         i,key = parseString(i)
 132                                         i = skipSpace(i)
 133                                         if i >= len(s) or s[i] != ':':
 134                                                 raiseError('Expected a colon', i)
 135                                         i,val = parse(i+1)
 136                                         res[key] = val
 137                                         i = skipSpace(i)
 138                                         if s[i] == '}':
 139                                                 return (i+1, res)
 140                                         if s[i] != ',':
 141                                                 raiseError('Expected comma or closing curly brace', i)
 142                                         i = skipSpace(i+1)
 143                         def parseArray(i):
 144                                 res = []
 145                                 i = skipSpace(i+1)
 146                                 if s[i] == ']': # Empty array
 147                                         return (i+1,res)
 148                                 while True:
 149                                         i,val = parse(i)
 150                                         res.append(val)
 151                                         i = skipSpace(i) # Raise exception if premature end
 152                                         if s[i] == ']':
 153                                                 return (i+1, res)
 154                                         if s[i] != ',':
 155                                                 raiseError('Expected a comma or closing bracket', i)
 156                                         i = skipSpace(i+1)
 157                         def parseDiscrete(i):
 158                                 for k,v in {'true': True, 'false': False, 'null': None}.items():
 159                                         if s.startswith(k, i):
 160                                                 return (i+len(k), v)
 161                                 raiseError('Not a boolean (or null)', i)
 162                         def parseNumber(i):
 163                                 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
 164                                 if mobj is None:
 165                                         raiseError('Not a number', i)
 166                                 nums = mobj.group(1)
 167                                 if '.' in nums or 'e' in nums or 'E' in nums:
 168                                         return (i+len(nums), float(nums))
 169                                 return (i+len(nums), int(nums))
 170                         CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
 171                         def parse(i):
 172                                 i = skipSpace(i)
 173                                 i,res = CHARMAP.get(s[i], parseNumber)(i)
 174                                 i = skipSpace(i, False)
 175                                 return (i,res)
 176                         i,res = parse(0)
 177                         if i < len(s):
 178                                 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
 179                         return res
 180
 181 def preferredencoding():
 182         """Get preferred encoding.
 183
 184         Returns the best encoding scheme for the system, based on
 185         locale.getpreferredencoding() and some further tweaks.
 186         """
 187         def yield_preferredencoding():
 188                 try:
 189                         pref = locale.getpreferredencoding()
 190                         u'TEST'.encode(pref)
 191                 except:
 192                         pref = 'UTF-8'
 193                 while True:
 194                         yield pref
 195         return yield_preferredencoding().next()
 196
 197 def htmlentity_transform(matchobj):
 198         """Transforms an HTML entity to a Unicode character.
 199
 200         This function receives a match object and is intended to be used with
 201         the re.sub() function.
 202         """
 203         entity = matchobj.group(1)
 204
 205         # Known non-numeric HTML entity
 206         if entity in htmlentitydefs.name2codepoint:
 207                 return unichr(htmlentitydefs.name2codepoint[entity])
 208
 209         # Unicode character
 210         mobj = re.match(ur'(?u)#(x?\d+)', entity)
 211         if mobj is not None:
 212                 numstr = mobj.group(1)
 213                 if numstr.startswith(u'x'):
 214                         base = 16
 215                         numstr = u'0%s' % numstr
 216                 else:
 217                         base = 10
 218                 return unichr(long(numstr, base))
 219
 220         # Unknown entity in name, return its literal representation
 221         return (u'&%s;' % entity)
 222
 223 def sanitize_title(utitle):
 224         """Sanitizes a video title so it could be used as part of a filename."""
 225         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
 226         return utitle.replace(unicode(os.sep), u'%')
 227
 228 def sanitize_open(filename, open_mode):
 229         """Try to open the given filename, and slightly tweak it if this fails.
 230
 231         Attempts to open the given filename. If this fails, it tries to change
 232         the filename slightly, step by step, until it's either able to open it
 233         or it fails and raises a final exception, like the standard open()
 234         function.
 235
 236         It returns the tuple (stream, definitive_file_name).
 237         """
 238         try:
 239                 if filename == u'-':
 240                         if sys.platform == 'win32':
 241                                 import msvcrt
 242                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 243                         return (sys.stdout, filename)
 244                 stream = open(filename, open_mode)
 245                 return (stream, filename)
 246         except (IOError, OSError), err:
 247                 # In case of error, try to remove win32 forbidden chars
 248                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
 249
 250                 # An exception here should be caught in the caller
 251                 stream = open(filename, open_mode)
 252                 return (stream, filename)
 253
 254 def timeconvert(timestr):
 255     """Convert RFC 2822 defined time string into system timestamp"""
 256     timestamp = None
 257     timetuple = email.utils.parsedate_tz(timestr)
 258     if timetuple is not None:
 259         timestamp = email.utils.mktime_tz(timetuple)
 260     return timestamp
 261
 262 class DownloadError(Exception):
 263         """Download Error exception.
 264
 265         This exception may be thrown by FileDownloader objects if they are not
 266         configured to continue on errors. They will contain the appropriate
 267         error message.
 268         """
 269         pass
 270
 271 class SameFileError(Exception):
 272         """Same File exception.
 273
 274         This exception will be thrown by FileDownloader objects if they detect
 275         multiple files would have to be downloaded to the same file on disk.
 276         """
 277         pass
 278
 279 class PostProcessingError(Exception):
 280         """Post Processing exception.
 281
 282         This exception may be raised by PostProcessor's .run() method to
 283         indicate an error in the postprocessing task.
 284         """
 285         pass
 286
 287 class UnavailableVideoError(Exception):
 288         """Unavailable Format exception.
 289
 290         This exception will be thrown when a video is requested
 291         in a format that is not available for that video.
 292         """
 293         pass
 294
 295 class ContentTooShortError(Exception):
 296         """Content Too Short exception.
 297
 298         This exception may be raised by FileDownloader objects when a file they
 299         download is too small for what the server announced first, indicating
 300         the connection was probably interrupted.
 301         """
 302         # Both in bytes
 303         downloaded = None
 304         expected = None
 305
 306         def __init__(self, downloaded, expected):
 307                 self.downloaded = downloaded
 308                 self.expected = expected
 309
 310 class YoutubeDLHandler(urllib2.HTTPHandler):
 311         """Handler for HTTP requests and responses.
 312
 313         This class, when installed with an OpenerDirector, automatically adds
 314         the standard headers to every HTTP request and handles gzipped and
 315         deflated responses from web servers. If compression is to be avoided in
 316         a particular request, the original request in the program code only has
 317         to include the HTTP header "Youtubedl-No-Compression", which will be
 318         removed before making the real request.
 319
 320         Part of this code was copied from:
 321
 322           http://techknack.net/python-urllib2-handlers/
 323
 324         Andrew Rowls, the author of that code, agreed to release it to the
 325         public domain.
 326         """
 327
 328         @staticmethod
 329         def deflate(data):
 330                 try:
 331                         return zlib.decompress(data, -zlib.MAX_WBITS)
 332                 except zlib.error:
 333                         return zlib.decompress(data)
 334
 335         @staticmethod
 336         def addinfourl_wrapper(stream, headers, url, code):
 337                 if hasattr(urllib2.addinfourl, 'getcode'):
 338                         return urllib2.addinfourl(stream, headers, url, code)
 339                 ret = urllib2.addinfourl(stream, headers, url)
 340                 ret.code = code
 341                 return ret
 342
 343         def http_request(self, req):
 344                 for h in std_headers:
 345                         if h in req.headers:
 346                                 del req.headers[h]
 347                         req.add_header(h, std_headers[h])
 348                 if 'Youtubedl-no-compression' in req.headers:
 349                         if 'Accept-encoding' in req.headers:
 350                                 del req.headers['Accept-encoding']
 351                         del req.headers['Youtubedl-no-compression']
 352                 return req
 353
 354         def http_response(self, req, resp):
 355                 old_resp = resp
 356                 # gzip
 357                 if resp.headers.get('Content-encoding', '') == 'gzip':
 358                         gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
 359                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 360                         resp.msg = old_resp.msg
 361                 # deflate
 362                 if resp.headers.get('Content-encoding', '') == 'deflate':
 363                         gz = StringIO.StringIO(self.deflate(resp.read()))
 364                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 365                         resp.msg = old_resp.msg
 366                 return resp
 367
 368 class FileDownloader(object):
 369         """File Downloader class.
 370
 371         File downloader objects are the ones responsible of downloading the
 372         actual video file and writing it to disk if the user has requested
 373         it, among some other tasks. In most cases there should be one per
 374         program. As, given a video URL, the downloader doesn't know how to
 375         extract all the needed information, task that InfoExtractors do, it
 376         has to pass the URL to one of them.
 377
 378         For this, file downloader objects have a method that allows
 379         InfoExtractors to be registered in a given order. When it is passed
 380         a URL, the file downloader handles it to the first InfoExtractor it
 381         finds that reports being able to handle it. The InfoExtractor extracts
 382         all the information about the video or videos the URL refers to, and
 383         asks the FileDownloader to process the video information, possibly
 384         downloading the video.
 385
 386         File downloaders accept a lot of parameters. In order not to saturate
 387         the object constructor with arguments, it receives a dictionary of
 388         options instead. These options are available through the params
 389         attribute for the InfoExtractors to use. The FileDownloader also
 390         registers itself as the downloader in charge for the InfoExtractors
 391         that are added to it, so this is a "mutual registration".
 392
 393         Available options:
 394
 395         username:         Username for authentication purposes.
 396         password:         Password for authentication purposes.
 397         usenetrc:         Use netrc for authentication instead.
 398         quiet:            Do not print messages to stdout.
 399         forceurl:         Force printing final URL.
 400         forcetitle:       Force printing title.
 401         forcethumbnail:   Force printing thumbnail URL.
 402         forcedescription: Force printing description.
 403         forcefilename:    Force printing final filename.
 404         simulate:         Do not download the video files.
 405         format:           Video format code.
 406         format_limit:     Highest quality format to try.
 407         outtmpl:          Template for output names.
 408         ignoreerrors:     Do not stop on download errors.
 409         ratelimit:        Download speed limit, in bytes/sec.
 410         nooverwrites:     Prevent overwriting files.
 411         retries:          Number of times to retry for HTTP error 5xx
 412         continuedl:       Try to continue downloads if possible.
 413         noprogress:       Do not print the progress bar.
 414         playliststart:    Playlist item to start at.
 415         playlistend:      Playlist item to end at.
 416         logtostderr:      Log messages to stderr instead of stdout.
 417         consoletitle:     Display progress in console window's titlebar.
 418         nopart:           Do not use temporary .part files.
 419         updatetime:       Use the Last-modified header to set output file timestamps.
 420         writedescription: Write the video description to a .description file
 421         writeinfojson:    Write the video description to a .info.json file
 422         """
 423
 424         params = None
 425         _ies = []
 426         _pps = []
 427         _download_retcode = None
 428         _num_downloads = None
 429         _screen_file = None
 430
 431         def __init__(self, params):
 432                 """Create a FileDownloader object with the given options."""
 433                 self._ies = []
 434                 self._pps = []
 435                 self._download_retcode = 0
 436                 self._num_downloads = 0
 437                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
 438                 self.params = params
 439
 440         @staticmethod
 441         def pmkdir(filename):
 442                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
 443                 components = filename.split(os.sep)
 444                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
 445                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
 446                 for dir in aggregate:
 447                         if not os.path.exists(dir):
 448                                 os.mkdir(dir)
 449
 450         @staticmethod
 451         def format_bytes(bytes):
 452                 if bytes is None:
 453                         return 'N/A'
 454                 if type(bytes) is str:
 455                         bytes = float(bytes)
 456                 if bytes == 0.0:
 457                         exponent = 0
 458                 else:
 459                         exponent = long(math.log(bytes, 1024.0))
 460                 suffix = 'bkMGTPEZY'[exponent]
 461                 converted = float(bytes) / float(1024**exponent)
 462                 return '%.2f%s' % (converted, suffix)
 463
 464         @staticmethod
 465         def calc_percent(byte_counter, data_len):
 466                 if data_len is None:
 467                         return '---.-%'
 468                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
 469
 470         @staticmethod
 471         def calc_eta(start, now, total, current):
 472                 if total is None:
 473                         return '--:--'
 474                 dif = now - start
 475                 if current == 0 or dif < 0.001: # One millisecond
 476                         return '--:--'
 477                 rate = float(current) / dif
 478                 eta = long((float(total) - float(current)) / rate)
 479                 (eta_mins, eta_secs) = divmod(eta, 60)
 480                 if eta_mins > 99:
 481                         return '--:--'
 482                 return '%02d:%02d' % (eta_mins, eta_secs)
 483
 484         @staticmethod
 485         def calc_speed(start, now, bytes):
 486                 dif = now - start
 487                 if bytes == 0 or dif < 0.001: # One millisecond
 488                         return '%10s' % '---b/s'
 489                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
 490
 491         @staticmethod
 492         def best_block_size(elapsed_time, bytes):
 493                 new_min = max(bytes / 2.0, 1.0)
 494                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
 495                 if elapsed_time < 0.001:
 496                         return long(new_max)
 497                 rate = bytes / elapsed_time
 498                 if rate > new_max:
 499                         return long(new_max)
 500                 if rate < new_min:
 501                         return long(new_min)
 502                 return long(rate)
 503
 504         @staticmethod
 505         def parse_bytes(bytestr):
 506                 """Parse a string indicating a byte quantity into a long integer."""
 507                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
 508                 if matchobj is None:
 509                         return None
 510                 number = float(matchobj.group(1))
 511                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
 512                 return long(round(number * multiplier))
 513
 514         def add_info_extractor(self, ie):
 515                 """Add an InfoExtractor object to the end of the list."""
 516                 self._ies.append(ie)
 517                 ie.set_downloader(self)
 518
 519         def add_post_processor(self, pp):
 520                 """Add a PostProcessor object to the end of the chain."""
 521                 self._pps.append(pp)
 522                 pp.set_downloader(self)
 523
 524         def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
 525                 """Print message to stdout if not in quiet mode."""
 526                 try:
 527                         if not self.params.get('quiet', False):
 528                                 terminator = [u'\n', u''][skip_eol]
 529                                 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
 530                         self._screen_file.flush()
 531                 except (UnicodeEncodeError), err:
 532                         if not ignore_encoding_errors:
 533                                 raise
 534
 535         def to_stderr(self, message):
 536                 """Print message to stderr."""
 537                 print >>sys.stderr, message.encode(preferredencoding())
 538
 539         def to_cons_title(self, message):
 540                 """Set console/terminal window title to message."""
 541                 if not self.params.get('consoletitle', False):
 542                         return
 543                 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
 544                         # c_wchar_p() might not be necessary if `message` is
 545                         # already of type unicode()
 546                         ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
 547                 elif 'TERM' in os.environ:
 548                         sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
 549
 550         def fixed_template(self):
 551                 """Checks if the output template is fixed."""
 552                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
 553
 554         def trouble(self, message=None):
 555                 """Determine action to take when a download problem appears.
 556
 557                 Depending on if the downloader has been configured to ignore
 558                 download errors or not, this method may throw an exception or
 559                 not when errors are found, after printing the message.
 560                 """
 561                 if message is not None:
 562                         self.to_stderr(message)
 563                 if not self.params.get('ignoreerrors', False):
 564                         raise DownloadError(message)
 565                 self._download_retcode = 1
 566
 567         def slow_down(self, start_time, byte_counter):
 568                 """Sleep if the download speed is over the rate limit."""
 569                 rate_limit = self.params.get('ratelimit', None)
 570                 if rate_limit is None or byte_counter == 0:
 571                         return
 572                 now = time.time()
 573                 elapsed = now - start_time
 574                 if elapsed <= 0.0:
 575                         return
 576                 speed = float(byte_counter) / elapsed
 577                 if speed > rate_limit:
 578                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
 579
 580         def temp_name(self, filename):
 581                 """Returns a temporary filename for the given filename."""
 582                 if self.params.get('nopart', False) or filename == u'-' or \
 583                                 (os.path.exists(filename) and not os.path.isfile(filename)):
 584                         return filename
 585                 return filename + u'.part'
 586
 587         def undo_temp_name(self, filename):
 588                 if filename.endswith(u'.part'):
 589                         return filename[:-len(u'.part')]
 590                 return filename
 591
 592         def try_rename(self, old_filename, new_filename):
 593                 try:
 594                         if old_filename == new_filename:
 595                                 return
 596                         os.rename(old_filename, new_filename)
 597                 except (IOError, OSError), err:
 598                         self.trouble(u'ERROR: unable to rename file')
 599
 600         def try_utime(self, filename, last_modified_hdr):
 601                 """Try to set the last-modified time of the given file."""
 602                 if last_modified_hdr is None:
 603                         return
 604                 if not os.path.isfile(filename):
 605                         return
 606                 timestr = last_modified_hdr
 607                 if timestr is None:
 608                         return
 609                 filetime = timeconvert(timestr)
 610                 if filetime is None:
 611                         return
 612                 try:
 613                         os.utime(filename,(time.time(), filetime))
 614                 except:
 615                         pass
 616
 617         def report_writedescription(self, descfn):
 618                 """ Report that the description file is being written """
 619                 self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
 620
 621         def report_writeinfojson(self, infofn):
 622                 """ Report that the metadata file has been written """
 623                 self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
 624
 625         def report_destination(self, filename):
 626                 """Report destination filename."""
 627                 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
 628
 629         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
 630                 """Report download progress."""
 631                 if self.params.get('noprogress', False):
 632                         return
 633                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
 634                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
 635                 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
 636                                 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
 637
 638         def report_resuming_byte(self, resume_len):
 639                 """Report attempt to resume at given byte."""
 640                 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
 641
 642         def report_retry(self, count, retries):
 643                 """Report retry in case of HTTP error 5xx"""
 644                 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
 645
 646         def report_file_already_downloaded(self, file_name):
 647                 """Report file has already been fully downloaded."""
 648                 try:
 649                         self.to_screen(u'[download] %s has already been downloaded' % file_name)
 650                 except (UnicodeEncodeError), err:
 651                         self.to_screen(u'[download] The file has already been downloaded')
 652
 653         def report_unable_to_resume(self):
 654                 """Report it was impossible to resume download."""
 655                 self.to_screen(u'[download] Unable to resume')
 656
 657         def report_finish(self):
 658                 """Report download finished."""
 659                 if self.params.get('noprogress', False):
 660                         self.to_screen(u'[download] Download completed')
 661                 else:
 662                         self.to_screen(u'')
 663
 664         def increment_downloads(self):
 665                 """Increment the ordinal that assigns a number to each file."""
 666                 self._num_downloads += 1
 667
 668         def prepare_filename(self, info_dict):
 669                 """Generate the output filename."""
 670                 try:
 671                         template_dict = dict(info_dict)
 672                         template_dict['epoch'] = unicode(long(time.time()))
 673                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
 674                         filename = self.params['outtmpl'] % template_dict
 675                         return filename
 676                 except (ValueError, KeyError), err:
 677                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
 678                         return None
 679
 680         def process_info(self, info_dict):
 681                 """Process a single dictionary returned by an InfoExtractor."""
 682                 filename = self.prepare_filename(info_dict)
 683                 # Do nothing else if in simulate mode
 684                 if self.params.get('simulate', False):
 685                         # Forced printings
 686                         if self.params.get('forcetitle', False):
 687                                 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
 688                         if self.params.get('forceurl', False):
 689                                 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
 690                         if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
 691                                 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
 692                         if self.params.get('forcedescription', False) and 'description' in info_dict:
 693                                 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
 694                         if self.params.get('forcefilename', False) and filename is not None:
 695                                 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
 696
 697                         return
 698
 699                 if filename is None:
 700                         return
 701                 if self.params.get('nooverwrites', False) and os.path.exists(filename):
 702                         self.to_stderr(u'WARNING: file exists and will be skipped')
 703                         return
 704
 705                 try:
 706                         self.pmkdir(filename)
 707                 except (OSError, IOError), err:
 708                         self.trouble(u'ERROR: unable to create directories: %s' % str(err))
 709                         return
 710
 711                 if self.params.get('writedescription', False):
 712                         try:
 713                                 descfn = filename + '.description'
 714                                 self.report_writedescription(descfn)
 715                                 with contextlib.closing(open(descfn, 'wb')) as descfile:
 716                                         descfile.write(info_dict['description'].encode('utf-8'))
 717                         except (OSError, IOError):
 718                                 self.trouble(u'ERROR: Cannot write description file: %s' % str(descfn))
 719                                 return
 720
 721                 if self.params.get('writeinfojson', False):
 722                         infofn = filename + '.info.json'
 723                         self.report_writeinfojson(infofn)
 724                         try:
 725                                 json.dump
 726                         except (NameError,AttributeError):
 727                                 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
 728                                 return
 729                         try:
 730                                 with contextlib.closing(open(infofn, 'wb')) as infof:
 731                                         json.dump(info_dict, infof)
 732                         except (OSError, IOError):
 733                                 self.trouble(u'ERROR: Cannot write metadata to JSON file: %s' % str(infofn))
 734                                 return
 735
 736                 try:
 737                         success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
 738                 except (OSError, IOError), err:
 739                         raise UnavailableVideoError
 740                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 741                         self.trouble(u'ERROR: unable to download video data: %s' % str(err))
 742                         return
 743                 except (ContentTooShortError, ), err:
 744                         self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
 745                         return
 746
 747                 if success:
 748                         try:
 749                                 self.post_process(filename, info_dict)
 750                         except (PostProcessingError), err:
 751                                 self.trouble(u'ERROR: postprocessing: %s' % str(err))
 752                                 return
 753
 754         def download(self, url_list):
 755                 """Download a given list of URLs."""
 756                 if len(url_list) > 1 and self.fixed_template():
 757                         raise SameFileError(self.params['outtmpl'])
 758
 759                 for url in url_list:
 760                         suitable_found = False
 761                         for ie in self._ies:
 762                                 # Go to next InfoExtractor if not suitable
 763                                 if not ie.suitable(url):
 764                                         continue
 765
 766                                 # Suitable InfoExtractor found
 767                                 suitable_found = True
 768
 769                                 # Extract information from URL and process it
 770                                 ie.extract(url)
 771
 772                                 # Suitable InfoExtractor had been found; go to next URL
 773                                 break
 774
 775                         if not suitable_found:
 776                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
 777
 778                 return self._download_retcode
 779
 780         def post_process(self, filename, ie_info):
 781                 """Run the postprocessing chain on the given file."""
 782                 info = dict(ie_info)
 783                 info['filepath'] = filename
 784                 for pp in self._pps:
 785                         info = pp.run(info)
 786                         if info is None:
 787                                 break
 788
 789         def _download_with_rtmpdump(self, filename, url, player_url):
 790                 self.report_destination(filename)
 791                 tmpfilename = self.temp_name(filename)
 792
 793                 # Check for rtmpdump first
 794                 try:
 795                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
 796                 except (OSError, IOError):
 797                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
 798                         return False
 799
 800                 # Download using rtmpdump. rtmpdump returns exit code 2 when
 801                 # the connection was interrumpted and resuming appears to be
 802                 # possible. This is part of rtmpdump's normal usage, AFAIK.
 803                 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
 804                 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
 805                 while retval == 2 or retval == 1:
 806                         prevsize = os.path.getsize(tmpfilename)
 807                         self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
 808                         time.sleep(5.0) # This seems to be needed
 809                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
 810                         cursize = os.path.getsize(tmpfilename)
 811                         if prevsize == cursize and retval == 1:
 812                                 break
 813                 if retval == 0:
 814                         self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
 815                         self.try_rename(tmpfilename, filename)
 816                         return True
 817                 else:
 818                         self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
 819                         return False
 820
 821         def _do_download(self, filename, url, player_url):
 822                 # Check file already present
 823                 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
 824                         self.report_file_already_downloaded(filename)
 825                         return True
 826
 827                 # Attempt to download using rtmpdump
 828                 if url.startswith('rtmp'):
 829                         return self._download_with_rtmpdump(filename, url, player_url)
 830
 831                 tmpfilename = self.temp_name(filename)
 832                 stream = None
 833                 open_mode = 'wb'
 834
 835                 # Do not include the Accept-Encoding header
 836                 headers = {'Youtubedl-no-compression': 'True'}
 837                 basic_request = urllib2.Request(url, None, headers)
 838                 request = urllib2.Request(url, None, headers)
 839
 840                 # Establish possible resume length
 841                 if os.path.isfile(tmpfilename):
 842                         resume_len = os.path.getsize(tmpfilename)
 843                 else:
 844                         resume_len = 0
 845
 846                 # Request parameters in case of being able to resume
 847                 if self.params.get('continuedl', False) and resume_len != 0:
 848                         self.report_resuming_byte(resume_len)
 849                         request.add_header('Range','bytes=%d-' % resume_len)
 850                         open_mode = 'ab'
 851
 852                 count = 0
 853                 retries = self.params.get('retries', 0)
 854                 while count <= retries:
 855                         # Establish connection
 856                         try:
 857                                 data = urllib2.urlopen(request)
 858                                 break
 859                         except (urllib2.HTTPError, ), err:
 860                                 if (err.code < 500 or err.code >= 600) and err.code != 416:
 861                                         # Unexpected HTTP error
 862                                         raise
 863                                 elif err.code == 416:
 864                                         # Unable to resume (requested range not satisfiable)
 865                                         try:
 866                                                 # Open the connection again without the range header
 867                                                 data = urllib2.urlopen(basic_request)
 868                                                 content_length = data.info()['Content-Length']
 869                                         except (urllib2.HTTPError, ), err:
 870                                                 if err.code < 500 or err.code >= 600:
 871                                                         raise
 872                                         else:
 873                                                 # Examine the reported length
 874                                                 if (content_length is not None and
 875                                                     (resume_len - 100 < long(content_length) < resume_len + 100)):
 876                                                         # The file had already been fully downloaded.
 877                                                         # Explanation to the above condition: in issue #175 it was revealed that
 878                                                         # YouTube sometimes adds or removes a few bytes from the end of the file,
 879                                                         # changing the file size slightly and causing problems for some users. So
 880                                                         # I decided to implement a suggested change and consider the file
 881                                                         # completely downloaded if the file size differs less than 100 bytes from
 882                                                         # the one in the hard drive.
 883                                                         self.report_file_already_downloaded(filename)
 884                                                         self.try_rename(tmpfilename, filename)
 885                                                         return True
 886                                                 else:
 887                                                         # The length does not match, we start the download over
 888                                                         self.report_unable_to_resume()
 889                                                         open_mode = 'wb'
 890                                                         break
 891                         # Retry
 892                         count += 1
 893                         if count <= retries:
 894                                 self.report_retry(count, retries)
 895
 896                 if count > retries:
 897                         self.trouble(u'ERROR: giving up after %s retries' % retries)
 898                         return False
 899
 900                 data_len = data.info().get('Content-length', None)
 901                 if data_len is not None:
 902                         data_len = long(data_len) + resume_len
 903                 data_len_str = self.format_bytes(data_len)
 904                 byte_counter = 0 + resume_len
 905                 block_size = 1024
 906                 start = time.time()
 907                 while True:
 908                         # Download and write
 909                         before = time.time()
 910                         data_block = data.read(block_size)
 911                         after = time.time()
 912                         if len(data_block) == 0:
 913                                 break
 914                         byte_counter += len(data_block)
 915
 916                         # Open file just in time
 917                         if stream is None:
 918                                 try:
 919                                         (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
 920                                         filename = self.undo_temp_name(tmpfilename)
 921                                         self.report_destination(filename)
 922                                 except (OSError, IOError), err:
 923                                         self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
 924                                         return False
 925                         try:
 926                                 stream.write(data_block)
 927                         except (IOError, OSError), err:
 928                                 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
 929                                 return False
 930                         block_size = self.best_block_size(after - before, len(data_block))
 931
 932                         # Progress message
 933                         percent_str = self.calc_percent(byte_counter, data_len)
 934                         eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
 935                         speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
 936                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
 937
 938                         # Apply rate limit
 939                         self.slow_down(start, byte_counter - resume_len)
 940
 941                 stream.close()
 942                 self.report_finish()
 943                 if data_len is not None and byte_counter != data_len:
 944                         raise ContentTooShortError(byte_counter, long(data_len))
 945                 self.try_rename(tmpfilename, filename)
 946
 947                 # Update file modification time
 948                 if self.params.get('updatetime', True):
 949                         self.try_utime(filename, data.info().get('last-modified', None))
 950
 951                 return True
 952
 953 class InfoExtractor(object):
 954         """Information Extractor class.
 955
 956         Information extractors are the classes that, given a URL, extract
 957         information from the video (or videos) the URL refers to. This
 958         information includes the real video URL, the video title and simplified
 959         title, author and others. The information is stored in a dictionary
 960         which is then passed to the FileDownloader. The FileDownloader
 961         processes this information possibly downloading the video to the file
 962         system, among other possible outcomes. The dictionaries must include
 963         the following fields:
 964
 965         id:             Video identifier.
 966         url:            Final video URL.
 967         uploader:       Nickname of the video uploader.
 968         title:          Literal title.
 969         stitle:         Simplified title.
 970         ext:            Video filename extension.
 971         format:         Video format.
 972         player_url:     SWF Player URL (may be None).
 973
 974         The following fields are optional. Their primary purpose is to allow
 975         youtube-dl to serve as the backend for a video search function, such
 976         as the one in youtube2mp3.  They are only used when their respective
 977         forced printing functions are called:
 978
 979         thumbnail:      Full URL to a video thumbnail image.
 980         description:    One-line video description.
 981
 982         Subclasses of this one should re-define the _real_initialize() and
 983         _real_extract() methods, as well as the suitable() static method.
 984         Probably, they should also be instantiated and added to the main
 985         downloader.
 986         """
 987
 988         _ready = False
 989         _downloader = None
 990
 991         def __init__(self, downloader=None):
 992                 """Constructor. Receives an optional downloader."""
 993                 self._ready = False
 994                 self.set_downloader(downloader)
 995
 996         @staticmethod
 997         def suitable(url):
 998                 """Receives a URL and returns True if suitable for this IE."""
 999                 return False
1000
1001         def initialize(self):
1002                 """Initializes an instance (authentication, etc)."""
1003                 if not self._ready:
1004                         self._real_initialize()
1005                         self._ready = True
1006
1007         def extract(self, url):
1008                 """Extracts URL information and returns it in list of dicts."""
1009                 self.initialize()
1010                 return self._real_extract(url)
1011
1012         def set_downloader(self, downloader):
1013                 """Sets the downloader for this IE."""
1014                 self._downloader = downloader
1015
1016         def _real_initialize(self):
1017                 """Real initialization process. Redefine in subclasses."""
1018                 pass
1019
1020         def _real_extract(self, url):
1021                 """Real extraction process. Redefine in subclasses."""
1022                 pass
1023
1024 class YoutubeIE(InfoExtractor):
1025         """Information extractor for youtube.com."""
1026
1027         _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
1028         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1029         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1030         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1031         _NETRC_MACHINE = 'youtube'
1032         # Listed in order of quality
1033         _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
1034         _video_extensions = {
1035                 '13': '3gp',
1036                 '17': 'mp4',
1037                 '18': 'mp4',
1038                 '22': 'mp4',
1039                 '37': 'mp4',
1040                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1041                 '43': 'webm',
1042                 '45': 'webm',
1043         }
1044
1045         @staticmethod
1046         def suitable(url):
1047                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
1048
1049         def report_lang(self):
1050                 """Report attempt to set language."""
1051                 self._downloader.to_screen(u'[youtube] Setting language')
1052
1053         def report_login(self):
1054                 """Report attempt to log in."""
1055                 self._downloader.to_screen(u'[youtube] Logging in')
1056
1057         def report_age_confirmation(self):
1058                 """Report attempt to confirm age."""
1059                 self._downloader.to_screen(u'[youtube] Confirming age')
1060
1061         def report_video_webpage_download(self, video_id):
1062                 """Report attempt to download video webpage."""
1063                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1064
1065         def report_video_info_webpage_download(self, video_id):
1066                 """Report attempt to download video info webpage."""
1067                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1068
1069         def report_information_extraction(self, video_id):
1070                 """Report attempt to extract video information."""
1071                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1072
1073         def report_unavailable_format(self, video_id, format):
1074                 """Report extracted video URL."""
1075                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1076
1077         def report_rtmp_download(self):
1078                 """Indicate the download will use the RTMP protocol."""
1079                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1080
1081         def _real_initialize(self):
1082                 if self._downloader is None:
1083                         return
1084
1085                 username = None
1086                 password = None
1087                 downloader_params = self._downloader.params
1088
1089                 # Attempt to use provided username and password or .netrc data
1090                 if downloader_params.get('username', None) is not None:
1091                         username = downloader_params['username']
1092                         password = downloader_params['password']
1093                 elif downloader_params.get('usenetrc', False):
1094                         try:
1095                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1096                                 if info is not None:
1097                                         username = info[0]
1098                                         password = info[2]
1099                                 else:
1100                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1101                         except (IOError, netrc.NetrcParseError), err:
1102                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1103                                 return
1104
1105                 # Set language
1106                 request = urllib2.Request(self._LANG_URL)
1107                 try:
1108                         self.report_lang()
1109                         urllib2.urlopen(request).read()
1110                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1111                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1112                         return
1113
1114                 # No authentication to be performed
1115                 if username is None:
1116                         return
1117
1118                 # Log in
1119                 login_form = {
1120                                 'current_form': 'loginForm',
1121                                 'next':         '/',
1122                                 'action_login': 'Log In',
1123                                 'username':     username,
1124                                 'password':     password,
1125                                 }
1126                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1127                 try:
1128                         self.report_login()
1129                         login_results = urllib2.urlopen(request).read()
1130                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1131                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1132                                 return
1133                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1134                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1135                         return
1136
1137                 # Confirm age
1138                 age_form = {
1139                                 'next_url':             '/',
1140                                 'action_confirm':       'Confirm',
1141                                 }
1142                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1143                 try:
1144                         self.report_age_confirmation()
1145                         age_results = urllib2.urlopen(request).read()
1146                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1147                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1148                         return
1149
1150         def _real_extract(self, url):
1151                 # Extract video id from URL
1152                 mobj = re.match(self._VALID_URL, url)
1153                 if mobj is None:
1154                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1155                         return
1156                 video_id = mobj.group(2)
1157
1158                 # Get video webpage
1159                 self.report_video_webpage_download(video_id)
1160                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id)
1161                 try:
1162                         video_webpage = urllib2.urlopen(request).read()
1163                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1164                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1165                         return
1166
1167                 # Attempt to extract SWF player URL
1168                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1169                 if mobj is not None:
1170                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1171                 else:
1172                         player_url = None
1173
1174                 # Get video info
1175                 self.report_video_info_webpage_download(video_id)
1176                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1177                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1178                                            % (video_id, el_type))
1179                         request = urllib2.Request(video_info_url)
1180                         try:
1181                                 video_info_webpage = urllib2.urlopen(request).read()
1182                                 video_info = parse_qs(video_info_webpage)
1183                                 if 'token' in video_info:
1184                                         break
1185                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1186                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1187                                 return
1188                 if 'token' not in video_info:
1189                         if 'reason' in video_info:
1190                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1191                         else:
1192                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1193                         return
1194
1195                 # Start extracting information
1196                 self.report_information_extraction(video_id)
1197
1198                 # uploader
1199                 if 'author' not in video_info:
1200                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1201                         return
1202                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1203
1204                 # title
1205                 if 'title' not in video_info:
1206                         self._downloader.trouble(u'ERROR: unable to extract video title')
1207                         return
1208                 video_title = urllib.unquote_plus(video_info['title'][0])
1209                 video_title = video_title.decode('utf-8')
1210                 video_title = sanitize_title(video_title)
1211
1212                 # simplified title
1213                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1214                 simple_title = simple_title.strip(ur'_')
1215
1216                 # thumbnail image
1217                 if 'thumbnail_url' not in video_info:
1218                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1219                         video_thumbnail = ''
1220                 else:   # don't panic if we can't find it
1221                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1222
1223                 # upload date
1224                 upload_date = u'NA'
1225                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1226                 if mobj is not None:
1227                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1228                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1229                         for expression in format_expressions:
1230                                 try:
1231                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1232                                 except:
1233                                         pass
1234
1235                 # description
1236                 try:
1237                         lxml.etree
1238                 except NameError:
1239                         video_description = u'No description available.'
1240                         if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1241                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1242                                 if mobj is not None:
1243                                         video_description = mobj.group(1).decode('utf-8')
1244                 else:
1245                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1246                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1247                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1248                         # TODO use another parser
1249
1250                 # token
1251                 video_token = urllib.unquote_plus(video_info['token'][0])
1252
1253                 # Decide which formats to download
1254                 req_format = self._downloader.params.get('format', None)
1255
1256                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1257                         self.report_rtmp_download()
1258                         video_url_list = [(None, video_info['conn'][0])]
1259                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1260                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1261                         url_data = [dict(pairStr.split('=') for pairStr in uds.split('&')) for uds in url_data_strs]
1262                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1263                         url_map = dict((ud['itag'], urllib.unquote(ud['url'])) for ud in url_data)
1264
1265                         format_limit = self._downloader.params.get('format_limit', None)
1266                         if format_limit is not None and format_limit in self._available_formats:
1267                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1268                         else:
1269                                 format_list = self._available_formats
1270                         existing_formats = [x for x in format_list if x in url_map]
1271                         if len(existing_formats) == 0:
1272                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1273                                 return
1274                         if req_format is None:
1275                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1276                         elif req_format == '-1':
1277                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1278                         else:
1279                                 # Specific format
1280                                 if req_format not in url_map:
1281                                         self._downloader.trouble(u'ERROR: requested format not available')
1282                                         return
1283                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
1284                 else:
1285                         self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
1286                         return
1287
1288                 for format_param, video_real_url in video_url_list:
1289                         # At this point we have a new video
1290                         self._downloader.increment_downloads()
1291
1292                         # Extension
1293                         video_extension = self._video_extensions.get(format_param, 'flv')
1294
1295                         # Find the video URL in fmt_url_map or conn paramters
1296                         try:
1297                                 # Process video information
1298                                 self._downloader.process_info({
1299                                         'id':           video_id.decode('utf-8'),
1300                                         'url':          video_real_url.decode('utf-8'),
1301                                         'uploader':     video_uploader.decode('utf-8'),
1302                                         'upload_date':  upload_date,
1303                                         'title':        video_title,
1304                                         'stitle':       simple_title,
1305                                         'ext':          video_extension.decode('utf-8'),
1306                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1307                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1308                                         'description':  video_description,
1309                                         'player_url':   player_url,
1310                                 })
1311                         except UnavailableVideoError, err:
1312                                 self._downloader.trouble(u'\nERROR: unable to download video')
1313
1314
1315 class MetacafeIE(InfoExtractor):
1316         """Information Extractor for metacafe.com."""
1317
1318         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1319         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1320         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1321         _youtube_ie = None
1322
1323         def __init__(self, youtube_ie, downloader=None):
1324                 InfoExtractor.__init__(self, downloader)
1325                 self._youtube_ie = youtube_ie
1326
1327         @staticmethod
1328         def suitable(url):
1329                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1330
1331         def report_disclaimer(self):
1332                 """Report disclaimer retrieval."""
1333                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1334
1335         def report_age_confirmation(self):
1336                 """Report attempt to confirm age."""
1337                 self._downloader.to_screen(u'[metacafe] Confirming age')
1338
1339         def report_download_webpage(self, video_id):
1340                 """Report webpage download."""
1341                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1342
1343         def report_extraction(self, video_id):
1344                 """Report information extraction."""
1345                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1346
1347         def _real_initialize(self):
1348                 # Retrieve disclaimer
1349                 request = urllib2.Request(self._DISCLAIMER)
1350                 try:
1351                         self.report_disclaimer()
1352                         disclaimer = urllib2.urlopen(request).read()
1353                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1354                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1355                         return
1356
1357                 # Confirm age
1358                 disclaimer_form = {
1359                         'filters': '0',
1360                         'submit': "Continue - I'm over 18",
1361                         }
1362                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1363                 try:
1364                         self.report_age_confirmation()
1365                         disclaimer = urllib2.urlopen(request).read()
1366                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1367                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1368                         return
1369
1370         def _real_extract(self, url):
1371                 # Extract id and simplified title from URL
1372                 mobj = re.match(self._VALID_URL, url)
1373                 if mobj is None:
1374                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1375                         return
1376
1377                 video_id = mobj.group(1)
1378
1379                 # Check if video comes from YouTube
1380                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1381                 if mobj2 is not None:
1382                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1383                         return
1384
1385                 # At this point we have a new video
1386                 self._downloader.increment_downloads()
1387
1388                 simple_title = mobj.group(2).decode('utf-8')
1389
1390                 # Retrieve video webpage to extract further information
1391                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1392                 try:
1393                         self.report_download_webpage(video_id)
1394                         webpage = urllib2.urlopen(request).read()
1395                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1396                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1397                         return
1398
1399                 # Extract URL, uploader and title from webpage
1400                 self.report_extraction(video_id)
1401                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1402                 if mobj is not None:
1403                         mediaURL = urllib.unquote(mobj.group(1))
1404                         video_extension = mediaURL[-3:]
1405
1406                         # Extract gdaKey if available
1407                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1408                         if mobj is None:
1409                                 video_url = mediaURL
1410                         else:
1411                                 gdaKey = mobj.group(1)
1412                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1413                 else:
1414                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1415                         if mobj is None:
1416                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1417                                 return
1418                         vardict = parse_qs(mobj.group(1))
1419                         if 'mediaData' not in vardict:
1420                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1421                                 return
1422                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1423                         if mobj is None:
1424                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1425                                 return
1426                         mediaURL = mobj.group(1).replace('\\/', '/')
1427                         video_extension = mediaURL[-3:]
1428                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1429
1430                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1431                 if mobj is None:
1432                         self._downloader.trouble(u'ERROR: unable to extract title')
1433                         return
1434                 video_title = mobj.group(1).decode('utf-8')
1435                 video_title = sanitize_title(video_title)
1436
1437                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1438                 if mobj is None:
1439                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1440                         return
1441                 video_uploader = mobj.group(1)
1442
1443                 try:
1444                         # Process video information
1445                         self._downloader.process_info({
1446                                 'id':           video_id.decode('utf-8'),
1447                                 'url':          video_url.decode('utf-8'),
1448                                 'uploader':     video_uploader.decode('utf-8'),
1449                                 'upload_date':  u'NA',
1450                                 'title':        video_title,
1451                                 'stitle':       simple_title,
1452                                 'ext':          video_extension.decode('utf-8'),
1453                                 'format':       u'NA',
1454                                 'player_url':   None,
1455                         })
1456                 except UnavailableVideoError:
1457                         self._downloader.trouble(u'\nERROR: unable to download video')
1458
1459
1460 class DailymotionIE(InfoExtractor):
1461         """Information Extractor for Dailymotion"""
1462
1463         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1464
1465         def __init__(self, downloader=None):
1466                 InfoExtractor.__init__(self, downloader)
1467
1468         @staticmethod
1469         def suitable(url):
1470                 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1471
1472         def report_download_webpage(self, video_id):
1473                 """Report webpage download."""
1474                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1475
1476         def report_extraction(self, video_id):
1477                 """Report information extraction."""
1478                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1479
1480         def _real_initialize(self):
1481                 return
1482
1483         def _real_extract(self, url):
1484                 # Extract id and simplified title from URL
1485                 mobj = re.match(self._VALID_URL, url)
1486                 if mobj is None:
1487                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1488                         return
1489
1490                 # At this point we have a new video
1491                 self._downloader.increment_downloads()
1492                 video_id = mobj.group(1)
1493
1494                 simple_title = mobj.group(2).decode('utf-8')
1495                 video_extension = 'flv'
1496
1497                 # Retrieve video webpage to extract further information
1498                 request = urllib2.Request(url)
1499                 try:
1500                         self.report_download_webpage(video_id)
1501                         webpage = urllib2.urlopen(request).read()
1502                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1503                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1504                         return
1505
1506                 # Extract URL, uploader and title from webpage
1507                 self.report_extraction(video_id)
1508                 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1509                 if mobj is None:
1510                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1511                         return
1512                 mediaURL = urllib.unquote(mobj.group(1))
1513
1514                 # if needed add http://www.dailymotion.com/ if relative URL
1515
1516                 video_url = mediaURL
1517
1518                 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1519                 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1520                 if mobj is None:
1521                         self._downloader.trouble(u'ERROR: unable to extract title')
1522                         return
1523                 video_title = mobj.group(1).decode('utf-8')
1524                 video_title = sanitize_title(video_title)
1525
1526                 mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
1527                 if mobj is None:
1528                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1529                         return
1530                 video_uploader = mobj.group(1)
1531
1532                 try:
1533                         # Process video information
1534                         self._downloader.process_info({
1535                                 'id':           video_id.decode('utf-8'),
1536                                 'url':          video_url.decode('utf-8'),
1537                                 'uploader':     video_uploader.decode('utf-8'),
1538                                 'upload_date':  u'NA',
1539                                 'title':        video_title,
1540                                 'stitle':       simple_title,
1541                                 'ext':          video_extension.decode('utf-8'),
1542                                 'format':       u'NA',
1543                                 'player_url':   None,
1544                         })
1545                 except UnavailableVideoError:
1546                         self._downloader.trouble(u'\nERROR: unable to download video')
1547
1548 class GoogleIE(InfoExtractor):
1549         """Information extractor for video.google.com."""
1550
1551         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1552
1553         def __init__(self, downloader=None):
1554                 InfoExtractor.__init__(self, downloader)
1555
1556         @staticmethod
1557         def suitable(url):
1558                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1559
1560         def report_download_webpage(self, video_id):
1561                 """Report webpage download."""
1562                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1563
1564         def report_extraction(self, video_id):
1565                 """Report information extraction."""
1566                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1567
1568         def _real_initialize(self):
1569                 return
1570
1571         def _real_extract(self, url):
1572                 # Extract id from URL
1573                 mobj = re.match(self._VALID_URL, url)
1574                 if mobj is None:
1575                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1576                         return
1577
1578                 # At this point we have a new video
1579                 self._downloader.increment_downloads()
1580                 video_id = mobj.group(1)
1581
1582                 video_extension = 'mp4'
1583
1584                 # Retrieve video webpage to extract further information
1585                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1586                 try:
1587                         self.report_download_webpage(video_id)
1588                         webpage = urllib2.urlopen(request).read()
1589                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1590                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1591                         return
1592
1593                 # Extract URL, uploader, and title from webpage
1594                 self.report_extraction(video_id)
1595                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1596                 if mobj is None:
1597                         video_extension = 'flv'
1598                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1599                 if mobj is None:
1600                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1601                         return
1602                 mediaURL = urllib.unquote(mobj.group(1))
1603                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1604                 mediaURL = mediaURL.replace('\\x26', '\x26')
1605
1606                 video_url = mediaURL
1607
1608                 mobj = re.search(r'<title>(.*)</title>', webpage)
1609                 if mobj is None:
1610                         self._downloader.trouble(u'ERROR: unable to extract title')
1611                         return
1612                 video_title = mobj.group(1).decode('utf-8')
1613                 video_title = sanitize_title(video_title)
1614                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1615
1616                 # Extract video description
1617                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1618                 if mobj is None:
1619                         self._downloader.trouble(u'ERROR: unable to extract video description')
1620                         return
1621                 video_description = mobj.group(1).decode('utf-8')
1622                 if not video_description:
1623                         video_description = 'No description available.'
1624
1625                 # Extract video thumbnail
1626                 if self._downloader.params.get('forcethumbnail', False):
1627                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1628                         try:
1629                                 webpage = urllib2.urlopen(request).read()
1630                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1631                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1632                                 return
1633                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1634                         if mobj is None:
1635                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1636                                 return
1637                         video_thumbnail = mobj.group(1)
1638                 else:   # we need something to pass to process_info
1639                         video_thumbnail = ''
1640
1641
1642                 try:
1643                         # Process video information
1644                         self._downloader.process_info({
1645                                 'id':           video_id.decode('utf-8'),
1646                                 'url':          video_url.decode('utf-8'),
1647                                 'uploader':     u'NA',
1648                                 'upload_date':  u'NA',
1649                                 'title':        video_title,
1650                                 'stitle':       simple_title,
1651                                 'ext':          video_extension.decode('utf-8'),
1652                                 'format':       u'NA',
1653                                 'player_url':   None,
1654                         })
1655                 except UnavailableVideoError:
1656                         self._downloader.trouble(u'\nERROR: unable to download video')
1657
1658
1659 class PhotobucketIE(InfoExtractor):
1660         """Information extractor for photobucket.com."""
1661
1662         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1663
1664         def __init__(self, downloader=None):
1665                 InfoExtractor.__init__(self, downloader)
1666
1667         @staticmethod
1668         def suitable(url):
1669                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1670
1671         def report_download_webpage(self, video_id):
1672                 """Report webpage download."""
1673                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1674
1675         def report_extraction(self, video_id):
1676                 """Report information extraction."""
1677                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1678
1679         def _real_initialize(self):
1680                 return
1681
1682         def _real_extract(self, url):
1683                 # Extract id from URL
1684                 mobj = re.match(self._VALID_URL, url)
1685                 if mobj is None:
1686                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1687                         return
1688
1689                 # At this point we have a new video
1690                 self._downloader.increment_downloads()
1691                 video_id = mobj.group(1)
1692
1693                 video_extension = 'flv'
1694
1695                 # Retrieve video webpage to extract further information
1696                 request = urllib2.Request(url)
1697                 try:
1698                         self.report_download_webpage(video_id)
1699                         webpage = urllib2.urlopen(request).read()
1700                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1701                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1702                         return
1703
1704                 # Extract URL, uploader, and title from webpage
1705                 self.report_extraction(video_id)
1706                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1707                 if mobj is None:
1708                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1709                         return
1710                 mediaURL = urllib.unquote(mobj.group(1))
1711
1712                 video_url = mediaURL
1713
1714                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1715                 if mobj is None:
1716                         self._downloader.trouble(u'ERROR: unable to extract title')
1717                         return
1718                 video_title = mobj.group(1).decode('utf-8')
1719                 video_title = sanitize_title(video_title)
1720                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1721
1722                 video_uploader = mobj.group(2).decode('utf-8')
1723
1724                 try:
1725                         # Process video information
1726                         self._downloader.process_info({
1727                                 'id':           video_id.decode('utf-8'),
1728                                 'url':          video_url.decode('utf-8'),
1729                                 'uploader':     video_uploader,
1730                                 'upload_date':  u'NA',
1731                                 'title':        video_title,
1732                                 'stitle':       simple_title,
1733                                 'ext':          video_extension.decode('utf-8'),
1734                                 'format':       u'NA',
1735                                 'player_url':   None,
1736                         })
1737                 except UnavailableVideoError:
1738                         self._downloader.trouble(u'\nERROR: unable to download video')
1739
1740
1741 class YahooIE(InfoExtractor):
1742         """Information extractor for video.yahoo.com."""
1743
1744         # _VALID_URL matches all Yahoo! Video URLs
1745         # _VPAGE_URL matches only the extractable '/watch/' URLs
1746         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1747         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1748
1749         def __init__(self, downloader=None):
1750                 InfoExtractor.__init__(self, downloader)
1751
1752         @staticmethod
1753         def suitable(url):
1754                 return (re.match(YahooIE._VALID_URL, url) is not None)
1755
1756         def report_download_webpage(self, video_id):
1757                 """Report webpage download."""
1758                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1759
1760         def report_extraction(self, video_id):
1761                 """Report information extraction."""
1762                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1763
1764         def _real_initialize(self):
1765                 return
1766
1767         def _real_extract(self, url, new_video=True):
1768                 # Extract ID from URL
1769                 mobj = re.match(self._VALID_URL, url)
1770                 if mobj is None:
1771                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1772                         return
1773
1774                 # At this point we have a new video
1775                 self._downloader.increment_downloads()
1776                 video_id = mobj.group(2)
1777                 video_extension = 'flv'
1778
1779                 # Rewrite valid but non-extractable URLs as
1780                 # extractable English language /watch/ URLs
1781                 if re.match(self._VPAGE_URL, url) is None:
1782                         request = urllib2.Request(url)
1783                         try:
1784                                 webpage = urllib2.urlopen(request).read()
1785                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1786                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1787                                 return
1788
1789                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1790                         if mobj is None:
1791                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1792                                 return
1793                         yahoo_id = mobj.group(1)
1794
1795                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1796                         if mobj is None:
1797                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1798                                 return
1799                         yahoo_vid = mobj.group(1)
1800
1801                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1802                         return self._real_extract(url, new_video=False)
1803
1804                 # Retrieve video webpage to extract further information
1805                 request = urllib2.Request(url)
1806                 try:
1807                         self.report_download_webpage(video_id)
1808                         webpage = urllib2.urlopen(request).read()
1809                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1810                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1811                         return
1812
1813                 # Extract uploader and title from webpage
1814                 self.report_extraction(video_id)
1815                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1816                 if mobj is None:
1817                         self._downloader.trouble(u'ERROR: unable to extract video title')
1818                         return
1819                 video_title = mobj.group(1).decode('utf-8')
1820                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1821
1822                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1823                 if mobj is None:
1824                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1825                         return
1826                 video_uploader = mobj.group(1).decode('utf-8')
1827
1828                 # Extract video thumbnail
1829                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1830                 if mobj is None:
1831                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1832                         return
1833                 video_thumbnail = mobj.group(1).decode('utf-8')
1834
1835                 # Extract video description
1836                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1837                 if mobj is None:
1838                         self._downloader.trouble(u'ERROR: unable to extract video description')
1839                         return
1840                 video_description = mobj.group(1).decode('utf-8')
1841                 if not video_description: video_description = 'No description available.'
1842
1843                 # Extract video height and width
1844                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1845                 if mobj is None:
1846                         self._downloader.trouble(u'ERROR: unable to extract video height')
1847                         return
1848                 yv_video_height = mobj.group(1)
1849
1850                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1851                 if mobj is None:
1852                         self._downloader.trouble(u'ERROR: unable to extract video width')
1853                         return
1854                 yv_video_width = mobj.group(1)
1855
1856                 # Retrieve video playlist to extract media URL
1857                 # I'm not completely sure what all these options are, but we
1858                 # seem to need most of them, otherwise the server sends a 401.
1859                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1860                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1861                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1862                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1863                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1864                 try:
1865                         self.report_download_webpage(video_id)
1866                         webpage = urllib2.urlopen(request).read()
1867                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1868                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1869                         return
1870
1871                 # Extract media URL from playlist XML
1872                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1873                 if mobj is None:
1874                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1875                         return
1876                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1877                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1878
1879                 try:
1880                         # Process video information
1881                         self._downloader.process_info({
1882                                 'id':           video_id.decode('utf-8'),
1883                                 'url':          video_url,
1884                                 'uploader':     video_uploader,
1885                                 'upload_date':  u'NA',
1886                                 'title':        video_title,
1887                                 'stitle':       simple_title,
1888                                 'ext':          video_extension.decode('utf-8'),
1889                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1890                                 'description':  video_description,
1891                                 'thumbnail':    video_thumbnail,
1892                                 'description':  video_description,
1893                                 'player_url':   None,
1894                         })
1895                 except UnavailableVideoError:
1896                         self._downloader.trouble(u'\nERROR: unable to download video')
1897
1898
1899 class GenericIE(InfoExtractor):
1900         """Generic last-resort information extractor."""
1901
1902         def __init__(self, downloader=None):
1903                 InfoExtractor.__init__(self, downloader)
1904
1905         @staticmethod
1906         def suitable(url):
1907                 return True
1908
1909         def report_download_webpage(self, video_id):
1910                 """Report webpage download."""
1911                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1912                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1913
1914         def report_extraction(self, video_id):
1915                 """Report information extraction."""
1916                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1917
1918         def _real_initialize(self):
1919                 return
1920
1921         def _real_extract(self, url):
1922                 # At this point we have a new video
1923                 self._downloader.increment_downloads()
1924
1925                 video_id = url.split('/')[-1]
1926                 request = urllib2.Request(url)
1927                 try:
1928                         self.report_download_webpage(video_id)
1929                         webpage = urllib2.urlopen(request).read()
1930                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1931                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1932                         return
1933                 except ValueError, err:
1934                         # since this is the last-resort InfoExtractor, if
1935                         # this error is thrown, it'll be thrown here
1936                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1937                         return
1938
1939                 self.report_extraction(video_id)
1940                 # Start with something easy: JW Player in SWFObject
1941                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1942                 if mobj is None:
1943                         # Broaden the search a little bit
1944                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1945                 if mobj is None:
1946                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1947                         return
1948
1949                 # It's possible that one of the regexes
1950                 # matched, but returned an empty group:
1951                 if mobj.group(1) is None:
1952                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1953                         return
1954
1955                 video_url = urllib.unquote(mobj.group(1))
1956                 video_id  = os.path.basename(video_url)
1957
1958                 # here's a fun little line of code for you:
1959                 video_extension = os.path.splitext(video_id)[1][1:]
1960                 video_id        = os.path.splitext(video_id)[0]
1961
1962                 # it's tempting to parse this further, but you would
1963                 # have to take into account all the variations like
1964                 #   Video Title - Site Name
1965                 #   Site Name | Video Title
1966                 #   Video Title - Tagline | Site Name
1967                 # and so on and so forth; it's just not practical
1968                 mobj = re.search(r'<title>(.*)</title>', webpage)
1969                 if mobj is None:
1970                         self._downloader.trouble(u'ERROR: unable to extract title')
1971                         return
1972                 video_title = mobj.group(1).decode('utf-8')
1973                 video_title = sanitize_title(video_title)
1974                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1975
1976                 # video uploader is domain name
1977                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1978                 if mobj is None:
1979                         self._downloader.trouble(u'ERROR: unable to extract title')
1980                         return
1981                 video_uploader = mobj.group(1).decode('utf-8')
1982
1983                 try:
1984                         # Process video information
1985                         self._downloader.process_info({
1986                                 'id':           video_id.decode('utf-8'),
1987                                 'url':          video_url.decode('utf-8'),
1988                                 'uploader':     video_uploader,
1989                                 'upload_date':  u'NA',
1990                                 'title':        video_title,
1991                                 'stitle':       simple_title,
1992                                 'ext':          video_extension.decode('utf-8'),
1993                                 'format':       u'NA',
1994                                 'player_url':   None,
1995                         })
1996                 except UnavailableVideoError, err:
1997                         self._downloader.trouble(u'\nERROR: unable to download video')
1998
1999
2000 class YoutubeSearchIE(InfoExtractor):
2001         """Information Extractor for YouTube search queries."""
2002         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
2003         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2004         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2005         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2006         _youtube_ie = None
2007         _max_youtube_results = 1000
2008
2009         def __init__(self, youtube_ie, downloader=None):
2010                 InfoExtractor.__init__(self, downloader)
2011                 self._youtube_ie = youtube_ie
2012
2013         @staticmethod
2014         def suitable(url):
2015                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
2016
2017         def report_download_page(self, query, pagenum):
2018                 """Report attempt to download playlist page with given number."""
2019                 query = query.decode(preferredencoding())
2020                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2021
2022         def _real_initialize(self):
2023                 self._youtube_ie.initialize()
2024
2025         def _real_extract(self, query):
2026                 mobj = re.match(self._VALID_QUERY, query)
2027                 if mobj is None:
2028                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2029                         return
2030
2031                 prefix, query = query.split(':')
2032                 prefix = prefix[8:]
2033                 query  = query.encode('utf-8')
2034                 if prefix == '':
2035                         self._download_n_results(query, 1)
2036                         return
2037                 elif prefix == 'all':
2038                         self._download_n_results(query, self._max_youtube_results)
2039                         return
2040                 else:
2041                         try:
2042                                 n = long(prefix)
2043                                 if n <= 0:
2044                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2045                                         return
2046                                 elif n > self._max_youtube_results:
2047                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
2048                                         n = self._max_youtube_results
2049                                 self._download_n_results(query, n)
2050                                 return
2051                         except ValueError: # parsing prefix as integer fails
2052                                 self._download_n_results(query, 1)
2053                                 return
2054
2055         def _download_n_results(self, query, n):
2056                 """Downloads a specified number of results for a query"""
2057
2058                 video_ids = []
2059                 already_seen = set()
2060                 pagenum = 1
2061
2062                 while True:
2063                         self.report_download_page(query, pagenum)
2064                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2065                         request = urllib2.Request(result_url)
2066                         try:
2067                                 page = urllib2.urlopen(request).read()
2068                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2069                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2070                                 return
2071
2072                         # Extract video identifiers
2073                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2074                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2075                                 if video_id not in already_seen:
2076                                         video_ids.append(video_id)
2077                                         already_seen.add(video_id)
2078                                         if len(video_ids) == n:
2079                                                 # Specified n videos reached
2080                                                 for id in video_ids:
2081                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2082                                                 return
2083
2084                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2085                                 for id in video_ids:
2086                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2087                                 return
2088
2089                         pagenum = pagenum + 1
2090
2091 class GoogleSearchIE(InfoExtractor):
2092         """Information Extractor for Google Video search queries."""
2093         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
2094         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2095         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2096         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2097         _google_ie = None
2098         _max_google_results = 1000
2099
2100         def __init__(self, google_ie, downloader=None):
2101                 InfoExtractor.__init__(self, downloader)
2102                 self._google_ie = google_ie
2103
2104         @staticmethod
2105         def suitable(url):
2106                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
2107
2108         def report_download_page(self, query, pagenum):
2109                 """Report attempt to download playlist page with given number."""
2110                 query = query.decode(preferredencoding())
2111                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2112
2113         def _real_initialize(self):
2114                 self._google_ie.initialize()
2115
2116         def _real_extract(self, query):
2117                 mobj = re.match(self._VALID_QUERY, query)
2118                 if mobj is None:
2119                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2120                         return
2121
2122                 prefix, query = query.split(':')
2123                 prefix = prefix[8:]
2124                 query  = query.encode('utf-8')
2125                 if prefix == '':
2126                         self._download_n_results(query, 1)
2127                         return
2128                 elif prefix == 'all':
2129                         self._download_n_results(query, self._max_google_results)
2130                         return
2131                 else:
2132                         try:
2133                                 n = long(prefix)
2134                                 if n <= 0:
2135                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2136                                         return
2137                                 elif n > self._max_google_results:
2138                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
2139                                         n = self._max_google_results
2140                                 self._download_n_results(query, n)
2141                                 return
2142                         except ValueError: # parsing prefix as integer fails
2143                                 self._download_n_results(query, 1)
2144                                 return
2145
2146         def _download_n_results(self, query, n):
2147                 """Downloads a specified number of results for a query"""
2148
2149                 video_ids = []
2150                 already_seen = set()
2151                 pagenum = 1
2152
2153                 while True:
2154                         self.report_download_page(query, pagenum)
2155                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2156                         request = urllib2.Request(result_url)
2157                         try:
2158                                 page = urllib2.urlopen(request).read()
2159                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2160                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2161                                 return
2162
2163                         # Extract video identifiers
2164                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2165                                 video_id = mobj.group(1)
2166                                 if video_id not in already_seen:
2167                                         video_ids.append(video_id)
2168                                         already_seen.add(video_id)
2169                                         if len(video_ids) == n:
2170                                                 # Specified n videos reached
2171                                                 for id in video_ids:
2172                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2173                                                 return
2174
2175                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2176                                 for id in video_ids:
2177                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2178                                 return
2179
2180                         pagenum = pagenum + 1
2181
2182 class YahooSearchIE(InfoExtractor):
2183         """Information Extractor for Yahoo! Video search queries."""
2184         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
2185         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2186         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2187         _MORE_PAGES_INDICATOR = r'\s*Next'
2188         _yahoo_ie = None
2189         _max_yahoo_results = 1000
2190
2191         def __init__(self, yahoo_ie, downloader=None):
2192                 InfoExtractor.__init__(self, downloader)
2193                 self._yahoo_ie = yahoo_ie
2194
2195         @staticmethod
2196         def suitable(url):
2197                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2198
2199         def report_download_page(self, query, pagenum):
2200                 """Report attempt to download playlist page with given number."""
2201                 query = query.decode(preferredencoding())
2202                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2203
2204         def _real_initialize(self):
2205                 self._yahoo_ie.initialize()
2206
2207         def _real_extract(self, query):
2208                 mobj = re.match(self._VALID_QUERY, query)
2209                 if mobj is None:
2210                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2211                         return
2212
2213                 prefix, query = query.split(':')
2214                 prefix = prefix[8:]
2215                 query  = query.encode('utf-8')
2216                 if prefix == '':
2217                         self._download_n_results(query, 1)
2218                         return
2219                 elif prefix == 'all':
2220                         self._download_n_results(query, self._max_yahoo_results)
2221                         return
2222                 else:
2223                         try:
2224                                 n = long(prefix)
2225                                 if n <= 0:
2226                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2227                                         return
2228                                 elif n > self._max_yahoo_results:
2229                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
2230                                         n = self._max_yahoo_results
2231                                 self._download_n_results(query, n)
2232                                 return
2233                         except ValueError: # parsing prefix as integer fails
2234                                 self._download_n_results(query, 1)
2235                                 return
2236
2237         def _download_n_results(self, query, n):
2238                 """Downloads a specified number of results for a query"""
2239
2240                 video_ids = []
2241                 already_seen = set()
2242                 pagenum = 1
2243
2244                 while True:
2245                         self.report_download_page(query, pagenum)
2246                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2247                         request = urllib2.Request(result_url)
2248                         try:
2249                                 page = urllib2.urlopen(request).read()
2250                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2251                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2252                                 return
2253
2254                         # Extract video identifiers
2255                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2256                                 video_id = mobj.group(1)
2257                                 if video_id not in already_seen:
2258                                         video_ids.append(video_id)
2259                                         already_seen.add(video_id)
2260                                         if len(video_ids) == n:
2261                                                 # Specified n videos reached
2262                                                 for id in video_ids:
2263                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2264                                                 return
2265
2266                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2267                                 for id in video_ids:
2268                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2269                                 return
2270
2271                         pagenum = pagenum + 1
2272
2273 class YoutubePlaylistIE(InfoExtractor):
2274         """Information Extractor for YouTube playlists."""
2275
2276         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist)\?.*?(p|a)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2277         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2278         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2279         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2280         _youtube_ie = None
2281
2282         def __init__(self, youtube_ie, downloader=None):
2283                 InfoExtractor.__init__(self, downloader)
2284                 self._youtube_ie = youtube_ie
2285
2286         @staticmethod
2287         def suitable(url):
2288                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2289
2290         def report_download_page(self, playlist_id, pagenum):
2291                 """Report attempt to download playlist page with given number."""
2292                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2293
2294         def _real_initialize(self):
2295                 self._youtube_ie.initialize()
2296
2297         def _real_extract(self, url):
2298                 # Extract playlist id
2299                 mobj = re.match(self._VALID_URL, url)
2300                 if mobj is None:
2301                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2302                         return
2303
2304                 # Single video case
2305                 if mobj.group(3) is not None:
2306                         self._youtube_ie.extract(mobj.group(3))
2307                         return
2308
2309                 # Download playlist pages
2310                 # prefix is 'p' as default for playlists but there are other types that need extra care
2311                 playlist_prefix = mobj.group(1)
2312                 if playlist_prefix == 'a':
2313                         playlist_access = 'artist'
2314                 else:
2315                         playlist_prefix = 'p'
2316                         playlist_access = 'view_play_list'
2317                 playlist_id = mobj.group(2)
2318                 video_ids = []
2319                 pagenum = 1
2320
2321                 while True:
2322                         self.report_download_page(playlist_id, pagenum)
2323                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2324                         try:
2325                                 page = urllib2.urlopen(request).read()
2326                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2327                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2328                                 return
2329
2330                         # Extract video identifiers
2331                         ids_in_page = []
2332                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2333                                 if mobj.group(1) not in ids_in_page:
2334                                         ids_in_page.append(mobj.group(1))
2335                         video_ids.extend(ids_in_page)
2336
2337                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2338                                 break
2339                         pagenum = pagenum + 1
2340
2341                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2342                 playlistend = self._downloader.params.get('playlistend', -1)
2343                 video_ids = video_ids[playliststart:playlistend]
2344
2345                 for id in video_ids:
2346                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2347                 return
2348
2349 class YoutubeUserIE(InfoExtractor):
2350         """Information Extractor for YouTube users."""
2351
2352         _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2353         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2354         _GDATA_PAGE_SIZE = 50
2355         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2356         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2357         _youtube_ie = None
2358
2359         def __init__(self, youtube_ie, downloader=None):
2360                 InfoExtractor.__init__(self, downloader)
2361                 self._youtube_ie = youtube_ie
2362
2363         @staticmethod
2364         def suitable(url):
2365                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2366
2367         def report_download_page(self, username, start_index):
2368                 """Report attempt to download user page."""
2369                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2370                                            (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2371
2372         def _real_initialize(self):
2373                 self._youtube_ie.initialize()
2374
2375         def _real_extract(self, url):
2376                 # Extract username
2377                 mobj = re.match(self._VALID_URL, url)
2378                 if mobj is None:
2379                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2380                         return
2381
2382                 username = mobj.group(1)
2383
2384                 # Download video ids using YouTube Data API. Result size per
2385                 # query is limited (currently to 50 videos) so we need to query
2386                 # page by page until there are no video ids - it means we got
2387                 # all of them.
2388
2389                 video_ids = []
2390                 pagenum = 0
2391
2392                 while True:
2393                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2394                         self.report_download_page(username, start_index)
2395
2396                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2397
2398                         try:
2399                                 page = urllib2.urlopen(request).read()
2400                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2401                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2402                                 return
2403
2404                         # Extract video identifiers
2405                         ids_in_page = []
2406
2407                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2408                                 if mobj.group(1) not in ids_in_page:
2409                                         ids_in_page.append(mobj.group(1))
2410
2411                         video_ids.extend(ids_in_page)
2412
2413                         # A little optimization - if current page is not
2414                         # "full", ie. does not contain PAGE_SIZE video ids then
2415                         # we can assume that this page is the last one - there
2416                         # are no more ids on further pages - no need to query
2417                         # again.
2418
2419                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2420                                 break
2421
2422                         pagenum += 1
2423
2424                 all_ids_count = len(video_ids)
2425                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2426                 playlistend = self._downloader.params.get('playlistend', -1)
2427
2428                 if playlistend == -1:
2429                         video_ids = video_ids[playliststart:]
2430                 else:
2431                         video_ids = video_ids[playliststart:playlistend]
2432
2433                 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2434                                            (username, all_ids_count, len(video_ids)))
2435
2436                 for video_id in video_ids:
2437                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2438
2439
2440 class DepositFilesIE(InfoExtractor):
2441         """Information extractor for depositfiles.com"""
2442
2443         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2444
2445         def __init__(self, downloader=None):
2446                 InfoExtractor.__init__(self, downloader)
2447
2448         @staticmethod
2449         def suitable(url):
2450                 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2451
2452         def report_download_webpage(self, file_id):
2453                 """Report webpage download."""
2454                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2455
2456         def report_extraction(self, file_id):
2457                 """Report information extraction."""
2458                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2459
2460         def _real_initialize(self):
2461                 return
2462
2463         def _real_extract(self, url):
2464                 # At this point we have a new file
2465                 self._downloader.increment_downloads()
2466
2467                 file_id = url.split('/')[-1]
2468                 # Rebuild url in english locale
2469                 url = 'http://depositfiles.com/en/files/' + file_id
2470
2471                 # Retrieve file webpage with 'Free download' button pressed
2472                 free_download_indication = { 'gateway_result' : '1' }
2473                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2474                 try:
2475                         self.report_download_webpage(file_id)
2476                         webpage = urllib2.urlopen(request).read()
2477                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2478                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2479                         return
2480
2481                 # Search for the real file URL
2482                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2483                 if (mobj is None) or (mobj.group(1) is None):
2484                         # Try to figure out reason of the error.
2485                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2486                         if (mobj is not None) and (mobj.group(1) is not None):
2487                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2488                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2489                         else:
2490                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2491                         return
2492
2493                 file_url = mobj.group(1)
2494                 file_extension = os.path.splitext(file_url)[1][1:]
2495
2496                 # Search for file title
2497                 mobj = re.search(r'<b title="(.*?)">', webpage)
2498                 if mobj is None:
2499                         self._downloader.trouble(u'ERROR: unable to extract title')
2500                         return
2501                 file_title = mobj.group(1).decode('utf-8')
2502
2503                 try:
2504                         # Process file information
2505                         self._downloader.process_info({
2506                                 'id':           file_id.decode('utf-8'),
2507                                 'url':          file_url.decode('utf-8'),
2508                                 'uploader':     u'NA',
2509                                 'upload_date':  u'NA',
2510                                 'title':        file_title,
2511                                 'stitle':       file_title,
2512                                 'ext':          file_extension.decode('utf-8'),
2513                                 'format':       u'NA',
2514                                 'player_url':   None,
2515                         })
2516                 except UnavailableVideoError, err:
2517                         self._downloader.trouble(u'ERROR: unable to download file')
2518
2519 class FacebookIE(InfoExtractor):
2520         """Information Extractor for Facebook"""
2521
2522         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2523         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2524         _NETRC_MACHINE = 'facebook'
2525         _available_formats = ['highqual', 'lowqual']
2526         _video_extensions = {
2527                 'highqual': 'mp4',
2528                 'lowqual': 'mp4',
2529         }
2530
2531         def __init__(self, downloader=None):
2532                 InfoExtractor.__init__(self, downloader)
2533
2534         @staticmethod
2535         def suitable(url):
2536                 return (re.match(FacebookIE._VALID_URL, url) is not None)
2537
2538         def _reporter(self, message):
2539                 """Add header and report message."""
2540                 self._downloader.to_screen(u'[facebook] %s' % message)
2541
2542         def report_login(self):
2543                 """Report attempt to log in."""
2544                 self._reporter(u'Logging in')
2545
2546         def report_video_webpage_download(self, video_id):
2547                 """Report attempt to download video webpage."""
2548                 self._reporter(u'%s: Downloading video webpage' % video_id)
2549
2550         def report_information_extraction(self, video_id):
2551                 """Report attempt to extract video information."""
2552                 self._reporter(u'%s: Extracting video information' % video_id)
2553
2554         def _parse_page(self, video_webpage):
2555                 """Extract video information from page"""
2556                 # General data
2557                 data = {'title': r'class="video_title datawrap">(.*?)</',
2558                         'description': r'<div class="datawrap">(.*?)</div>',
2559                         'owner': r'\("video_owner_name", "(.*?)"\)',
2560                         'upload_date': r'data-date="(.*?)"',
2561                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2562                         }
2563                 video_info = {}
2564                 for piece in data.keys():
2565                         mobj = re.search(data[piece], video_webpage)
2566                         if mobj is not None:
2567                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2568
2569                 # Video urls
2570                 video_urls = {}
2571                 for fmt in self._available_formats:
2572                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2573                         if mobj is not None:
2574                                 # URL is in a Javascript segment inside an escaped Unicode format within
2575                                 # the generally utf-8 page
2576                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2577                 video_info['video_urls'] = video_urls
2578
2579                 return video_info
2580
2581         def _real_initialize(self):
2582                 if self._downloader is None:
2583                         return
2584
2585                 useremail = None
2586                 password = None
2587                 downloader_params = self._downloader.params
2588
2589                 # Attempt to use provided username and password or .netrc data
2590                 if downloader_params.get('username', None) is not None:
2591                         useremail = downloader_params['username']
2592                         password = downloader_params['password']
2593                 elif downloader_params.get('usenetrc', False):
2594                         try:
2595                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2596                                 if info is not None:
2597                                         useremail = info[0]
2598                                         password = info[2]
2599                                 else:
2600                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2601                         except (IOError, netrc.NetrcParseError), err:
2602                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2603                                 return
2604
2605                 if useremail is None:
2606                         return
2607
2608                 # Log in
2609                 login_form = {
2610                         'email': useremail,
2611                         'pass': password,
2612                         'login': 'Log+In'
2613                         }
2614                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2615                 try:
2616                         self.report_login()
2617                         login_results = urllib2.urlopen(request).read()
2618                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2619                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2620                                 return
2621                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2622                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2623                         return
2624
2625         def _real_extract(self, url):
2626                 mobj = re.match(self._VALID_URL, url)
2627                 if mobj is None:
2628                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2629                         return
2630                 video_id = mobj.group('ID')
2631
2632                 # Get video webpage
2633                 self.report_video_webpage_download(video_id)
2634                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2635                 try:
2636                         page = urllib2.urlopen(request)
2637                         video_webpage = page.read()
2638                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2639                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2640                         return
2641
2642                 # Start extracting information
2643                 self.report_information_extraction(video_id)
2644
2645                 # Extract information
2646                 video_info = self._parse_page(video_webpage)
2647
2648                 # uploader
2649                 if 'owner' not in video_info:
2650                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2651                         return
2652                 video_uploader = video_info['owner']
2653
2654                 # title
2655                 if 'title' not in video_info:
2656                         self._downloader.trouble(u'ERROR: unable to extract video title')
2657                         return
2658                 video_title = video_info['title']
2659                 video_title = video_title.decode('utf-8')
2660                 video_title = sanitize_title(video_title)
2661
2662                 # simplified title
2663                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2664                 simple_title = simple_title.strip(ur'_')
2665
2666                 # thumbnail image
2667                 if 'thumbnail' not in video_info:
2668                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2669                         video_thumbnail = ''
2670                 else:
2671                         video_thumbnail = video_info['thumbnail']
2672
2673                 # upload date
2674                 upload_date = u'NA'
2675                 if 'upload_date' in video_info:
2676                         upload_time = video_info['upload_date']
2677                         timetuple = email.utils.parsedate_tz(upload_time)
2678                         if timetuple is not None:
2679                                 try:
2680                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2681                                 except:
2682                                         pass
2683
2684                 # description
2685                 video_description = video_info.get('description', 'No description available.')
2686
2687                 url_map = video_info['video_urls']
2688                 if len(url_map.keys()) > 0:
2689                         # Decide which formats to download
2690                         req_format = self._downloader.params.get('format', None)
2691                         format_limit = self._downloader.params.get('format_limit', None)
2692
2693                         if format_limit is not None and format_limit in self._available_formats:
2694                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2695                         else:
2696                                 format_list = self._available_formats
2697                         existing_formats = [x for x in format_list if x in url_map]
2698                         if len(existing_formats) == 0:
2699                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2700                                 return
2701                         if req_format is None:
2702                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2703                         elif req_format == '-1':
2704                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2705                         else:
2706                                 # Specific format
2707                                 if req_format not in url_map:
2708                                         self._downloader.trouble(u'ERROR: requested format not available')
2709                                         return
2710                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2711
2712                 for format_param, video_real_url in video_url_list:
2713
2714                         # At this point we have a new video
2715                         self._downloader.increment_downloads()
2716
2717                         # Extension
2718                         video_extension = self._video_extensions.get(format_param, 'mp4')
2719
2720                         # Find the video URL in fmt_url_map or conn paramters
2721                         try:
2722                                 # Process video information
2723                                 self._downloader.process_info({
2724                                         'id':           video_id.decode('utf-8'),
2725                                         'url':          video_real_url.decode('utf-8'),
2726                                         'uploader':     video_uploader.decode('utf-8'),
2727                                         'upload_date':  upload_date,
2728                                         'title':        video_title,
2729                                         'stitle':       simple_title,
2730                                         'ext':          video_extension.decode('utf-8'),
2731                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2732                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2733                                         'description':  video_description.decode('utf-8'),
2734                                         'player_url':   None,
2735                                 })
2736                         except UnavailableVideoError, err:
2737                                 self._downloader.trouble(u'\nERROR: unable to download video')
2738
2739 class BlipTVIE(InfoExtractor):
2740         """Information extractor for blip.tv"""
2741
2742         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip.tv(/.+)$'
2743         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2744
2745         @staticmethod
2746         def suitable(url):
2747                 return (re.match(BlipTVIE._VALID_URL, url) is not None)
2748
2749         def report_extraction(self, file_id):
2750                 """Report information extraction."""
2751                 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2752
2753         def _simplify_title(self, title):
2754                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2755                 res = res.strip(ur'_')
2756                 return res
2757
2758         def _real_extract(self, url):
2759                 mobj = re.match(self._VALID_URL, url)
2760                 if mobj is None:
2761                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2762                         return
2763
2764                 json_url = url + ('&' if '?' in url else '?') + 'skin=json&version=2&no_wrap=1'
2765                 request = urllib2.Request(json_url)
2766                 self.report_extraction(mobj.group(1))
2767                 try:
2768                         json_code = urllib2.urlopen(request).read()
2769                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2770                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2771                         return
2772                 try:
2773                         json_data = json.loads(json_code)
2774                         data = json_data['Post'] if 'Post' in json_data else json_data
2775
2776                         upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2777                         video_url = data['media']['url']
2778                         umobj = re.match(self._URL_EXT, video_url)
2779                         if umobj is None:
2780                                 raise ValueError('Can not determine filename extension')
2781                         ext = umobj.group(1)
2782
2783                         self._downloader.increment_downloads()
2784
2785                         info = {
2786                                 'id': data['item_id'],
2787                                 'url': video_url,
2788                                 'uploader': data['display_name'],
2789                                 'upload_date': upload_date,
2790                                 'title': data['title'],
2791                                 'stitle': self._simplify_title(data['title']),
2792                                 'ext': ext,
2793                                 'format': data['media']['mimeType'],
2794                                 'thumbnail': data['thumbnailUrl'],
2795                                 'description': data['description'],
2796                                 'player_url': data['embedUrl']
2797                         }
2798                 except (ValueError,KeyError), err:
2799                         self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2800                         return
2801
2802                 try:
2803                         self._downloader.process_info(info)
2804                 except UnavailableVideoError, err:
2805                         self._downloader.trouble(u'\nERROR: unable to download video')
2806
2807
2808 class PostProcessor(object):
2809         """Post Processor class.
2810
2811         PostProcessor objects can be added to downloaders with their
2812         add_post_processor() method. When the downloader has finished a
2813         successful download, it will take its internal chain of PostProcessors
2814         and start calling the run() method on each one of them, first with
2815         an initial argument and then with the returned value of the previous
2816         PostProcessor.
2817
2818         The chain will be stopped if one of them ever returns None or the end
2819         of the chain is reached.
2820
2821         PostProcessor objects follow a "mutual registration" process similar
2822         to InfoExtractor objects.
2823         """
2824
2825         _downloader = None
2826
2827         def __init__(self, downloader=None):
2828                 self._downloader = downloader
2829
2830         def set_downloader(self, downloader):
2831                 """Sets the downloader for this PP."""
2832                 self._downloader = downloader
2833
2834         def run(self, information):
2835                 """Run the PostProcessor.
2836
2837                 The "information" argument is a dictionary like the ones
2838                 composed by InfoExtractors. The only difference is that this
2839                 one has an extra field called "filepath" that points to the
2840                 downloaded file.
2841
2842                 When this method returns None, the postprocessing chain is
2843                 stopped. However, this method may return an information
2844                 dictionary that will be passed to the next postprocessing
2845                 object in the chain. It can be the one it received after
2846                 changing some fields.
2847
2848                 In addition, this method may raise a PostProcessingError
2849                 exception that will be taken into account by the downloader
2850                 it was called from.
2851                 """
2852                 return information # by default, do nothing
2853
2854 class FFmpegExtractAudioPP(PostProcessor):
2855
2856         def __init__(self, downloader=None, preferredcodec=None):
2857                 PostProcessor.__init__(self, downloader)
2858                 if preferredcodec is None:
2859                         preferredcodec = 'best'
2860                 self._preferredcodec = preferredcodec
2861
2862         @staticmethod
2863         def get_audio_codec(path):
2864                 try:
2865                         cmd = ['ffprobe', '-show_streams', '--', path]
2866                         handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
2867                         output = handle.communicate()[0]
2868                         if handle.wait() != 0:
2869                                 return None
2870                 except (IOError, OSError):
2871                         return None
2872                 audio_codec = None
2873                 for line in output.split('\n'):
2874                         if line.startswith('codec_name='):
2875                                 audio_codec = line.split('=')[1].strip()
2876                         elif line.strip() == 'codec_type=audio' and audio_codec is not None:
2877                                 return audio_codec
2878                 return None
2879
2880         @staticmethod
2881         def run_ffmpeg(path, out_path, codec, more_opts):
2882                 try:
2883                         cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
2884                         ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
2885                         return (ret == 0)
2886                 except (IOError, OSError):
2887                         return False
2888
2889         def run(self, information):
2890                 path = information['filepath']
2891
2892                 filecodec = self.get_audio_codec(path)
2893                 if filecodec is None:
2894                         self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
2895                         return None
2896
2897                 more_opts = []
2898                 if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
2899                         if filecodec == 'aac' or filecodec == 'mp3':
2900                                 # Lossless if possible
2901                                 acodec = 'copy'
2902                                 extension = filecodec
2903                                 if filecodec == 'aac':
2904                                         more_opts = ['-f', 'adts']
2905                         else:
2906                                 # MP3 otherwise.
2907                                 acodec = 'libmp3lame'
2908                                 extension = 'mp3'
2909                                 more_opts = ['-ab', '128k']
2910                 else:
2911                         # We convert the audio (lossy)
2912                         acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
2913                         extension = self._preferredcodec
2914                         more_opts = ['-ab', '128k']
2915                         if self._preferredcodec == 'aac':
2916                                 more_opts += ['-f', 'adts']
2917
2918                 (prefix, ext) = os.path.splitext(path)
2919                 new_path = prefix + '.' + extension
2920                 self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
2921                 status = self.run_ffmpeg(path, new_path, acodec, more_opts)
2922
2923                 if not status:
2924                         self._downloader.to_stderr(u'WARNING: error running ffmpeg')
2925                         return None
2926
2927                 try:
2928                         os.remove(path)
2929                 except (IOError, OSError):
2930                         self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
2931                         return None
2932
2933                 information['filepath'] = new_path
2934                 return information
2935
2936 ### MAIN PROGRAM ###
2937 if __name__ == '__main__':
2938         try:
2939                 # Modules needed only when running the main program
2940                 import getpass
2941                 import optparse
2942
2943                 # Function to update the program file with the latest version from the repository.
2944                 def update_self(downloader, filename):
2945                         # Note: downloader only used for options
2946                         if not os.access(filename, os.W_OK):
2947                                 sys.exit('ERROR: no write permissions on %s' % filename)
2948
2949                         downloader.to_screen('Updating to latest stable version...')
2950                         try:
2951                                 latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2952                                 latest_version = urllib.urlopen(latest_url).read().strip()
2953                                 prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2954                                 newcontent = urllib.urlopen(prog_url).read()
2955                         except (IOError, OSError), err:
2956                                 sys.exit('ERROR: unable to download latest version')
2957                         try:
2958                                 stream = open(filename, 'w')
2959                                 stream.write(newcontent)
2960                                 stream.close()
2961                         except (IOError, OSError), err:
2962                                 sys.exit('ERROR: unable to overwrite current version')
2963                         downloader.to_screen('Updated to version %s' % latest_version)
2964
2965                 # Parse command line
2966                 parser = optparse.OptionParser(
2967                         usage='Usage: %prog [options] url...',
2968                         version='2011.07.09-phihag',
2969                         conflict_handler='resolve',
2970                 )
2971
2972                 parser.add_option('-h', '--help',
2973                                 action='help', help='print this help text and exit')
2974                 parser.add_option('-v', '--version',
2975                                 action='version', help='print program version and exit')
2976                 parser.add_option('-U', '--update',
2977                                 action='store_true', dest='update_self', help='update this program to latest stable version')
2978                 parser.add_option('-i', '--ignore-errors',
2979                                 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2980                 parser.add_option('-r', '--rate-limit',
2981                                 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2982                 parser.add_option('-R', '--retries',
2983                                 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2984                 parser.add_option('--playlist-start',
2985                                 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2986                 parser.add_option('--playlist-end',
2987                                 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
2988                 parser.add_option('--dump-user-agent',
2989                                 action='store_true', dest='dump_user_agent',
2990                                 help='display the current browser identification', default=False)
2991
2992                 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2993                 authentication.add_option('-u', '--username',
2994                                 dest='username', metavar='USERNAME', help='account username')
2995                 authentication.add_option('-p', '--password',
2996                                 dest='password', metavar='PASSWORD', help='account password')
2997                 authentication.add_option('-n', '--netrc',
2998                                 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2999                 parser.add_option_group(authentication)
3000
3001                 video_format = optparse.OptionGroup(parser, 'Video Format Options')
3002                 video_format.add_option('-f', '--format',
3003                                 action='store', dest='format', metavar='FORMAT', help='video format code')
3004                 video_format.add_option('--all-formats',
3005                                 action='store_const', dest='format', help='download all available video formats', const='-1')
3006                 video_format.add_option('--max-quality',
3007                                 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
3008                 parser.add_option_group(video_format)
3009
3010                 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
3011                 verbosity.add_option('-q', '--quiet',
3012                                 action='store_true', dest='quiet', help='activates quiet mode', default=False)
3013                 verbosity.add_option('-s', '--simulate',
3014                                 action='store_true', dest='simulate', help='do not download video', default=False)
3015                 verbosity.add_option('-g', '--get-url',
3016                                 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
3017                 verbosity.add_option('-e', '--get-title',
3018                                 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
3019                 verbosity.add_option('--get-thumbnail',
3020                                 action='store_true', dest='getthumbnail',
3021                                 help='simulate, quiet but print thumbnail URL', default=False)
3022                 verbosity.add_option('--get-description',
3023                                 action='store_true', dest='getdescription',
3024                                 help='simulate, quiet but print video description', default=False)
3025                 verbosity.add_option('--get-filename',
3026                                 action='store_true', dest='getfilename',
3027                                 help='simulate, quiet but print output filename', default=False)
3028                 verbosity.add_option('--no-progress',
3029                                 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
3030                 verbosity.add_option('--console-title',
3031                                 action='store_true', dest='consoletitle',
3032                                 help='display progress in console titlebar', default=False)
3033                 parser.add_option_group(verbosity)
3034
3035                 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
3036                 filesystem.add_option('-t', '--title',
3037                                 action='store_true', dest='usetitle', help='use title in file name', default=False)
3038                 filesystem.add_option('-l', '--literal',
3039                                 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
3040                 filesystem.add_option('-A', '--auto-number',
3041                                 action='store_true', dest='autonumber',
3042                                 help='number downloaded files starting from 00000', default=False)
3043                 filesystem.add_option('-o', '--output',
3044                                 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
3045                 filesystem.add_option('-a', '--batch-file',
3046                                 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
3047                 filesystem.add_option('-w', '--no-overwrites',
3048                                 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
3049                 filesystem.add_option('-c', '--continue',
3050                                 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
3051                 filesystem.add_option('--cookies',
3052                                 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
3053                 filesystem.add_option('--no-part',
3054                                 action='store_true', dest='nopart', help='do not use .part files', default=False)
3055                 filesystem.add_option('--no-mtime',
3056                                 action='store_false', dest='updatetime',
3057                                 help='do not use the Last-modified header to set the file modification time', default=True)
3058                 filesystem.add_option('--write-description',
3059                                 action='store_true', dest='writedescription',
3060                                 help='write video description to a .description file', default=False)
3061                 filesystem.add_option('--write-info-json',
3062                                 action='store_true', dest='writeinfojson',
3063                                 help='write video metadata to a .info.json file', default=False)
3064                 parser.add_option_group(filesystem)
3065
3066                 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
3067                 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
3068                                 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
3069                 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
3070                                 help='"best", "aac" or "mp3"; best by default')
3071                 parser.add_option_group(postproc)
3072
3073                 (opts, args) = parser.parse_args()
3074
3075                 # Open appropriate CookieJar
3076                 if opts.cookiefile is None:
3077                         jar = cookielib.CookieJar()
3078                 else:
3079                         try:
3080                                 jar = cookielib.MozillaCookieJar(opts.cookiefile)
3081                                 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3082                                         jar.load()
3083                         except (IOError, OSError), err:
3084                                 sys.exit(u'ERROR: unable to open cookie file')
3085
3086                 # Dump user agent
3087                 if opts.dump_user_agent:
3088                         print std_headers['User-Agent']
3089                         sys.exit(0)
3090
3091                 # General configuration
3092                 cookie_processor = urllib2.HTTPCookieProcessor(jar)
3093                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
3094                 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3095
3096                 # Batch file verification
3097                 batchurls = []
3098                 if opts.batchfile is not None:
3099                         try:
3100                                 if opts.batchfile == '-':
3101                                         batchfd = sys.stdin
3102                                 else:
3103                                         batchfd = open(opts.batchfile, 'r')
3104                                 batchurls = batchfd.readlines()
3105                                 batchurls = [x.strip() for x in batchurls]
3106                                 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3107                         except IOError:
3108                                 sys.exit(u'ERROR: batch file could not be read')
3109                 all_urls = batchurls + args
3110
3111                 # Conflicting, missing and erroneous options
3112                 if opts.usenetrc and (opts.username is not None or opts.password is not None):
3113                         parser.error(u'using .netrc conflicts with giving username/password')
3114                 if opts.password is not None and opts.username is None:
3115                         parser.error(u'account username missing')
3116                 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3117                         parser.error(u'using output template conflicts with using title, literal title or auto number')
3118                 if opts.usetitle and opts.useliteral:
3119                         parser.error(u'using title conflicts with using literal title')
3120                 if opts.username is not None and opts.password is None:
3121                         opts.password = getpass.getpass(u'Type account password and press return:')
3122                 if opts.ratelimit is not None:
3123                         numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3124                         if numeric_limit is None:
3125                                 parser.error(u'invalid rate limit specified')
3126                         opts.ratelimit = numeric_limit
3127                 if opts.retries is not None:
3128                         try:
3129                                 opts.retries = long(opts.retries)
3130                         except (TypeError, ValueError), err:
3131                                 parser.error(u'invalid retry count specified')
3132                 try:
3133                         opts.playliststart = long(opts.playliststart)
3134                         if opts.playliststart <= 0:
3135                                 raise ValueError
3136                 except (TypeError, ValueError), err:
3137                         parser.error(u'invalid playlist start number specified')
3138                 try:
3139                         opts.playlistend = long(opts.playlistend)
3140                         if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3141                                 raise ValueError
3142                 except (TypeError, ValueError), err:
3143                         parser.error(u'invalid playlist end number specified')
3144                 if opts.extractaudio:
3145                         if opts.audioformat not in ['best', 'aac', 'mp3']:
3146                                 parser.error(u'invalid audio format specified')
3147
3148                 # Information extractors
3149                 youtube_ie = YoutubeIE()
3150                 metacafe_ie = MetacafeIE(youtube_ie)
3151                 dailymotion_ie = DailymotionIE()
3152                 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
3153                 youtube_user_ie = YoutubeUserIE(youtube_ie)
3154                 youtube_search_ie = YoutubeSearchIE(youtube_ie)
3155                 google_ie = GoogleIE()
3156                 google_search_ie = GoogleSearchIE(google_ie)
3157                 photobucket_ie = PhotobucketIE()
3158                 yahoo_ie = YahooIE()
3159                 yahoo_search_ie = YahooSearchIE(yahoo_ie)
3160                 deposit_files_ie = DepositFilesIE()
3161                 facebook_ie = FacebookIE()
3162                 bliptv_ie = BlipTVIE()
3163                 generic_ie = GenericIE()
3164
3165                 # File downloader
3166                 fd = FileDownloader({
3167                         'usenetrc': opts.usenetrc,
3168                         'username': opts.username,
3169                         'password': opts.password,
3170                         'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3171                         'forceurl': opts.geturl,
3172                         'forcetitle': opts.gettitle,
3173                         'forcethumbnail': opts.getthumbnail,
3174                         'forcedescription': opts.getdescription,
3175                         'forcefilename': opts.getfilename,
3176                         'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3177                         'format': opts.format,
3178                         'format_limit': opts.format_limit,
3179                         'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3180                                 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3181                                 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3182                                 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3183                                 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3184                                 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3185                                 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3186                                 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3187                                 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3188                                 or u'%(id)s.%(ext)s'),
3189                         'ignoreerrors': opts.ignoreerrors,
3190                         'ratelimit': opts.ratelimit,
3191                         'nooverwrites': opts.nooverwrites,
3192                         'retries': opts.retries,
3193                         'continuedl': opts.continue_dl,
3194                         'noprogress': opts.noprogress,
3195                         'playliststart': opts.playliststart,
3196                         'playlistend': opts.playlistend,
3197                         'logtostderr': opts.outtmpl == '-',
3198                         'consoletitle': opts.consoletitle,
3199                         'nopart': opts.nopart,
3200                         'updatetime': opts.updatetime,
3201                         'writedescription': opts.writedescription,
3202                         'writeinfojson': opts.writeinfojson,
3203                         })
3204                 fd.add_info_extractor(youtube_search_ie)
3205                 fd.add_info_extractor(youtube_pl_ie)
3206                 fd.add_info_extractor(youtube_user_ie)
3207                 fd.add_info_extractor(metacafe_ie)
3208                 fd.add_info_extractor(dailymotion_ie)
3209                 fd.add_info_extractor(youtube_ie)
3210                 fd.add_info_extractor(google_ie)
3211                 fd.add_info_extractor(google_search_ie)
3212                 fd.add_info_extractor(photobucket_ie)
3213                 fd.add_info_extractor(yahoo_ie)
3214                 fd.add_info_extractor(yahoo_search_ie)
3215                 fd.add_info_extractor(deposit_files_ie)
3216                 fd.add_info_extractor(facebook_ie)
3217                 fd.add_info_extractor(bliptv_ie)
3218
3219                 # This must come last since it's the
3220                 # fallback if none of the others work
3221                 fd.add_info_extractor(generic_ie)
3222
3223                 # PostProcessors
3224                 if opts.extractaudio:
3225                         fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
3226
3227                 # Update version
3228                 if opts.update_self:
3229                         update_self(fd, sys.argv[0])
3230
3231                 # Maybe do nothing
3232                 if len(all_urls) < 1:
3233                         if not opts.update_self:
3234                                 parser.error(u'you must provide at least one URL')
3235                         else:
3236                                 sys.exit()
3237                 retcode = fd.download(all_urls)
3238
3239                 # Dump cookie jar if requested
3240                 if opts.cookiefile is not None:
3241                         try:
3242                                 jar.save()
3243                         except (IOError, OSError), err:
3244                                 sys.exit(u'ERROR: unable to save cookie jar')
3245
3246                 sys.exit(retcode)
3247
3248         except DownloadError:
3249                 sys.exit(1)
3250         except SameFileError:
3251                 sys.exit(u'ERROR: fixed output name but more than one file to download')
3252         except KeyboardInterrupt:
3253                 sys.exit(u'\nERROR: Interrupted by user')