blip.tv: Handle direct URLs (Thanks to Bahman)
[youtube-dl.git] / youtube-dl
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Contributors, listed as a tuple of names.
__author__  = (
        'Ricardo Garcia Gonzalez',
        'Danny Colligan',
        'Benjamin Johnson',
        'Vasyl\' Vavrychuk',
        'Witold Baryluk',
        'Paweł Paprota',
        'Gergely Imreh',
        'Rogério Brito',
        'Philipp Hagemeister',
        'Sören Schulze',
        )

__license__ = 'Public Domain'
# Date-based version string (YYYY.MM.DD with an optional letter suffix).
__version__ = '2011.09.18c'

# Canonical location of the newest script version; presumably fetched by
# the self-update code — confirm against the update handling below.
UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
21
22 import cookielib
23 import datetime
24 import gzip
25 import htmlentitydefs
26 import HTMLParser
27 import httplib
28 import locale
29 import math
30 import netrc
31 import os
32 import os.path
33 import re
34 import socket
35 import string
36 import subprocess
37 import sys
38 import time
39 import urllib
40 import urllib2
41 import warnings
42 import zlib
43
44 if os.name == 'nt':
45         import ctypes
46
47 try:
48         import email.utils
49 except ImportError: # Python 2.4
50         import email.Utils
51 try:
52         import cStringIO as StringIO
53 except ImportError:
54         import StringIO
55
56 # parse_qs was moved from the cgi module to the urlparse module recently.
57 try:
58         from urlparse import parse_qs
59 except ImportError:
60         from cgi import parse_qs
61
62 try:
63         import lxml.etree
64 except ImportError:
65         pass # Handled below
66
67 try:
68         import xml.etree.ElementTree
69 except ImportError: # Python<2.5: Not officially supported, but let it slip
70         warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
71
# Default HTTP headers added to every request by YoutubeDLHandler;
# impersonates a desktop Firefox browser.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-us,en;q=0.5',
}

# Unicode string of characters considered safe for simplified titles:
# ASCII letters and digits only.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
81
# Use the stdlib json module when available (Python >= 2.6); otherwise fall
# back to a bundled copy of trivialjson, a minimal pure-Python JSON parser.
try:
        import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
        import re
        class json(object):
                """Drop-in stand-in exposing the only API used here: json.loads()."""
                @staticmethod
                def loads(s):
                        # Decode the whole document up front; every index below
                        # refers to this unicode string.
                        s = s.decode('UTF-8')
                        def raiseError(msg, i):
                                # Uniform error report including position and remaining input.
                                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
                        def skipSpace(i, expectMore=True):
                                # Advance past JSON whitespace; optionally require
                                # that more input follows.
                                while i < len(s) and s[i] in ' \t\r\n':
                                        i += 1
                                if expectMore:
                                        if i >= len(s):
                                                raiseError('Premature end', i)
                                return i
                        def decodeEscape(match):
                                # Translate one backslash escape (including \uXXXX
                                # and UTF-16 surrogate pairs) to its character.
                                esc = match.group(1)
                                _STATIC = {
                                        '"': '"',
                                        '\\': '\\',
                                        '/': '/',
                                        'b': unichr(0x8),
                                        'f': unichr(0xc),
                                        'n': '\n',
                                        'r': '\r',
                                        't': '\t',
                                }
                                if esc in _STATIC:
                                        return _STATIC[esc]
                                if esc[0] == 'u':
                                        if len(esc) == 1+4:
                                                return unichr(int(esc[1:5], 16))
                                        if len(esc) == 5+6 and esc[5:7] == '\\u':
                                                # Surrogate pair: combine high and low halves.
                                                hi = int(esc[1:5], 16)
                                                low = int(esc[7:11], 16)
                                                return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                                raise ValueError('Unknown escape ' + str(esc))
                        def parseString(i):
                                i += 1
                                e = i
                                # Locate the closing quote, skipping quotes preceded
                                # by an odd number of backslashes (i.e. escaped ones).
                                while True:
                                        e = s.index('"', e)
                                        bslashes = 0
                                        while s[e-bslashes-1] == '\\':
                                                bslashes += 1
                                        if bslashes % 2 == 1:
                                                e += 1
                                                continue
                                        break
                                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                                stri = rexp.sub(decodeEscape, s[i:e])
                                return (e+1,stri)
                        def parseObj(i):
                                i += 1
                                res = {}
                                i = skipSpace(i)
                                if s[i] == '}': # Empty dictionary
                                        return (i+1,res)
                                while True:
                                        if s[i] != '"':
                                                raiseError('Expected a string object key', i)
                                        i,key = parseString(i)
                                        i = skipSpace(i)
                                        if i >= len(s) or s[i] != ':':
                                                raiseError('Expected a colon', i)
                                        i,val = parse(i+1)
                                        res[key] = val
                                        i = skipSpace(i)
                                        if s[i] == '}':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected comma or closing curly brace', i)
                                        i = skipSpace(i+1)
                        def parseArray(i):
                                res = []
                                i = skipSpace(i+1)
                                if s[i] == ']': # Empty array
                                        return (i+1,res)
                                while True:
                                        i,val = parse(i)
                                        res.append(val)
                                        i = skipSpace(i) # Raise exception if premature end
                                        if s[i] == ']':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected a comma or closing bracket', i)
                                        i = skipSpace(i+1)
                        def parseDiscrete(i):
                                # The three bare JSON literals.
                                for k,v in {'true': True, 'false': False, 'null': None}.items():
                                        if s.startswith(k, i):
                                                return (i+len(k), v)
                                raiseError('Not a boolean (or null)', i)
                        def parseNumber(i):
                                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                                if mobj is None:
                                        raiseError('Not a number', i)
                                nums = mobj.group(1)
                                # A fractional part or exponent makes it a float.
                                if '.' in nums or 'e' in nums or 'E' in nums:
                                        return (i+len(nums), float(nums))
                                return (i+len(nums), int(nums))
                        CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
                        def parse(i):
                                # Dispatch on the first significant character;
                                # anything unrecognised is tried as a number.
                                i = skipSpace(i)
                                i,res = CHARMAP.get(s[i], parseNumber)(i)
                                i = skipSpace(i, False)
                                return (i,res)
                        i,res = parse(0)
                        if i < len(s):
                                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
                        return res
def preferredencoding():
        """Get preferred encoding.

        Returns the best encoding scheme for the system, based on
        locale.getpreferredencoding() and some further tweaks.
        """
        # The original implementation built an infinite generator only to take
        # its first element; a plain try/except computes the same value.
        try:
                pref = locale.getpreferredencoding()
                # Verify that the codec exists and can actually encode text;
                # fall back to UTF-8 on any failure (bogus locale, missing codec).
                u'TEST'.encode(pref)
        except Exception:
                pref = 'UTF-8'
        return pref
210
211
212 def htmlentity_transform(matchobj):
213         """Transforms an HTML entity to a Unicode character.
214
215         This function receives a match object and is intended to be used with
216         the re.sub() function.
217         """
218         entity = matchobj.group(1)
219
220         # Known non-numeric HTML entity
221         if entity in htmlentitydefs.name2codepoint:
222                 return unichr(htmlentitydefs.name2codepoint[entity])
223
224         # Unicode character
225         mobj = re.match(ur'(?u)#(x?\d+)', entity)
226         if mobj is not None:
227                 numstr = mobj.group(1)
228                 if numstr.startswith(u'x'):
229                         base = 16
230                         numstr = u'0%s' % numstr
231                 else:
232                         base = 10
233                 return unichr(long(numstr, base))
234
235         # Unknown entity in name, return its literal representation
236         return (u'&%s;' % entity)
237
238
239 def sanitize_title(utitle):
240         """Sanitizes a video title so it could be used as part of a filename."""
241         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
242         return utitle.replace(unicode(os.sep), u'%')
243
244
def sanitize_open(filename, open_mode):
        """Try to open the given filename, and slightly tweak it if this fails.

        Attempts to open the given filename. If this fails, it tries to change
        the filename slightly, step by step, until it's either able to open it
        or it fails and raises a final exception, like the standard open()
        function.

        It returns the tuple (stream, definitive_file_name).
        """
        try:
                if filename == u'-':
                        # u'-' means standard output. On Windows, stdout must be
                        # switched to binary mode or written data gets mangled.
                        if sys.platform == 'win32':
                                import msvcrt
                                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
                        return (sys.stdout, filename)
                stream = open(filename, open_mode)
                return (stream, filename)
        except (IOError, OSError), err:
                # In case of error, try to remove win32 forbidden chars
                filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

                # An exception here should be caught in the caller
                stream = open(filename, open_mode)
                return (stream, filename)
270
271
def timeconvert(timestr):
        """Convert RFC 2822 defined time string into system timestamp"""
        parsed = email.utils.parsedate_tz(timestr)
        if parsed is None:
                # Unparseable date string.
                return None
        return email.utils.mktime_tz(parsed)
279
280
class DownloadError(Exception):
        """Download Error exception.

        Raised by FileDownloader objects when they are not configured to
        continue on errors; carries the appropriate error message.
        """
289
290
class SameFileError(Exception):
        """Same File exception.

        Raised by FileDownloader objects when they detect that multiple
        files would have to be downloaded to the same file on disk.
        """
298
299
class PostProcessingError(Exception):
        """Post Processing exception.

        Raised from a PostProcessor's .run() method to signal an error
        in the postprocessing task.
        """
307
308
class UnavailableVideoError(Exception):
        """Unavailable Format exception.

        Raised when a video is requested in a format that is not
        available for that video.
        """
316
317
class ContentTooShortError(Exception):
        """Content Too Short exception.

        Raised by FileDownloader objects when a downloaded file is smaller
        than what the server announced, indicating the connection was
        probably interrupted.
        """
        # Byte counts: what was actually received vs. what the server promised.
        downloaded = None
        expected = None

        def __init__(self, downloaded, expected):
                self.downloaded = downloaded
                self.expected = expected
333
class YoutubeDLHandler(urllib2.HTTPHandler):
        """Handler for HTTP requests and responses.

        This class, when installed with an OpenerDirector, automatically adds
        the standard headers to every HTTP request and handles gzipped and
        deflated responses from web servers. If compression is to be avoided in
        a particular request, the original request in the program code only has
        to include the HTTP header "Youtubedl-No-Compression", which will be
        removed before making the real request.

        Part of this code was copied from:

        http://techknack.net/python-urllib2-handlers/

        Andrew Rowls, the author of that code, agreed to release it to the
        public domain.
        """

        @staticmethod
        def deflate(data):
                # Try a raw deflate stream first, then fall back to
                # zlib-wrapped data.
                try:
                        return zlib.decompress(data, -zlib.MAX_WBITS)
                except zlib.error:
                        return zlib.decompress(data)

        @staticmethod
        def addinfourl_wrapper(stream, headers, url, code):
                # Newer Pythons accept the response code in the constructor;
                # older ones need it assigned after construction.
                if hasattr(urllib2.addinfourl, 'getcode'):
                        return urllib2.addinfourl(stream, headers, url, code)
                ret = urllib2.addinfourl(stream, headers, url)
                ret.code = code
                return ret

        def http_request(self, req):
                # Force our standard headers, replacing any already present.
                for h in std_headers:
                        if h in req.headers:
                                del req.headers[h]
                        req.add_header(h, std_headers[h])
                # NOTE(review): header names here use the capitalization that
                # urllib2 applies internally ('Youtubedl-no-compression'), not
                # the form in the docstring — presumably intentional; confirm.
                if 'Youtubedl-no-compression' in req.headers:
                        if 'Accept-encoding' in req.headers:
                                del req.headers['Accept-encoding']
                        del req.headers['Youtubedl-no-compression']
                return req

        def http_response(self, req, resp):
                # Transparently decompress the body, preserving the original
                # headers, URL, status code and message.
                old_resp = resp
                # gzip
                if resp.headers.get('Content-encoding', '') == 'gzip':
                        gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
                        resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
                        resp.msg = old_resp.msg
                # deflate
                if resp.headers.get('Content-encoding', '') == 'deflate':
                        gz = StringIO.StringIO(self.deflate(resp.read()))
                        resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
                        resp.msg = old_resp.msg
                return resp
391
392
class FileDownloader(object):
        """File Downloader class.

        File downloader objects are the ones responsible of downloading the
        actual video file and writing it to disk if the user has requested
        it, among some other tasks. In most cases there should be one per
        program. As, given a video URL, the downloader doesn't know how to
        extract all the needed information, task that InfoExtractors do, it
        has to pass the URL to one of them.

        For this, file downloader objects have a method that allows
        InfoExtractors to be registered in a given order. When it is passed
        a URL, the file downloader handles it to the first InfoExtractor it
        finds that reports being able to handle it. The InfoExtractor extracts
        all the information about the video or videos the URL refers to, and
        asks the FileDownloader to process the video information, possibly
        downloading the video.

        File downloaders accept a lot of parameters. In order not to saturate
        the object constructor with arguments, it receives a dictionary of
        options instead. These options are available through the params
        attribute for the InfoExtractors to use. The FileDownloader also
        registers itself as the downloader in charge for the InfoExtractors
        that are added to it, so this is a "mutual registration".

        Available options:

        username:         Username for authentication purposes.
        password:         Password for authentication purposes.
        usenetrc:         Use netrc for authentication instead.
        quiet:            Do not print messages to stdout.
        forceurl:         Force printing final URL.
        forcetitle:       Force printing title.
        forcethumbnail:   Force printing thumbnail URL.
        forcedescription: Force printing description.
        forcefilename:    Force printing final filename.
        simulate:         Do not download the video files.
        format:           Video format code.
        format_limit:     Highest quality format to try.
        outtmpl:          Template for output names.
        ignoreerrors:     Do not stop on download errors.
        ratelimit:        Download speed limit, in bytes/sec.
        nooverwrites:     Prevent overwriting files.
        retries:          Number of times to retry for HTTP error 5xx
        continuedl:       Try to continue downloads if possible.
        noprogress:       Do not print the progress bar.
        playliststart:    Playlist item to start at.
        playlistend:      Playlist item to end at.
        matchtitle:       Download only matching titles.
        rejecttitle:      Reject downloads for matching titles.
        logtostderr:      Log messages to stderr instead of stdout.
        consoletitle:     Display progress in console window's titlebar.
        nopart:           Do not use temporary .part files.
        updatetime:       Use the Last-modified header to set output file timestamps.
        writedescription: Write the video description to a .description file
        writeinfojson:    Write the video description to a .info.json file
        """

        # Class-level defaults; the real values are assigned in __init__.
        params = None                   # Option dictionary (see docstring above)
        _ies = []                       # Registered InfoExtractor objects
        _pps = []                       # Registered PostProcessor objects
        _download_retcode = None        # Process exit code to report
        _num_downloads = None           # Ordinal of the current download
        _screen_file = None             # Stream used by to_screen()
457
458         def __init__(self, params):
459                 """Create a FileDownloader object with the given options."""
460                 self._ies = []
461                 self._pps = []
462                 self._download_retcode = 0
463                 self._num_downloads = 0
464                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
465                 self.params = params
466
467         @staticmethod
468         def format_bytes(bytes):
469                 if bytes is None:
470                         return 'N/A'
471                 if type(bytes) is str:
472                         bytes = float(bytes)
473                 if bytes == 0.0:
474                         exponent = 0
475                 else:
476                         exponent = long(math.log(bytes, 1024.0))
477                 suffix = 'bkMGTPEZY'[exponent]
478                 converted = float(bytes) / float(1024 ** exponent)
479                 return '%.2f%s' % (converted, suffix)
480
481         @staticmethod
482         def calc_percent(byte_counter, data_len):
483                 if data_len is None:
484                         return '---.-%'
485                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
486
487         @staticmethod
488         def calc_eta(start, now, total, current):
489                 if total is None:
490                         return '--:--'
491                 dif = now - start
492                 if current == 0 or dif < 0.001: # One millisecond
493                         return '--:--'
494                 rate = float(current) / dif
495                 eta = long((float(total) - float(current)) / rate)
496                 (eta_mins, eta_secs) = divmod(eta, 60)
497                 if eta_mins > 99:
498                         return '--:--'
499                 return '%02d:%02d' % (eta_mins, eta_secs)
500
501         @staticmethod
502         def calc_speed(start, now, bytes):
503                 dif = now - start
504                 if bytes == 0 or dif < 0.001: # One millisecond
505                         return '%10s' % '---b/s'
506                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
507
508         @staticmethod
509         def best_block_size(elapsed_time, bytes):
510                 new_min = max(bytes / 2.0, 1.0)
511                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
512                 if elapsed_time < 0.001:
513                         return long(new_max)
514                 rate = bytes / elapsed_time
515                 if rate > new_max:
516                         return long(new_max)
517                 if rate < new_min:
518                         return long(new_min)
519                 return long(rate)
520
521         @staticmethod
522         def parse_bytes(bytestr):
523                 """Parse a string indicating a byte quantity into a long integer."""
524                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
525                 if matchobj is None:
526                         return None
527                 number = float(matchobj.group(1))
528                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
529                 return long(round(number * multiplier))
530
531         def add_info_extractor(self, ie):
532                 """Add an InfoExtractor object to the end of the list."""
533                 self._ies.append(ie)
534                 ie.set_downloader(self)
535
536         def add_post_processor(self, pp):
537                 """Add a PostProcessor object to the end of the chain."""
538                 self._pps.append(pp)
539                 pp.set_downloader(self)
540
        def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
                """Print message to stdout if not in quiet mode."""
                try:
                        if not self.params.get('quiet', False):
                                # skip_eol picks an empty terminator; the trailing
                                # comma on the print suppresses the automatic newline.
                                terminator = [u'\n', u''][skip_eol]
                                print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
                        # Flush even in quiet mode so earlier output is not held back.
                        self._screen_file.flush()
                except (UnicodeEncodeError), err:
                        if not ignore_encoding_errors:
                                raise
551
        def to_stderr(self, message):
                """Print message to stderr."""
                # Encode with the locale's preferred encoding; stderr may not accept
                # raw unicode on all platforms.
                print >>sys.stderr, message.encode(preferredencoding())
555
        def to_cons_title(self, message):
                """Set console/terminal window title to message."""
                if not self.params.get('consoletitle', False):
                        return
                if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
                        # c_wchar_p() might not be necessary if `message` is
                        # already of type unicode()
                        ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
                elif 'TERM' in os.environ:
                        # xterm-style title escape: OSC 0 ; <title> BEL
                        sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
566
567         def fixed_template(self):
568                 """Checks if the output template is fixed."""
569                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
570
        def trouble(self, message=None):
                """Determine action to take when a download problem appears.

                Depending on if the downloader has been configured to ignore
                download errors or not, this method may throw an exception or
                not when errors are found, after printing the message.
                """
                if message is not None:
                        self.to_stderr(message)
                if not self.params.get('ignoreerrors', False):
                        raise DownloadError(message)
                # Only reached when errors are ignored: remember the failure in
                # the process exit code.
                self._download_retcode = 1
583
584         def slow_down(self, start_time, byte_counter):
585                 """Sleep if the download speed is over the rate limit."""
586                 rate_limit = self.params.get('ratelimit', None)
587                 if rate_limit is None or byte_counter == 0:
588                         return
589                 now = time.time()
590                 elapsed = now - start_time
591                 if elapsed <= 0.0:
592                         return
593                 speed = float(byte_counter) / elapsed
594                 if speed > rate_limit:
595                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
596
597         def temp_name(self, filename):
598                 """Returns a temporary filename for the given filename."""
599                 if self.params.get('nopart', False) or filename == u'-' or \
600                                 (os.path.exists(filename) and not os.path.isfile(filename)):
601                         return filename
602                 return filename + u'.part'
603
604         def undo_temp_name(self, filename):
605                 if filename.endswith(u'.part'):
606                         return filename[:-len(u'.part')]
607                 return filename
608
609         def try_rename(self, old_filename, new_filename):
610                 try:
611                         if old_filename == new_filename:
612                                 return
613                         os.rename(old_filename, new_filename)
614                 except (IOError, OSError), err:
615                         self.trouble(u'ERROR: unable to rename file')
616
617         def try_utime(self, filename, last_modified_hdr):
618                 """Try to set the last-modified time of the given file."""
619                 if last_modified_hdr is None:
620                         return
621                 if not os.path.isfile(filename):
622                         return
623                 timestr = last_modified_hdr
624                 if timestr is None:
625                         return
626                 filetime = timeconvert(timestr)
627                 if filetime is None:
628                         return filetime
629                 try:
630                         os.utime(filename, (time.time(), filetime))
631                 except:
632                         pass
633                 return filetime
634
635         def report_writedescription(self, descfn):
636                 """ Report that the description file is being written """
637                 self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
638
639         def report_writeinfojson(self, infofn):
640                 """ Report that the metadata file has been written """
641                 self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
642
643         def report_destination(self, filename):
644                 """Report destination filename."""
645                 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
646
647         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
648                 """Report download progress."""
649                 if self.params.get('noprogress', False):
650                         return
651                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
652                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
653                 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
654                                 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
655
656         def report_resuming_byte(self, resume_len):
657                 """Report attempt to resume at given byte."""
658                 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
659
660         def report_retry(self, count, retries):
661                 """Report retry in case of HTTP error 5xx"""
662                 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
663
664         def report_file_already_downloaded(self, file_name):
665                 """Report file has already been fully downloaded."""
666                 try:
667                         self.to_screen(u'[download] %s has already been downloaded' % file_name)
668                 except (UnicodeEncodeError), err:
669                         self.to_screen(u'[download] The file has already been downloaded')
670
671         def report_unable_to_resume(self):
672                 """Report it was impossible to resume download."""
673                 self.to_screen(u'[download] Unable to resume')
674
675         def report_finish(self):
676                 """Report download finished."""
677                 if self.params.get('noprogress', False):
678                         self.to_screen(u'[download] Download completed')
679                 else:
680                         self.to_screen(u'')
681
682         def increment_downloads(self):
683                 """Increment the ordinal that assigns a number to each file."""
684                 self._num_downloads += 1
685
686         def prepare_filename(self, info_dict):
687                 """Generate the output filename."""
688                 try:
689                         template_dict = dict(info_dict)
690                         template_dict['epoch'] = unicode(long(time.time()))
691                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
692                         filename = self.params['outtmpl'] % template_dict
693                         return filename
694                 except (ValueError, KeyError), err:
695                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
696                         return None
697
	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor.

		Handles forced printing options, title match/reject filtering,
		writing of the description / info-JSON sidecar files, the
		download itself and the postprocessing chain. Errors are
		reported via self.trouble(); the method returns None.
		"""
		filename = self.prepare_filename(info_dict)

		# Forced printings
		if self.params.get('forcetitle', False):
			print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forceurl', False):
			print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
			print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcedescription', False) and 'description' in info_dict:
			print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcefilename', False) and filename is not None:
			print filename.encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forceformat', False):
			print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')

		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			return

		# prepare_filename() already reported the error in this case
		if filename is None:
			return

		# Title-based filtering (--match-title / --reject-title)
		matchtitle=self.params.get('matchtitle',False)
		rejecttitle=self.params.get('rejecttitle',False)
		title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
		if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
			self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
			return
		if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
			self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
			return

		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists and will be skipped')
			return

		# Create the target directory if needed
		try:
			dn = os.path.dirname(filename)
			if dn != '' and not os.path.exists(dn):
				os.makedirs(dn)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to create directory ' + unicode(err))
			return

		# Optional sidecar file with the video description
		# NOTE(review): assumes 'description' is present in info_dict when
		# --write-description is used; a missing key would raise KeyError here.
		if self.params.get('writedescription', False):
			try:
				descfn = filename + '.description'
				self.report_writedescription(descfn)
				descfile = open(descfn, 'wb')
				try:
					descfile.write(info_dict['description'].encode('utf-8'))
				finally:
					descfile.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write description file ' + descfn)
				return

		# Optional sidecar file with the metadata as JSON
		if self.params.get('writeinfojson', False):
			infofn = filename + '.info.json'
			self.report_writeinfojson(infofn)
			try:
				# Probe for a usable json module (may be missing on old Pythons)
				json.dump
			except (NameError,AttributeError):
				self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
				return
			try:
				infof = open(infofn, 'wb')
				try:
					# 'urlhandle' is a live connection object, not serializable
					json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
					json.dump(json_info_dict, infof)
				finally:
					infof.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
				return

		if not self.params.get('skip_download', False):
			try:
				success = self._do_download(filename, info_dict)
			except (OSError, IOError), err:
				raise UnavailableVideoError
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self.trouble(u'ERROR: unable to download video data: %s' % str(err))
				return
			except (ContentTooShortError, ), err:
				self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
				return

			if success:
				try:
					self.post_process(filename, info_dict)
				except (PostProcessingError), err:
					self.trouble(u'ERROR: postprocessing: %s' % str(err))
					return
795
796         def download(self, url_list):
797                 """Download a given list of URLs."""
798                 if len(url_list) > 1 and self.fixed_template():
799                         raise SameFileError(self.params['outtmpl'])
800
801                 for url in url_list:
802                         suitable_found = False
803                         for ie in self._ies:
804                                 # Go to next InfoExtractor if not suitable
805                                 if not ie.suitable(url):
806                                         continue
807
808                                 # Suitable InfoExtractor found
809                                 suitable_found = True
810
811                                 # Extract information from URL and process it
812                                 ie.extract(url)
813
814                                 # Suitable InfoExtractor had been found; go to next URL
815                                 break
816
817                         if not suitable_found:
818                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
819
820                 return self._download_retcode
821
822         def post_process(self, filename, ie_info):
823                 """Run the postprocessing chain on the given file."""
824                 info = dict(ie_info)
825                 info['filepath'] = filename
826                 for pp in self._pps:
827                         info = pp.run(info)
828                         if info is None:
829                                 break
830
	def _download_with_rtmpdump(self, filename, url, player_url):
		"""Download an rtmp:// URL by shelling out to rtmpdump.

		Writes to a temporary file which is renamed to `filename` on
		success. Returns True on success, False otherwise.
		"""
		self.report_destination(filename)
		tmpfilename = self.temp_name(filename)

		# Check for rtmpdump first
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrumpted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		# (The [[], [...]][bool] construct indexes a two-element list with a
		# boolean to optionally append arguments.)
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(tmpfilename)
			self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			# Resume with -e; pass '-k 1' again when the last run exited with 1
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(tmpfilename)
			if prevsize == cursize and retval == 1:
				# No progress was made and rtmpdump failed: give up
				break
			# Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
			if prevsize == cursize and retval == 2 and cursize > 1024:
				self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
				retval = 0
				break
		if retval == 0:
			self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
			self.try_rename(tmpfilename, filename)
			return True
		else:
			self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
			return False
867
868         def _do_download(self, filename, info_dict):
869                 url = info_dict['url']
870                 player_url = info_dict.get('player_url', None)
871
872                 # Check file already present
873                 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
874                         self.report_file_already_downloaded(filename)
875                         return True
876
877                 # Attempt to download using rtmpdump
878                 if url.startswith('rtmp'):
879                         return self._download_with_rtmpdump(filename, url, player_url)
880
881                 tmpfilename = self.temp_name(filename)
882                 stream = None
883
884                 # Do not include the Accept-Encoding header
885                 headers = {'Youtubedl-no-compression': 'True'}
886                 basic_request = urllib2.Request(url, None, headers)
887                 request = urllib2.Request(url, None, headers)
888
889                 # Establish possible resume length
890                 if os.path.isfile(tmpfilename):
891                         resume_len = os.path.getsize(tmpfilename)
892                 else:
893                         resume_len = 0
894
895                 open_mode = 'wb'
896                 if resume_len != 0:
897                         if self.params.get('continuedl', False):
898                                 self.report_resuming_byte(resume_len)
899                                 request.add_header('Range','bytes=%d-' % resume_len)
900                                 open_mode = 'ab'
901                         else:
902                                 resume_len = 0
903
904                 count = 0
905                 retries = self.params.get('retries', 0)
906                 while count <= retries:
907                         # Establish connection
908                         try:
909                                 if count == 0 and 'urlhandle' in info_dict:
910                                         data = info_dict['urlhandle']
911                                 data = urllib2.urlopen(request)
912                                 break
913                         except (urllib2.HTTPError, ), err:
914                                 if (err.code < 500 or err.code >= 600) and err.code != 416:
915                                         # Unexpected HTTP error
916                                         raise
917                                 elif err.code == 416:
918                                         # Unable to resume (requested range not satisfiable)
919                                         try:
920                                                 # Open the connection again without the range header
921                                                 data = urllib2.urlopen(basic_request)
922                                                 content_length = data.info()['Content-Length']
923                                         except (urllib2.HTTPError, ), err:
924                                                 if err.code < 500 or err.code >= 600:
925                                                         raise
926                                         else:
927                                                 # Examine the reported length
928                                                 if (content_length is not None and
929                                                                 (resume_len - 100 < long(content_length) < resume_len + 100)):
930                                                         # The file had already been fully downloaded.
931                                                         # Explanation to the above condition: in issue #175 it was revealed that
932                                                         # YouTube sometimes adds or removes a few bytes from the end of the file,
933                                                         # changing the file size slightly and causing problems for some users. So
934                                                         # I decided to implement a suggested change and consider the file
935                                                         # completely downloaded if the file size differs less than 100 bytes from
936                                                         # the one in the hard drive.
937                                                         self.report_file_already_downloaded(filename)
938                                                         self.try_rename(tmpfilename, filename)
939                                                         return True
940                                                 else:
941                                                         # The length does not match, we start the download over
942                                                         self.report_unable_to_resume()
943                                                         open_mode = 'wb'
944                                                         break
945                         # Retry
946                         count += 1
947                         if count <= retries:
948                                 self.report_retry(count, retries)
949
950                 if count > retries:
951                         self.trouble(u'ERROR: giving up after %s retries' % retries)
952                         return False
953
954                 data_len = data.info().get('Content-length', None)
955                 if data_len is not None:
956                         data_len = long(data_len) + resume_len
957                 data_len_str = self.format_bytes(data_len)
958                 byte_counter = 0 + resume_len
959                 block_size = 1024
960                 start = time.time()
961                 while True:
962                         # Download and write
963                         before = time.time()
964                         data_block = data.read(block_size)
965                         after = time.time()
966                         if len(data_block) == 0:
967                                 break
968                         byte_counter += len(data_block)
969
970                         # Open file just in time
971                         if stream is None:
972                                 try:
973                                         (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
974                                         assert stream is not None
975                                         filename = self.undo_temp_name(tmpfilename)
976                                         self.report_destination(filename)
977                                 except (OSError, IOError), err:
978                                         self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
979                                         return False
980                         try:
981                                 stream.write(data_block)
982                         except (IOError, OSError), err:
983                                 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
984                                 return False
985                         block_size = self.best_block_size(after - before, len(data_block))
986
987                         # Progress message
988                         speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
989                         if data_len is None:
990                                 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
991                         else:
992                                 percent_str = self.calc_percent(byte_counter, data_len)
993                                 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
994                                 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
995
996                         # Apply rate limit
997                         self.slow_down(start, byte_counter - resume_len)
998
999                 if stream is None:
1000                         self.trouble(u'\nERROR: Did not get any data blocks')
1001                         return False
1002                 stream.close()
1003                 self.report_finish()
1004                 if data_len is not None and byte_counter != data_len:
1005                         raise ContentTooShortError(byte_counter, long(data_len))
1006                 self.try_rename(tmpfilename, filename)
1007
1008                 # Update file modification time
1009                 if self.params.get('updatetime', True):
1010                         info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1011
1012                 return True
1013
1014
class InfoExtractor(object):
	"""Base class for all information extractors.

	An information extractor turns a URL into one or more dictionaries
	describing the videos behind it; the FileDownloader then acts on
	those dictionaries, typically by downloading the video. Each
	dictionary must include the following fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	The following fields are optional; their primary purpose is to let
	youtube-dl serve as the backend for a video search function (such
	as the one in youtube2mp3). They are only used when the respective
	forced printing functions are called:

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Subclasses should redefine _real_initialize() and _real_extract(),
	define a _VALID_URL regexp, and usually be added to the list of
	extractors.
	"""

	_ready = False       # set to True once _real_initialize() has run
	_downloader = None   # FileDownloader attached via set_downloader()

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Return True if this IE can handle the given URL."""
		return re.match(self._VALID_URL, url) is not None

	def initialize(self):
		"""Perform one-time initialization (authentication, etc)."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extract URL information and return it as a list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Attach the FileDownloader this IE reports to."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
1083
1084
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Matches watch pages, youtu.be short links, embed/v URLs and bare
	# video IDs; group 2 captures the video ID.
	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
	# Visiting this URL switches the site language to English so pages
	# can be parsed reliably.
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	# Machine name used to look up credentials in ~/.netrc
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
	# Maps itag format codes to output file extensions
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'44': 'webm',
		'45': 'webm',
	}
	IE_NAME = u'youtube'
1107
1108         def report_lang(self):
1109                 """Report attempt to set language."""
1110                 self._downloader.to_screen(u'[youtube] Setting language')
1111
1112         def report_login(self):
1113                 """Report attempt to log in."""
1114                 self._downloader.to_screen(u'[youtube] Logging in')
1115
1116         def report_age_confirmation(self):
1117                 """Report attempt to confirm age."""
1118                 self._downloader.to_screen(u'[youtube] Confirming age')
1119
1120         def report_video_webpage_download(self, video_id):
1121                 """Report attempt to download video webpage."""
1122                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1123
1124         def report_video_info_webpage_download(self, video_id):
1125                 """Report attempt to download video info webpage."""
1126                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1127
1128         def report_information_extraction(self, video_id):
1129                 """Report attempt to extract video information."""
1130                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1131
	def report_unavailable_format(self, video_id, format):
		"""Report that the requested format is not available for this video."""
		self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1135
1136         def report_rtmp_download(self):
1137                 """Indicate the download will use the RTMP protocol."""
1138                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1139
	def _real_initialize(self):
		"""Set the YouTube language to English and optionally log in and confirm age.

		Credentials come from the downloader params or from ~/.netrc.
		Network failures are reported as warnings (or an error for the
		age confirmation) and abort initialization early.
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language first so the login/age pages come back in English
		# and can be parsed reliably
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':		'/',
				'action_login':	'Log In',
				'username':	username,
				'password':	password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the login form is still present, authentication failed
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':		'/',
				'action_confirm':	'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return
1208
1209         def _real_extract(self, url):
1210                 # Extract video id from URL
1211                 mobj = re.match(self._VALID_URL, url)
1212                 if mobj is None:
1213                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1214                         return
1215                 video_id = mobj.group(2)
1216
1217                 # Get video webpage
1218                 self.report_video_webpage_download(video_id)
1219                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id)
1220                 try:
1221                         video_webpage = urllib2.urlopen(request).read()
1222                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1223                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1224                         return
1225
1226                 # Attempt to extract SWF player URL
1227                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1228                 if mobj is not None:
1229                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1230                 else:
1231                         player_url = None
1232
1233                 # Get video info
1234                 self.report_video_info_webpage_download(video_id)
1235                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1236                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1237                                         % (video_id, el_type))
1238                         request = urllib2.Request(video_info_url)
1239                         try:
1240                                 video_info_webpage = urllib2.urlopen(request).read()
1241                                 video_info = parse_qs(video_info_webpage)
1242                                 if 'token' in video_info:
1243                                         break
1244                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1245                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1246                                 return
1247                 if 'token' not in video_info:
1248                         if 'reason' in video_info:
1249                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1250                         else:
1251                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1252                         return
1253
1254                 # Start extracting information
1255                 self.report_information_extraction(video_id)
1256
1257                 # uploader
1258                 if 'author' not in video_info:
1259                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1260                         return
1261                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1262
1263                 # title
1264                 if 'title' not in video_info:
1265                         self._downloader.trouble(u'ERROR: unable to extract video title')
1266                         return
1267                 video_title = urllib.unquote_plus(video_info['title'][0])
1268                 video_title = video_title.decode('utf-8')
1269                 video_title = sanitize_title(video_title)
1270
1271                 # simplified title
1272                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1273                 simple_title = simple_title.strip(ur'_')
1274
1275                 # thumbnail image
1276                 if 'thumbnail_url' not in video_info:
1277                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1278                         video_thumbnail = ''
1279                 else:   # don't panic if we can't find it
1280                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1281
1282                 # upload date
1283                 upload_date = u'NA'
1284                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1285                 if mobj is not None:
1286                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1287                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1288                         for expression in format_expressions:
1289                                 try:
1290                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1291                                 except:
1292                                         pass
1293
1294                 # description
1295                 try:
1296                         lxml.etree
1297                 except NameError:
1298                         video_description = u'No description available.'
1299                         if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1300                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1301                                 if mobj is not None:
1302                                         video_description = mobj.group(1).decode('utf-8')
1303                 else:
1304                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1305                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1306                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1307                         # TODO use another parser
1308
1309                 # token
1310                 video_token = urllib.unquote_plus(video_info['token'][0])
1311
1312                 # Decide which formats to download
1313                 req_format = self._downloader.params.get('format', None)
1314
1315                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1316                         self.report_rtmp_download()
1317                         video_url_list = [(None, video_info['conn'][0])]
1318                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1319                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1320                         url_data = [parse_qs(uds) for uds in url_data_strs]
1321                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1322                         url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1323
1324                         format_limit = self._downloader.params.get('format_limit', None)
1325                         if format_limit is not None and format_limit in self._available_formats:
1326                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1327                         else:
1328                                 format_list = self._available_formats
1329                         existing_formats = [x for x in format_list if x in url_map]
1330                         if len(existing_formats) == 0:
1331                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1332                                 return
1333                         if req_format is None or req_format == 'best':
1334                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1335                         elif req_format == 'worst':
1336                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1337                         elif req_format in ('-1', 'all'):
1338                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1339                         else:
1340                                 # Specific formats. We pick the first in a slash-delimeted sequence.
1341                                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1342                                 req_formats = req_format.split('/')
1343                                 video_url_list = None
1344                                 for rf in req_formats:
1345                                         if rf in url_map:
1346                                                 video_url_list = [(rf, url_map[rf])]
1347                                                 break
1348                                 if video_url_list is None:
1349                                         self._downloader.trouble(u'ERROR: requested format not available')
1350                                         return
1351                 else:
1352                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1353                         return
1354
1355                 for format_param, video_real_url in video_url_list:
1356                         # At this point we have a new video
1357                         self._downloader.increment_downloads()
1358
1359                         # Extension
1360                         video_extension = self._video_extensions.get(format_param, 'flv')
1361
1362                         try:
1363                                 # Process video information
1364                                 self._downloader.process_info({
1365                                         'id':           video_id.decode('utf-8'),
1366                                         'url':          video_real_url.decode('utf-8'),
1367                                         'uploader':     video_uploader.decode('utf-8'),
1368                                         'upload_date':  upload_date,
1369                                         'title':        video_title,
1370                                         'stitle':       simple_title,
1371                                         'ext':          video_extension.decode('utf-8'),
1372                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1373                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1374                                         'description':  video_description,
1375                                         'player_url':   player_url,
1376                                 })
1377                         except UnavailableVideoError, err:
1378                                 self._downloader.trouble(u'\nERROR: unable to download video')
1379
1380
1381 class MetacafeIE(InfoExtractor):
1382         """Information Extractor for metacafe.com."""
1383
1384         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1385         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1386         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1387         _youtube_ie = None
1388         IE_NAME = u'metacafe'
1389
1390         def __init__(self, youtube_ie, downloader=None):
1391                 InfoExtractor.__init__(self, downloader)
1392                 self._youtube_ie = youtube_ie
1393
1394         def report_disclaimer(self):
1395                 """Report disclaimer retrieval."""
1396                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1397
1398         def report_age_confirmation(self):
1399                 """Report attempt to confirm age."""
1400                 self._downloader.to_screen(u'[metacafe] Confirming age')
1401
1402         def report_download_webpage(self, video_id):
1403                 """Report webpage download."""
1404                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1405
1406         def report_extraction(self, video_id):
1407                 """Report information extraction."""
1408                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1409
1410         def _real_initialize(self):
1411                 # Retrieve disclaimer
1412                 request = urllib2.Request(self._DISCLAIMER)
1413                 try:
1414                         self.report_disclaimer()
1415                         disclaimer = urllib2.urlopen(request).read()
1416                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1417                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1418                         return
1419
1420                 # Confirm age
1421                 disclaimer_form = {
1422                         'filters': '0',
1423                         'submit': "Continue - I'm over 18",
1424                         }
1425                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1426                 try:
1427                         self.report_age_confirmation()
1428                         disclaimer = urllib2.urlopen(request).read()
1429                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1430                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1431                         return
1432
1433         def _real_extract(self, url):
1434                 # Extract id and simplified title from URL
1435                 mobj = re.match(self._VALID_URL, url)
1436                 if mobj is None:
1437                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1438                         return
1439
1440                 video_id = mobj.group(1)
1441
1442                 # Check if video comes from YouTube
1443                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1444                 if mobj2 is not None:
1445                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1446                         return
1447
1448                 # At this point we have a new video
1449                 self._downloader.increment_downloads()
1450
1451                 simple_title = mobj.group(2).decode('utf-8')
1452
1453                 # Retrieve video webpage to extract further information
1454                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1455                 try:
1456                         self.report_download_webpage(video_id)
1457                         webpage = urllib2.urlopen(request).read()
1458                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1459                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1460                         return
1461
1462                 # Extract URL, uploader and title from webpage
1463                 self.report_extraction(video_id)
1464                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1465                 if mobj is not None:
1466                         mediaURL = urllib.unquote(mobj.group(1))
1467                         video_extension = mediaURL[-3:]
1468
1469                         # Extract gdaKey if available
1470                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1471                         if mobj is None:
1472                                 video_url = mediaURL
1473                         else:
1474                                 gdaKey = mobj.group(1)
1475                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1476                 else:
1477                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1478                         if mobj is None:
1479                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1480                                 return
1481                         vardict = parse_qs(mobj.group(1))
1482                         if 'mediaData' not in vardict:
1483                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1484                                 return
1485                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1486                         if mobj is None:
1487                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1488                                 return
1489                         mediaURL = mobj.group(1).replace('\\/', '/')
1490                         video_extension = mediaURL[-3:]
1491                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1492
1493                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1494                 if mobj is None:
1495                         self._downloader.trouble(u'ERROR: unable to extract title')
1496                         return
1497                 video_title = mobj.group(1).decode('utf-8')
1498                 video_title = sanitize_title(video_title)
1499
1500                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1501                 if mobj is None:
1502                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1503                         return
1504                 video_uploader = mobj.group(1)
1505
1506                 try:
1507                         # Process video information
1508                         self._downloader.process_info({
1509                                 'id':           video_id.decode('utf-8'),
1510                                 'url':          video_url.decode('utf-8'),
1511                                 'uploader':     video_uploader.decode('utf-8'),
1512                                 'upload_date':  u'NA',
1513                                 'title':        video_title,
1514                                 'stitle':       simple_title,
1515                                 'ext':          video_extension.decode('utf-8'),
1516                                 'format':       u'NA',
1517                                 'player_url':   None,
1518                         })
1519                 except UnavailableVideoError:
1520                         self._downloader.trouble(u'\nERROR: unable to download video')
1521
1522
1523 class DailymotionIE(InfoExtractor):
1524         """Information Extractor for Dailymotion"""
1525
1526         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1527         IE_NAME = u'dailymotion'
1528
1529         def __init__(self, downloader=None):
1530                 InfoExtractor.__init__(self, downloader)
1531
1532         def report_download_webpage(self, video_id):
1533                 """Report webpage download."""
1534                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1535
1536         def report_extraction(self, video_id):
1537                 """Report information extraction."""
1538                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1539
1540         def _real_initialize(self):
1541                 return
1542
1543         def _real_extract(self, url):
1544                 # Extract id and simplified title from URL
1545                 mobj = re.match(self._VALID_URL, url)
1546                 if mobj is None:
1547                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1548                         return
1549
1550                 # At this point we have a new video
1551                 self._downloader.increment_downloads()
1552                 video_id = mobj.group(1)
1553
1554                 simple_title = mobj.group(2).decode('utf-8')
1555                 video_extension = 'flv'
1556
1557                 # Retrieve video webpage to extract further information
1558                 request = urllib2.Request(url)
1559                 request.add_header('Cookie', 'family_filter=off')
1560                 try:
1561                         self.report_download_webpage(video_id)
1562                         webpage = urllib2.urlopen(request).read()
1563                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1564                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1565                         return
1566
1567                 # Extract URL, uploader and title from webpage
1568                 self.report_extraction(video_id)
1569                 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1570                 if mobj is None:
1571                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1572                         return
1573                 sequence = urllib.unquote(mobj.group(1))
1574                 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1575                 if mobj is None:
1576                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1577                         return
1578                 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1579
1580                 # if needed add http://www.dailymotion.com/ if relative URL
1581
1582                 video_url = mediaURL
1583
1584                 mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
1585                 if mobj is None:
1586                         self._downloader.trouble(u'ERROR: unable to extract title')
1587                         return
1588                 video_title = mobj.group(1).decode('utf-8')
1589                 video_title = sanitize_title(video_title)
1590
1591                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1592                 if mobj is None:
1593                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1594                         return
1595                 video_uploader = mobj.group(1)
1596
1597                 try:
1598                         # Process video information
1599                         self._downloader.process_info({
1600                                 'id':           video_id.decode('utf-8'),
1601                                 'url':          video_url.decode('utf-8'),
1602                                 'uploader':     video_uploader.decode('utf-8'),
1603                                 'upload_date':  u'NA',
1604                                 'title':        video_title,
1605                                 'stitle':       simple_title,
1606                                 'ext':          video_extension.decode('utf-8'),
1607                                 'format':       u'NA',
1608                                 'player_url':   None,
1609                         })
1610                 except UnavailableVideoError:
1611                         self._downloader.trouble(u'\nERROR: unable to download video')
1612
1613
1614 class GoogleIE(InfoExtractor):
1615         """Information extractor for video.google.com."""
1616
1617         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1618         IE_NAME = u'video.google'
1619
1620         def __init__(self, downloader=None):
1621                 InfoExtractor.__init__(self, downloader)
1622
1623         def report_download_webpage(self, video_id):
1624                 """Report webpage download."""
1625                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1626
1627         def report_extraction(self, video_id):
1628                 """Report information extraction."""
1629                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1630
1631         def _real_initialize(self):
1632                 return
1633
1634         def _real_extract(self, url):
1635                 # Extract id from URL
1636                 mobj = re.match(self._VALID_URL, url)
1637                 if mobj is None:
1638                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1639                         return
1640
1641                 # At this point we have a new video
1642                 self._downloader.increment_downloads()
1643                 video_id = mobj.group(1)
1644
1645                 video_extension = 'mp4'
1646
1647                 # Retrieve video webpage to extract further information
1648                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1649                 try:
1650                         self.report_download_webpage(video_id)
1651                         webpage = urllib2.urlopen(request).read()
1652                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1653                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1654                         return
1655
1656                 # Extract URL, uploader, and title from webpage
1657                 self.report_extraction(video_id)
1658                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1659                 if mobj is None:
1660                         video_extension = 'flv'
1661                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1662                 if mobj is None:
1663                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1664                         return
1665                 mediaURL = urllib.unquote(mobj.group(1))
1666                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1667                 mediaURL = mediaURL.replace('\\x26', '\x26')
1668
1669                 video_url = mediaURL
1670
1671                 mobj = re.search(r'<title>(.*)</title>', webpage)
1672                 if mobj is None:
1673                         self._downloader.trouble(u'ERROR: unable to extract title')
1674                         return
1675                 video_title = mobj.group(1).decode('utf-8')
1676                 video_title = sanitize_title(video_title)
1677                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1678
1679                 # Extract video description
1680                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1681                 if mobj is None:
1682                         self._downloader.trouble(u'ERROR: unable to extract video description')
1683                         return
1684                 video_description = mobj.group(1).decode('utf-8')
1685                 if not video_description:
1686                         video_description = 'No description available.'
1687
1688                 # Extract video thumbnail
1689                 if self._downloader.params.get('forcethumbnail', False):
1690                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1691                         try:
1692                                 webpage = urllib2.urlopen(request).read()
1693                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1694                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1695                                 return
1696                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1697                         if mobj is None:
1698                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1699                                 return
1700                         video_thumbnail = mobj.group(1)
1701                 else:   # we need something to pass to process_info
1702                         video_thumbnail = ''
1703
1704                 try:
1705                         # Process video information
1706                         self._downloader.process_info({
1707                                 'id':           video_id.decode('utf-8'),
1708                                 'url':          video_url.decode('utf-8'),
1709                                 'uploader':     u'NA',
1710                                 'upload_date':  u'NA',
1711                                 'title':        video_title,
1712                                 'stitle':       simple_title,
1713                                 'ext':          video_extension.decode('utf-8'),
1714                                 'format':       u'NA',
1715                                 'player_url':   None,
1716                         })
1717                 except UnavailableVideoError:
1718                         self._downloader.trouble(u'\nERROR: unable to download video')
1719
1720
1721 class PhotobucketIE(InfoExtractor):
1722         """Information extractor for photobucket.com."""
1723
1724         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1725         IE_NAME = u'photobucket'
1726
1727         def __init__(self, downloader=None):
1728                 InfoExtractor.__init__(self, downloader)
1729
1730         def report_download_webpage(self, video_id):
1731                 """Report webpage download."""
1732                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1733
1734         def report_extraction(self, video_id):
1735                 """Report information extraction."""
1736                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1737
1738         def _real_initialize(self):
1739                 return
1740
1741         def _real_extract(self, url):
1742                 # Extract id from URL
1743                 mobj = re.match(self._VALID_URL, url)
1744                 if mobj is None:
1745                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1746                         return
1747
1748                 # At this point we have a new video
1749                 self._downloader.increment_downloads()
1750                 video_id = mobj.group(1)
1751
1752                 video_extension = 'flv'
1753
1754                 # Retrieve video webpage to extract further information
1755                 request = urllib2.Request(url)
1756                 try:
1757                         self.report_download_webpage(video_id)
1758                         webpage = urllib2.urlopen(request).read()
1759                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1760                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1761                         return
1762
1763                 # Extract URL, uploader, and title from webpage
1764                 self.report_extraction(video_id)
1765                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1766                 if mobj is None:
1767                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1768                         return
1769                 mediaURL = urllib.unquote(mobj.group(1))
1770
1771                 video_url = mediaURL
1772
1773                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1774                 if mobj is None:
1775                         self._downloader.trouble(u'ERROR: unable to extract title')
1776                         return
1777                 video_title = mobj.group(1).decode('utf-8')
1778                 video_title = sanitize_title(video_title)
1779                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1780
1781                 video_uploader = mobj.group(2).decode('utf-8')
1782
1783                 try:
1784                         # Process video information
1785                         self._downloader.process_info({
1786                                 'id':           video_id.decode('utf-8'),
1787                                 'url':          video_url.decode('utf-8'),
1788                                 'uploader':     video_uploader,
1789                                 'upload_date':  u'NA',
1790                                 'title':        video_title,
1791                                 'stitle':       simple_title,
1792                                 'ext':          video_extension.decode('utf-8'),
1793                                 'format':       u'NA',
1794                                 'player_url':   None,
1795                         })
1796                 except UnavailableVideoError:
1797                         self._downloader.trouble(u'\nERROR: unable to download video')
1798
1799
1800 class YahooIE(InfoExtractor):
1801         """Information extractor for video.yahoo.com."""
1802
1803         # _VALID_URL matches all Yahoo! Video URLs
1804         # _VPAGE_URL matches only the extractable '/watch/' URLs
1805         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1806         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1807         IE_NAME = u'video.yahoo'
1808
1809         def __init__(self, downloader=None):
1810                 InfoExtractor.__init__(self, downloader)
1811
1812         def report_download_webpage(self, video_id):
1813                 """Report webpage download."""
1814                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1815
1816         def report_extraction(self, video_id):
1817                 """Report information extraction."""
1818                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1819
1820         def _real_initialize(self):
1821                 return
1822
1823         def _real_extract(self, url, new_video=True):
1824                 # Extract ID from URL
1825                 mobj = re.match(self._VALID_URL, url)
1826                 if mobj is None:
1827                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1828                         return
1829
1830                 # At this point we have a new video
1831                 self._downloader.increment_downloads()
1832                 video_id = mobj.group(2)
1833                 video_extension = 'flv'
1834
1835                 # Rewrite valid but non-extractable URLs as
1836                 # extractable English language /watch/ URLs
1837                 if re.match(self._VPAGE_URL, url) is None:
1838                         request = urllib2.Request(url)
1839                         try:
1840                                 webpage = urllib2.urlopen(request).read()
1841                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1842                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1843                                 return
1844
1845                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1846                         if mobj is None:
1847                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1848                                 return
1849                         yahoo_id = mobj.group(1)
1850
1851                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1852                         if mobj is None:
1853                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1854                                 return
1855                         yahoo_vid = mobj.group(1)
1856
1857                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1858                         return self._real_extract(url, new_video=False)
1859
1860                 # Retrieve video webpage to extract further information
1861                 request = urllib2.Request(url)
1862                 try:
1863                         self.report_download_webpage(video_id)
1864                         webpage = urllib2.urlopen(request).read()
1865                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1866                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1867                         return
1868
1869                 # Extract uploader and title from webpage
1870                 self.report_extraction(video_id)
1871                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1872                 if mobj is None:
1873                         self._downloader.trouble(u'ERROR: unable to extract video title')
1874                         return
1875                 video_title = mobj.group(1).decode('utf-8')
1876                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1877
1878                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1879                 if mobj is None:
1880                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1881                         return
1882                 video_uploader = mobj.group(1).decode('utf-8')
1883
1884                 # Extract video thumbnail
1885                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1886                 if mobj is None:
1887                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1888                         return
1889                 video_thumbnail = mobj.group(1).decode('utf-8')
1890
1891                 # Extract video description
1892                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1893                 if mobj is None:
1894                         self._downloader.trouble(u'ERROR: unable to extract video description')
1895                         return
1896                 video_description = mobj.group(1).decode('utf-8')
1897                 if not video_description:
1898                         video_description = 'No description available.'
1899
1900                 # Extract video height and width
1901                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1902                 if mobj is None:
1903                         self._downloader.trouble(u'ERROR: unable to extract video height')
1904                         return
1905                 yv_video_height = mobj.group(1)
1906
1907                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1908                 if mobj is None:
1909                         self._downloader.trouble(u'ERROR: unable to extract video width')
1910                         return
1911                 yv_video_width = mobj.group(1)
1912
1913                 # Retrieve video playlist to extract media URL
1914                 # I'm not completely sure what all these options are, but we
1915                 # seem to need most of them, otherwise the server sends a 401.
1916                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1917                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1918                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1919                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1920                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1921                 try:
1922                         self.report_download_webpage(video_id)
1923                         webpage = urllib2.urlopen(request).read()
1924                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1925                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1926                         return
1927
1928                 # Extract media URL from playlist XML
1929                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1930                 if mobj is None:
1931                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1932                         return
1933                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1934                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1935
1936                 try:
1937                         # Process video information
1938                         self._downloader.process_info({
1939                                 'id':           video_id.decode('utf-8'),
1940                                 'url':          video_url,
1941                                 'uploader':     video_uploader,
1942                                 'upload_date':  u'NA',
1943                                 'title':        video_title,
1944                                 'stitle':       simple_title,
1945                                 'ext':          video_extension.decode('utf-8'),
1946                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1947                                 'description':  video_description,
1948                                 'thumbnail':    video_thumbnail,
1949                                 'player_url':   None,
1950                         })
1951                 except UnavailableVideoError:
1952                         self._downloader.trouble(u'\nERROR: unable to download video')
1953
1954
1955 class VimeoIE(InfoExtractor):
1956         """Information extractor for vimeo.com."""
1957
1958         # _VALID_URL matches Vimeo URLs
1959         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1960         IE_NAME = u'vimeo'
1961
1962         def __init__(self, downloader=None):
1963                 InfoExtractor.__init__(self, downloader)
1964
1965         def report_download_webpage(self, video_id):
1966                 """Report webpage download."""
1967                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1968
1969         def report_extraction(self, video_id):
1970                 """Report information extraction."""
1971                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1972
1973         def _real_initialize(self):
1974                 return
1975
1976         def _real_extract(self, url, new_video=True):
1977                 # Extract ID from URL
1978                 mobj = re.match(self._VALID_URL, url)
1979                 if mobj is None:
1980                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1981                         return
1982
1983                 # At this point we have a new video
1984                 self._downloader.increment_downloads()
1985                 video_id = mobj.group(1)
1986
1987                 # Retrieve video webpage to extract further information
1988                 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1989                 try:
1990                         self.report_download_webpage(video_id)
1991                         webpage = urllib2.urlopen(request).read()
1992                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1993                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1994                         return
1995
1996                 # Now we begin extracting as much information as we can from what we
1997                 # retrieved. First we extract the information common to all extractors,
1998                 # and latter we extract those that are Vimeo specific.
1999                 self.report_extraction(video_id)
2000
2001                 # Extract title
2002                 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2003                 if mobj is None:
2004                         self._downloader.trouble(u'ERROR: unable to extract video title')
2005                         return
2006                 video_title = mobj.group(1).decode('utf-8')
2007                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2008
2009                 # Extract uploader
2010                 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2011                 if mobj is None:
2012                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
2013                         return
2014                 video_uploader = mobj.group(1).decode('utf-8')
2015
2016                 # Extract video thumbnail
2017                 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2018                 if mobj is None:
2019                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2020                         return
2021                 video_thumbnail = mobj.group(1).decode('utf-8')
2022
2023                 # # Extract video description
2024                 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2025                 # if mobj is None:
2026                 #       self._downloader.trouble(u'ERROR: unable to extract video description')
2027                 #       return
2028                 # video_description = mobj.group(1).decode('utf-8')
2029                 # if not video_description: video_description = 'No description available.'
2030                 video_description = 'Foo.'
2031
2032                 # Vimeo specific: extract request signature
2033                 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2034                 if mobj is None:
2035                         self._downloader.trouble(u'ERROR: unable to extract request signature')
2036                         return
2037                 sig = mobj.group(1).decode('utf-8')
2038
2039                 # Vimeo specific: Extract request signature expiration
2040                 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2041                 if mobj is None:
2042                         self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2043                         return
2044                 sig_exp = mobj.group(1).decode('utf-8')
2045
2046                 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2047
2048                 try:
2049                         # Process video information
2050                         self._downloader.process_info({
2051                                 'id':           video_id.decode('utf-8'),
2052                                 'url':          video_url,
2053                                 'uploader':     video_uploader,
2054                                 'upload_date':  u'NA',
2055                                 'title':        video_title,
2056                                 'stitle':       simple_title,
2057                                 'ext':          u'mp4',
2058                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2059                                 'description':  video_description,
2060                                 'thumbnail':    video_thumbnail,
2061                                 'description':  video_description,
2062                                 'player_url':   None,
2063                         })
2064                 except UnavailableVideoError:
2065                         self._downloader.trouble(u'ERROR: unable to download video')
2066
2067
2068 class GenericIE(InfoExtractor):
2069         """Generic last-resort information extractor."""
2070
2071         _VALID_URL = r'.*'
2072         IE_NAME = u'generic'
2073
2074         def __init__(self, downloader=None):
2075                 InfoExtractor.__init__(self, downloader)
2076
2077         def report_download_webpage(self, video_id):
2078                 """Report webpage download."""
2079                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2080                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2081
2082         def report_extraction(self, video_id):
2083                 """Report information extraction."""
2084                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2085
2086         def _real_initialize(self):
2087                 return
2088
2089         def _real_extract(self, url):
2090                 # At this point we have a new video
2091                 self._downloader.increment_downloads()
2092
2093                 video_id = url.split('/')[-1]
2094                 request = urllib2.Request(url)
2095                 try:
2096                         self.report_download_webpage(video_id)
2097                         webpage = urllib2.urlopen(request).read()
2098                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2099                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2100                         return
2101                 except ValueError, err:
2102                         # since this is the last-resort InfoExtractor, if
2103                         # this error is thrown, it'll be thrown here
2104                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2105                         return
2106
2107                 self.report_extraction(video_id)
2108                 # Start with something easy: JW Player in SWFObject
2109                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2110                 if mobj is None:
2111                         # Broaden the search a little bit
2112                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2113                 if mobj is None:
2114                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2115                         return
2116
2117                 # It's possible that one of the regexes
2118                 # matched, but returned an empty group:
2119                 if mobj.group(1) is None:
2120                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2121                         return
2122
2123                 video_url = urllib.unquote(mobj.group(1))
2124                 video_id = os.path.basename(video_url)
2125
2126                 # here's a fun little line of code for you:
2127                 video_extension = os.path.splitext(video_id)[1][1:]
2128                 video_id = os.path.splitext(video_id)[0]
2129
2130                 # it's tempting to parse this further, but you would
2131                 # have to take into account all the variations like
2132                 #   Video Title - Site Name
2133                 #   Site Name | Video Title
2134                 #   Video Title - Tagline | Site Name
2135                 # and so on and so forth; it's just not practical
2136                 mobj = re.search(r'<title>(.*)</title>', webpage)
2137                 if mobj is None:
2138                         self._downloader.trouble(u'ERROR: unable to extract title')
2139                         return
2140                 video_title = mobj.group(1).decode('utf-8')
2141                 video_title = sanitize_title(video_title)
2142                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2143
2144                 # video uploader is domain name
2145                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2146                 if mobj is None:
2147                         self._downloader.trouble(u'ERROR: unable to extract title')
2148                         return
2149                 video_uploader = mobj.group(1).decode('utf-8')
2150
2151                 try:
2152                         # Process video information
2153                         self._downloader.process_info({
2154                                 'id':           video_id.decode('utf-8'),
2155                                 'url':          video_url.decode('utf-8'),
2156                                 'uploader':     video_uploader,
2157                                 'upload_date':  u'NA',
2158                                 'title':        video_title,
2159                                 'stitle':       simple_title,
2160                                 'ext':          video_extension.decode('utf-8'),
2161                                 'format':       u'NA',
2162                                 'player_url':   None,
2163                         })
2164                 except UnavailableVideoError, err:
2165                         self._downloader.trouble(u'\nERROR: unable to download video')
2166
2167
2168 class YoutubeSearchIE(InfoExtractor):
2169         """Information Extractor for YouTube search queries."""
2170         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2171         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2172         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2173         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2174         _youtube_ie = None
2175         _max_youtube_results = 1000
2176         IE_NAME = u'youtube:search'
2177
2178         def __init__(self, youtube_ie, downloader=None):
2179                 InfoExtractor.__init__(self, downloader)
2180                 self._youtube_ie = youtube_ie
2181
2182         def report_download_page(self, query, pagenum):
2183                 """Report attempt to download playlist page with given number."""
2184                 query = query.decode(preferredencoding())
2185                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2186
2187         def _real_initialize(self):
2188                 self._youtube_ie.initialize()
2189
2190         def _real_extract(self, query):
2191                 mobj = re.match(self._VALID_URL, query)
2192                 if mobj is None:
2193                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2194                         return
2195
2196                 prefix, query = query.split(':')
2197                 prefix = prefix[8:]
2198                 query = query.encode('utf-8')
2199                 if prefix == '':
2200                         self._download_n_results(query, 1)
2201                         return
2202                 elif prefix == 'all':
2203                         self._download_n_results(query, self._max_youtube_results)
2204                         return
2205                 else:
2206                         try:
2207                                 n = long(prefix)
2208                                 if n <= 0:
2209                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2210                                         return
2211                                 elif n > self._max_youtube_results:
2212                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2213                                         n = self._max_youtube_results
2214                                 self._download_n_results(query, n)
2215                                 return
2216                         except ValueError: # parsing prefix as integer fails
2217                                 self._download_n_results(query, 1)
2218                                 return
2219
2220         def _download_n_results(self, query, n):
2221                 """Downloads a specified number of results for a query"""
2222
2223                 video_ids = []
2224                 already_seen = set()
2225                 pagenum = 1
2226
2227                 while True:
2228                         self.report_download_page(query, pagenum)
2229                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2230                         request = urllib2.Request(result_url)
2231                         try:
2232                                 page = urllib2.urlopen(request).read()
2233                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2234                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2235                                 return
2236
2237                         # Extract video identifiers
2238                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2239                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2240                                 if video_id not in already_seen:
2241                                         video_ids.append(video_id)
2242                                         already_seen.add(video_id)
2243                                         if len(video_ids) == n:
2244                                                 # Specified n videos reached
2245                                                 for id in video_ids:
2246                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2247                                                 return
2248
2249                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2250                                 for id in video_ids:
2251                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2252                                 return
2253
2254                         pagenum = pagenum + 1
2255
2256
2257 class GoogleSearchIE(InfoExtractor):
2258         """Information Extractor for Google Video search queries."""
2259         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2260         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2261         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2262         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2263         _google_ie = None
2264         _max_google_results = 1000
2265         IE_NAME = u'video.google:search'
2266
2267         def __init__(self, google_ie, downloader=None):
2268                 InfoExtractor.__init__(self, downloader)
2269                 self._google_ie = google_ie
2270
2271         def report_download_page(self, query, pagenum):
2272                 """Report attempt to download playlist page with given number."""
2273                 query = query.decode(preferredencoding())
2274                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2275
2276         def _real_initialize(self):
2277                 self._google_ie.initialize()
2278
2279         def _real_extract(self, query):
2280                 mobj = re.match(self._VALID_URL, query)
2281                 if mobj is None:
2282                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2283                         return
2284
2285                 prefix, query = query.split(':')
2286                 prefix = prefix[8:]
2287                 query = query.encode('utf-8')
2288                 if prefix == '':
2289                         self._download_n_results(query, 1)
2290                         return
2291                 elif prefix == 'all':
2292                         self._download_n_results(query, self._max_google_results)
2293                         return
2294                 else:
2295                         try:
2296                                 n = long(prefix)
2297                                 if n <= 0:
2298                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2299                                         return
2300                                 elif n > self._max_google_results:
2301                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2302                                         n = self._max_google_results
2303                                 self._download_n_results(query, n)
2304                                 return
2305                         except ValueError: # parsing prefix as integer fails
2306                                 self._download_n_results(query, 1)
2307                                 return
2308
2309         def _download_n_results(self, query, n):
2310                 """Downloads a specified number of results for a query"""
2311
2312                 video_ids = []
2313                 already_seen = set()
2314                 pagenum = 1
2315
2316                 while True:
2317                         self.report_download_page(query, pagenum)
2318                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2319                         request = urllib2.Request(result_url)
2320                         try:
2321                                 page = urllib2.urlopen(request).read()
2322                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2323                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2324                                 return
2325
2326                         # Extract video identifiers
2327                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2328                                 video_id = mobj.group(1)
2329                                 if video_id not in already_seen:
2330                                         video_ids.append(video_id)
2331                                         already_seen.add(video_id)
2332                                         if len(video_ids) == n:
2333                                                 # Specified n videos reached
2334                                                 for id in video_ids:
2335                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2336                                                 return
2337
2338                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2339                                 for id in video_ids:
2340                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2341                                 return
2342
2343                         pagenum = pagenum + 1
2344
2345
2346 class YahooSearchIE(InfoExtractor):
2347         """Information Extractor for Yahoo! Video search queries."""
2348         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2349         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2350         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2351         _MORE_PAGES_INDICATOR = r'\s*Next'
2352         _yahoo_ie = None
2353         _max_yahoo_results = 1000
2354         IE_NAME = u'video.yahoo:search'
2355
2356         def __init__(self, yahoo_ie, downloader=None):
2357                 InfoExtractor.__init__(self, downloader)
2358                 self._yahoo_ie = yahoo_ie
2359
2360         def report_download_page(self, query, pagenum):
2361                 """Report attempt to download playlist page with given number."""
2362                 query = query.decode(preferredencoding())
2363                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2364
2365         def _real_initialize(self):
2366                 self._yahoo_ie.initialize()
2367
2368         def _real_extract(self, query):
2369                 mobj = re.match(self._VALID_URL, query)
2370                 if mobj is None:
2371                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2372                         return
2373
2374                 prefix, query = query.split(':')
2375                 prefix = prefix[8:]
2376                 query = query.encode('utf-8')
2377                 if prefix == '':
2378                         self._download_n_results(query, 1)
2379                         return
2380                 elif prefix == 'all':
2381                         self._download_n_results(query, self._max_yahoo_results)
2382                         return
2383                 else:
2384                         try:
2385                                 n = long(prefix)
2386                                 if n <= 0:
2387                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2388                                         return
2389                                 elif n > self._max_yahoo_results:
2390                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2391                                         n = self._max_yahoo_results
2392                                 self._download_n_results(query, n)
2393                                 return
2394                         except ValueError: # parsing prefix as integer fails
2395                                 self._download_n_results(query, 1)
2396                                 return
2397
2398         def _download_n_results(self, query, n):
2399                 """Downloads a specified number of results for a query"""
2400
2401                 video_ids = []
2402                 already_seen = set()
2403                 pagenum = 1
2404
2405                 while True:
2406                         self.report_download_page(query, pagenum)
2407                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2408                         request = urllib2.Request(result_url)
2409                         try:
2410                                 page = urllib2.urlopen(request).read()
2411                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2412                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2413                                 return
2414
2415                         # Extract video identifiers
2416                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2417                                 video_id = mobj.group(1)
2418                                 if video_id not in already_seen:
2419                                         video_ids.append(video_id)
2420                                         already_seen.add(video_id)
2421                                         if len(video_ids) == n:
2422                                                 # Specified n videos reached
2423                                                 for id in video_ids:
2424                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2425                                                 return
2426
2427                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2428                                 for id in video_ids:
2429                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2430                                 return
2431
2432                         pagenum = pagenum + 1
2433
2434
2435 class YoutubePlaylistIE(InfoExtractor):
2436         """Information Extractor for YouTube playlists."""
2437
2438         _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2439         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2440         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2441         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2442         _youtube_ie = None
2443         IE_NAME = u'youtube:playlist'
2444
2445         def __init__(self, youtube_ie, downloader=None):
2446                 InfoExtractor.__init__(self, downloader)
2447                 self._youtube_ie = youtube_ie
2448
2449         def report_download_page(self, playlist_id, pagenum):
2450                 """Report attempt to download playlist page with given number."""
2451                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2452
2453         def _real_initialize(self):
2454                 self._youtube_ie.initialize()
2455
2456         def _real_extract(self, url):
2457                 # Extract playlist id
2458                 mobj = re.match(self._VALID_URL, url)
2459                 if mobj is None:
2460                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2461                         return
2462
2463                 # Single video case
2464                 if mobj.group(3) is not None:
2465                         self._youtube_ie.extract(mobj.group(3))
2466                         return
2467
2468                 # Download playlist pages
2469                 # prefix is 'p' as default for playlists but there are other types that need extra care
2470                 playlist_prefix = mobj.group(1)
2471                 if playlist_prefix == 'a':
2472                         playlist_access = 'artist'
2473                 else:
2474                         playlist_prefix = 'p'
2475                         playlist_access = 'view_play_list'
2476                 playlist_id = mobj.group(2)
2477                 video_ids = []
2478                 pagenum = 1
2479
2480                 while True:
2481                         self.report_download_page(playlist_id, pagenum)
2482                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2483                         try:
2484                                 page = urllib2.urlopen(request).read()
2485                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2486                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2487                                 return
2488
2489                         # Extract video identifiers
2490                         ids_in_page = []
2491                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2492                                 if mobj.group(1) not in ids_in_page:
2493                                         ids_in_page.append(mobj.group(1))
2494                         video_ids.extend(ids_in_page)
2495
2496                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2497                                 break
2498                         pagenum = pagenum + 1
2499
2500                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2501                 playlistend = self._downloader.params.get('playlistend', -1)
2502                 video_ids = video_ids[playliststart:playlistend]
2503
2504                 for id in video_ids:
2505                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2506                 return
2507
2508
2509 class YoutubeUserIE(InfoExtractor):
2510         """Information Extractor for YouTube users."""
2511
2512         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2513         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2514         _GDATA_PAGE_SIZE = 50
2515         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2516         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2517         _youtube_ie = None
2518         IE_NAME = u'youtube:user'
2519
2520         def __init__(self, youtube_ie, downloader=None):
2521                 InfoExtractor.__init__(self, downloader)
2522                 self._youtube_ie = youtube_ie
2523
2524         def report_download_page(self, username, start_index):
2525                 """Report attempt to download user page."""
2526                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2527                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2528
2529         def _real_initialize(self):
2530                 self._youtube_ie.initialize()
2531
2532         def _real_extract(self, url):
2533                 # Extract username
2534                 mobj = re.match(self._VALID_URL, url)
2535                 if mobj is None:
2536                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2537                         return
2538
2539                 username = mobj.group(1)
2540
2541                 # Download video ids using YouTube Data API. Result size per
2542                 # query is limited (currently to 50 videos) so we need to query
2543                 # page by page until there are no video ids - it means we got
2544                 # all of them.
2545
2546                 video_ids = []
2547                 pagenum = 0
2548
2549                 while True:
2550                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2551                         self.report_download_page(username, start_index)
2552
2553                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2554
2555                         try:
2556                                 page = urllib2.urlopen(request).read()
2557                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2558                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2559                                 return
2560
2561                         # Extract video identifiers
2562                         ids_in_page = []
2563
2564                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2565                                 if mobj.group(1) not in ids_in_page:
2566                                         ids_in_page.append(mobj.group(1))
2567
2568                         video_ids.extend(ids_in_page)
2569
2570                         # A little optimization - if current page is not
2571                         # "full", ie. does not contain PAGE_SIZE video ids then
2572                         # we can assume that this page is the last one - there
2573                         # are no more ids on further pages - no need to query
2574                         # again.
2575
2576                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2577                                 break
2578
2579                         pagenum += 1
2580
2581                 all_ids_count = len(video_ids)
2582                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2583                 playlistend = self._downloader.params.get('playlistend', -1)
2584
2585                 if playlistend == -1:
2586                         video_ids = video_ids[playliststart:]
2587                 else:
2588                         video_ids = video_ids[playliststart:playlistend]
2589
2590                 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2591                                 (username, all_ids_count, len(video_ids)))
2592
2593                 for video_id in video_ids:
2594                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2595
2596
2597 class DepositFilesIE(InfoExtractor):
2598         """Information extractor for depositfiles.com"""
2599
2600         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2601         IE_NAME = u'DepositFiles'
2602
2603         def __init__(self, downloader=None):
2604                 InfoExtractor.__init__(self, downloader)
2605
2606         def report_download_webpage(self, file_id):
2607                 """Report webpage download."""
2608                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2609
2610         def report_extraction(self, file_id):
2611                 """Report information extraction."""
2612                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2613
2614         def _real_initialize(self):
2615                 return
2616
2617         def _real_extract(self, url):
2618                 # At this point we have a new file
2619                 self._downloader.increment_downloads()
2620
2621                 file_id = url.split('/')[-1]
2622                 # Rebuild url in english locale
2623                 url = 'http://depositfiles.com/en/files/' + file_id
2624
2625                 # Retrieve file webpage with 'Free download' button pressed
2626                 free_download_indication = { 'gateway_result' : '1' }
2627                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2628                 try:
2629                         self.report_download_webpage(file_id)
2630                         webpage = urllib2.urlopen(request).read()
2631                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2632                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2633                         return
2634
2635                 # Search for the real file URL
2636                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2637                 if (mobj is None) or (mobj.group(1) is None):
2638                         # Try to figure out reason of the error.
2639                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2640                         if (mobj is not None) and (mobj.group(1) is not None):
2641                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2642                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2643                         else:
2644                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2645                         return
2646
2647                 file_url = mobj.group(1)
2648                 file_extension = os.path.splitext(file_url)[1][1:]
2649
2650                 # Search for file title
2651                 mobj = re.search(r'<b title="(.*?)">', webpage)
2652                 if mobj is None:
2653                         self._downloader.trouble(u'ERROR: unable to extract title')
2654                         return
2655                 file_title = mobj.group(1).decode('utf-8')
2656
2657                 try:
2658                         # Process file information
2659                         self._downloader.process_info({
2660                                 'id':           file_id.decode('utf-8'),
2661                                 'url':          file_url.decode('utf-8'),
2662                                 'uploader':     u'NA',
2663                                 'upload_date':  u'NA',
2664                                 'title':        file_title,
2665                                 'stitle':       file_title,
2666                                 'ext':          file_extension.decode('utf-8'),
2667                                 'format':       u'NA',
2668                                 'player_url':   None,
2669                         })
2670                 except UnavailableVideoError, err:
2671                         self._downloader.trouble(u'ERROR: unable to download file')
2672
2673
2674 class FacebookIE(InfoExtractor):
2675         """Information Extractor for Facebook"""
2676
2677         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/video/video\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2678         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2679         _NETRC_MACHINE = 'facebook'
2680         _available_formats = ['highqual', 'lowqual']
2681         _video_extensions = {
2682                 'highqual': 'mp4',
2683                 'lowqual': 'mp4',
2684         }
2685         IE_NAME = u'facebook'
2686
2687         def __init__(self, downloader=None):
2688                 InfoExtractor.__init__(self, downloader)
2689
2690         def _reporter(self, message):
2691                 """Add header and report message."""
2692                 self._downloader.to_screen(u'[facebook] %s' % message)
2693
2694         def report_login(self):
2695                 """Report attempt to log in."""
2696                 self._reporter(u'Logging in')
2697
2698         def report_video_webpage_download(self, video_id):
2699                 """Report attempt to download video webpage."""
2700                 self._reporter(u'%s: Downloading video webpage' % video_id)
2701
2702         def report_information_extraction(self, video_id):
2703                 """Report attempt to extract video information."""
2704                 self._reporter(u'%s: Extracting video information' % video_id)
2705
2706         def _parse_page(self, video_webpage):
2707                 """Extract video information from page"""
2708                 # General data
2709                 data = {'title': r'class="video_title datawrap">(.*?)</',
2710                         'description': r'<div class="datawrap">(.*?)</div>',
2711                         'owner': r'\("video_owner_name", "(.*?)"\)',
2712                         'upload_date': r'data-date="(.*?)"',
2713                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2714                         }
2715                 video_info = {}
2716                 for piece in data.keys():
2717                         mobj = re.search(data[piece], video_webpage)
2718                         if mobj is not None:
2719                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2720
2721                 # Video urls
2722                 video_urls = {}
2723                 for fmt in self._available_formats:
2724                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2725                         if mobj is not None:
2726                                 # URL is in a Javascript segment inside an escaped Unicode format within
2727                                 # the generally utf-8 page
2728                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2729                 video_info['video_urls'] = video_urls
2730
2731                 return video_info
2732
2733         def _real_initialize(self):
2734                 if self._downloader is None:
2735                         return
2736
2737                 useremail = None
2738                 password = None
2739                 downloader_params = self._downloader.params
2740
2741                 # Attempt to use provided username and password or .netrc data
2742                 if downloader_params.get('username', None) is not None:
2743                         useremail = downloader_params['username']
2744                         password = downloader_params['password']
2745                 elif downloader_params.get('usenetrc', False):
2746                         try:
2747                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2748                                 if info is not None:
2749                                         useremail = info[0]
2750                                         password = info[2]
2751                                 else:
2752                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2753                         except (IOError, netrc.NetrcParseError), err:
2754                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2755                                 return
2756
2757                 if useremail is None:
2758                         return
2759
2760                 # Log in
2761                 login_form = {
2762                         'email': useremail,
2763                         'pass': password,
2764                         'login': 'Log+In'
2765                         }
2766                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2767                 try:
2768                         self.report_login()
2769                         login_results = urllib2.urlopen(request).read()
2770                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2771                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2772                                 return
2773                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2774                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2775                         return
2776
2777         def _real_extract(self, url):
2778                 mobj = re.match(self._VALID_URL, url)
2779                 if mobj is None:
2780                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2781                         return
2782                 video_id = mobj.group('ID')
2783
2784                 # Get video webpage
2785                 self.report_video_webpage_download(video_id)
2786                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2787                 try:
2788                         page = urllib2.urlopen(request)
2789                         video_webpage = page.read()
2790                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2791                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2792                         return
2793
2794                 # Start extracting information
2795                 self.report_information_extraction(video_id)
2796
2797                 # Extract information
2798                 video_info = self._parse_page(video_webpage)
2799
2800                 # uploader
2801                 if 'owner' not in video_info:
2802                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2803                         return
2804                 video_uploader = video_info['owner']
2805
2806                 # title
2807                 if 'title' not in video_info:
2808                         self._downloader.trouble(u'ERROR: unable to extract video title')
2809                         return
2810                 video_title = video_info['title']
2811                 video_title = video_title.decode('utf-8')
2812                 video_title = sanitize_title(video_title)
2813
2814                 # simplified title
2815                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2816                 simple_title = simple_title.strip(ur'_')
2817
2818                 # thumbnail image
2819                 if 'thumbnail' not in video_info:
2820                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2821                         video_thumbnail = ''
2822                 else:
2823                         video_thumbnail = video_info['thumbnail']
2824
2825                 # upload date
2826                 upload_date = u'NA'
2827                 if 'upload_date' in video_info:
2828                         upload_time = video_info['upload_date']
2829                         timetuple = email.utils.parsedate_tz(upload_time)
2830                         if timetuple is not None:
2831                                 try:
2832                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2833                                 except:
2834                                         pass
2835
2836                 # description
2837                 video_description = video_info.get('description', 'No description available.')
2838
2839                 url_map = video_info['video_urls']
2840                 if len(url_map.keys()) > 0:
2841                         # Decide which formats to download
2842                         req_format = self._downloader.params.get('format', None)
2843                         format_limit = self._downloader.params.get('format_limit', None)
2844
2845                         if format_limit is not None and format_limit in self._available_formats:
2846                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2847                         else:
2848                                 format_list = self._available_formats
2849                         existing_formats = [x for x in format_list if x in url_map]
2850                         if len(existing_formats) == 0:
2851                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2852                                 return
2853                         if req_format is None:
2854                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2855                         elif req_format == 'worst':
2856                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2857                         elif req_format == '-1':
2858                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2859                         else:
2860                                 # Specific format
2861                                 if req_format not in url_map:
2862                                         self._downloader.trouble(u'ERROR: requested format not available')
2863                                         return
2864                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2865
2866                 for format_param, video_real_url in video_url_list:
2867
2868                         # At this point we have a new video
2869                         self._downloader.increment_downloads()
2870
2871                         # Extension
2872                         video_extension = self._video_extensions.get(format_param, 'mp4')
2873
2874                         try:
2875                                 # Process video information
2876                                 self._downloader.process_info({
2877                                         'id':           video_id.decode('utf-8'),
2878                                         'url':          video_real_url.decode('utf-8'),
2879                                         'uploader':     video_uploader.decode('utf-8'),
2880                                         'upload_date':  upload_date,
2881                                         'title':        video_title,
2882                                         'stitle':       simple_title,
2883                                         'ext':          video_extension.decode('utf-8'),
2884                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2885                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2886                                         'description':  video_description.decode('utf-8'),
2887                                         'player_url':   None,
2888                                 })
2889                         except UnavailableVideoError, err:
2890                                 self._downloader.trouble(u'\nERROR: unable to download video')
2891
class BlipTVIE(InfoExtractor):
	"""Information extractor for blip.tv"""

	# Any path on blip.tv; group(1) (the path) is used only in progress messages.
	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
	# Captures the lowercase alphanumeric filename extension of a media URL.
	_URL_EXT = r'^.*\.([a-z0-9]+)$'
	IE_NAME = u'blip.tv'

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

	def report_direct_download(self, title):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

	def _simplify_title(self, title):
		"""Collapse runs of characters outside simple_title_chars into '_'
		and strip leading/trailing underscores."""
		res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
		res = res.strip(ur'_')
		return res

	def _real_extract(self, url):
		"""Extract video information from a blip.tv URL.

		Two cases are handled:
		* the URL answers with a video/* Content-Type (direct link to a
		  media file): build the info dict from the URL itself and pass
		  the already-open response along via 'urlhandle';
		* a regular page URL: re-request it with skin=json appended and
		  parse the JSON description of the post.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Append the JSON-skin parameters with the appropriate separator.
		if '?' in url:
			cchar = '&'
		else:
			cchar = '?'
		json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
		request = urllib2.Request(json_url)
		self.report_extraction(mobj.group(1))
		# info stays None unless the direct-download branch fills it in.
		info = None
		try:
			urlh = urllib2.urlopen(request)
			if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
				basename = url.split('/')[-1]
				title,ext = os.path.splitext(basename)
				ext = ext.replace('.', '')
				self.report_direct_download(title)
				info = {
					'id': title,
					'url': url,
					'title': title,
					'stitle': self._simplify_title(title),
					'ext': ext,
					'urlhandle': urlh  # downloader reuses this open response
				}
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
			return
		if info is None: # Regular URL
			# NOTE: urlh deliberately escapes the try block above; it is only
			# read here, once the direct-download case has been ruled out.
			try:
				json_code = urlh.read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
				return

			try:
				# NOTE(review): the 'json' name is not among the imports
				# visible in this chunk; presumably it is provided elsewhere
				# in the file (e.g. a simplejson fallback) — confirm.
				json_data = json.loads(json_code)
				if 'Post' in json_data:
					data = json_data['Post']
				else:
					data = json_data

				# e.g. '09-18-11 05:12PM' -> '20110918'
				upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
				video_url = data['media']['url']
				umobj = re.match(self._URL_EXT, video_url)
				if umobj is None:
					raise ValueError('Can not determine filename extension')
				ext = umobj.group(1)

				info = {
					'id': data['item_id'],
					'url': video_url,
					'uploader': data['display_name'],
					'upload_date': upload_date,
					'title': data['title'],
					'stitle': self._simplify_title(data['title']),
					'ext': ext,
					'format': data['media']['mimeType'],
					'thumbnail': data['thumbnailUrl'],
					'description': data['description'],
					'player_url': data['embedUrl']
				}
			except (ValueError,KeyError), err:
				self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
				return

		self._downloader.increment_downloads()

		try:
			self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download video')
2988
2989
2990 class MyVideoIE(InfoExtractor):
2991         """Information Extractor for myvideo.de."""
2992
2993         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2994         IE_NAME = u'myvideo'
2995
2996         def __init__(self, downloader=None):
2997                 InfoExtractor.__init__(self, downloader)
2998         
2999         def report_download_webpage(self, video_id):
3000                 """Report webpage download."""
3001                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3002
3003         def report_extraction(self, video_id):
3004                 """Report information extraction."""
3005                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3006
3007         def _real_initialize(self):
3008                 return
3009
3010         def _real_extract(self,url):
3011                 mobj = re.match(self._VALID_URL, url)
3012                 if mobj is None:
3013                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
3014                         return
3015
3016                 video_id = mobj.group(1)
3017                 simple_title = mobj.group(2).decode('utf-8')
3018                 # should actually not be necessary
3019                 simple_title = sanitize_title(simple_title)
3020                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title)
3021
3022                 # Get video webpage
3023                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3024                 try:
3025                         self.report_download_webpage(video_id)
3026                         webpage = urllib2.urlopen(request).read()
3027                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3028                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3029                         return
3030
3031                 self.report_extraction(video_id)
3032                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3033                                  webpage)
3034                 if mobj is None:
3035                         self._downloader.trouble(u'ERROR: unable to extract media URL')
3036                         return
3037                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3038
3039                 mobj = re.search('<title>([^<]+)</title>', webpage)
3040                 if mobj is None:
3041                         self._downloader.trouble(u'ERROR: unable to extract title')
3042                         return
3043
3044                 video_title = mobj.group(1)
3045                 video_title = sanitize_title(video_title)
3046
3047                 try:
3048                         self._downloader.process_info({
3049                                 'id':           video_id,
3050                                 'url':          video_url,
3051                                 'uploader':     u'NA',
3052                                 'upload_date':  u'NA',
3053                                 'title':        video_title,
3054                                 'stitle':       simple_title,
3055                                 'ext':          u'flv',
3056                                 'format':       u'NA',
3057                                 'player_url':   None,
3058                         })
3059                 except UnavailableVideoError:
3060                         self._downloader.trouble(u'\nERROR: Unable to download video')
3061
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Matches either a short alias (":tds", ":colbertreport", ...) or a
	# full http(s) URL to the full-episodes section of either show's site.
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	def report_extraction(self, episode_id):
		"""Report the start of information extraction."""
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

	def report_config_download(self, episode_id):
		"""Report the download of a per-item media configuration document."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		"""Report the download of the episode's MRSS index feed."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		"""Report the resolution of the Flash player URL."""
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _simplify_title(self, title):
		"""Build a filesystem-safe title by collapsing disallowed characters to '_'."""
		res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
		res = res.strip(ur'_')
		return res

	def _real_extract(self, url):
		"""Download every act of the selected episode (one <item> each)."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# A short alias means "newest episode of that show": rewrite it to
		# the show's full-episodes URL and re-match to fill the named groups.
		if mobj.group('shortname'):
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = 'http://www.thedailyshow.com/full-episodes/'
			else:
				url = 'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# An empty 'episode' group means the newest episode is wanted; the
		# site redirects the bare full-episodes URL to the concrete one.
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			return
		if dlNewest:
			# Pick up the episode title from the redirected URL.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		# The embedded Flash player <param> carries both the raw player URL
		# (group 1) and the mtvn URI identifying the episode (group 2).
		mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
			return

		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			# Follow redirects to get the final player URL for rtmpdump.
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
			return

		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
			return

		# One episode is split into several <item> acts; process each one.
		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
						urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
				return

			# Collect (bitrate, url) pairs for every available rendition.
			cdoc = xml.etree.ElementTree.fromstring(configXml)
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			# For now, just pick the highest bitrate
			format,video_url = turls[-1]

			self._downloader.increment_downloads()

			effTitle = showId + '-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'stitle': self._simplify_title(effTitle),
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': playerUrl
			}

			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
				continue
3201
3202
3203 class EscapistIE(InfoExtractor):
3204         """Information extractor for The Escapist """
3205
3206         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3207         IE_NAME = u'escapist'
3208
3209         def report_extraction(self, showName):
3210                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3211
3212         def report_config_download(self, showName):
3213                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3214
3215         def _simplify_title(self, title):
3216                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3217                 res = res.strip(ur'_')
3218                 return res
3219
3220         def _real_extract(self, url):
3221                 htmlParser = HTMLParser.HTMLParser()
3222
3223                 mobj = re.match(self._VALID_URL, url)
3224                 if mobj is None:
3225                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3226                         return
3227                 showName = mobj.group('showname')
3228                 videoId = mobj.group('episode')
3229
3230                 self.report_extraction(showName)
3231                 try:
3232                         webPage = urllib2.urlopen(url).read()
3233                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3234                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3235                         return
3236
3237                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3238                 description = htmlParser.unescape(descMatch.group(1))
3239                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3240                 imgUrl = htmlParser.unescape(imgMatch.group(1))
3241                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3242                 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3243                 configUrlMatch = re.search('config=(.*)$', playerUrl)
3244                 configUrl = urllib2.unquote(configUrlMatch.group(1))
3245
3246                 self.report_config_download(showName)
3247                 try:
3248                         configJSON = urllib2.urlopen(configUrl).read()
3249                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3250                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3251                         return
3252
3253                 # Technically, it's JavaScript, not JSON
3254                 configJSON = configJSON.replace("'", '"')
3255
3256                 try:
3257                         config = json.loads(configJSON)
3258                 except (ValueError,), err:
3259                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3260                         return
3261
3262                 playlist = config['playlist']
3263                 videoUrl = playlist[1]['url']
3264
3265                 self._downloader.increment_downloads()
3266                 info = {
3267                         'id': videoId,
3268                         'url': videoUrl,
3269                         'uploader': showName,
3270                         'upload_date': None,
3271                         'title': showName,
3272                         'stitle': self._simplify_title(showName),
3273                         'ext': 'flv',
3274                         'format': 'flv',
3275                         'thumbnail': imgUrl,
3276                         'description': description,
3277                         'player_url': playerUrl,
3278                 }
3279
3280                 try:
3281                         self._downloader.process_info(info)
3282                 except UnavailableVideoError, err:
3283                         self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3284
3285
3286
class PostProcessor(object):
	"""Base class for post-processing steps.

	Instances are registered on a downloader via its
	add_post_processor() method. Once a download completes
	successfully, the downloader walks its chain of PostProcessors,
	feeding run() an initial information dictionary and then each
	predecessor's return value.

	The chain halts as soon as a run() call yields None, or when the
	last processor has executed.

	Like InfoExtractor objects, PostProcessors register mutually with
	their downloader.
	"""

	# Downloader this post-processor is attached to (set on construction
	# or via set_downloader()).
	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach the given downloader to this post-processor."""
		self._downloader = downloader

	def run(self, information):
		"""Execute this post-processing step.

		"information" is a dictionary shaped like the ones produced by
		InfoExtractors, extended with a "filepath" entry naming the
		file that was downloaded.

		Returning None halts the post-processing chain. Returning an
		information dictionary (possibly the received one with some
		fields changed) passes it along to the next processor in the
		chain.

		Implementations may also raise a PostProcessingError, which
		the calling downloader takes into account.
		"""
		return information # base implementation: pass through unchanged
3332
3333
class FFmpegExtractAudioPP(PostProcessor):
	"""Post-processor that extracts the audio track of a downloaded video
	into a standalone audio file using the external ffmpeg/ffprobe tools."""

	def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec		# 'best', 'aac', 'mp3' or 'vorbis'
		self._preferredquality = preferredquality	# audio bitrate, passed to ffmpeg -ab
		self._keepvideo = keepvideo			# keep the source video after extraction

	@staticmethod
	def get_audio_codec(path):
		"""Return the name of the audio codec of the file at path, as
		reported by ffprobe, or None if it cannot be determined."""
		try:
			cmd = ['ffprobe', '-show_streams', '--', path]
			# BUG FIX: the devnull handle used to be leaked; close it once
			# ffprobe has finished.
			devnull = open(os.path.devnull, 'w')
			try:
				handle = subprocess.Popen(cmd, stderr=devnull, stdout=subprocess.PIPE)
				output = handle.communicate()[0]
				if handle.wait() != 0:
					return None
			finally:
				devnull.close()
		except (IOError, OSError):
			return None
		audio_codec = None
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				# codec_type follows codec_name within a stream section, so
				# the remembered codec_name belongs to this audio stream.
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Run ffmpeg to copy/transcode the audio of path into out_path.
		Returns True on success, False otherwise."""
		try:
			cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
			# BUG FIX: close the devnull handle instead of leaking it.
			devnull = open(os.path.devnull, 'w')
			try:
				ret = subprocess.call(cmd, stdout=devnull, stderr=subprocess.STDOUT)
			finally:
				devnull.close()
			return (ret == 0)
		except (IOError, OSError):
			return False

	def run(self, information):
		"""Extract the audio of information['filepath'] into a new file,
		update 'filepath' in the dictionary and return it; return None on
		failure (which stops the post-processing chain)."""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
			if filecodec in ['aac', 'mp3', 'vorbis']:
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					more_opts = ['-f', 'adts']
				if filecodec == 'vorbis':
					extension = 'ogg'
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = []
				if self._preferredquality is not None:
					more_opts += ['-ab', self._preferredquality]
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'vorbis': 'libvorbis'}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = []
			if self._preferredquality is not None:
				more_opts += ['-ab', self._preferredquality]
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']
			if self._preferredcodec == 'vorbis':
				extension = 'ogg'

		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		status = self.run_ffmpeg(path, new_path, acodec, more_opts)

		if not status:
			self._downloader.to_stderr(u'WARNING: error running ffmpeg')
			return None

		# Try to update the date time for extracted audio file.
		if information.get('filetime') is not None:
			try:
				os.utime(new_path, (time.time(), information['filetime']))
			except (IOError, OSError):
				# BUG FIX: a bare "except:" here also swallowed
				# KeyboardInterrupt/SystemExit; catch only OS errors.
				self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

		if not self._keepvideo:
			try:
				os.remove(path)
			except (IOError, OSError):
				self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
				return None

		information['filepath'] = new_path
		return information
3433
3434
3435 def updateSelf(downloader, filename):
3436         ''' Update the program file with the latest version from the repository '''
3437         # Note: downloader only used for options
3438         if not os.access(filename, os.W_OK):
3439                 sys.exit('ERROR: no write permissions on %s' % filename)
3440
3441         downloader.to_screen('Updating to latest version...')
3442
3443         try:
3444                 try:
3445                         urlh = urllib.urlopen(UPDATE_URL)
3446                         newcontent = urlh.read()
3447                         
3448                         vmatch = re.search("__version__ = '([^']+)'", newcontent)
3449                         if vmatch is not None and vmatch.group(1) == __version__:
3450                                 downloader.to_screen('youtube-dl is up-to-date (' + __version__ + ')')
3451                                 return
3452                 finally:
3453                         urlh.close()
3454         except (IOError, OSError), err:
3455                 sys.exit('ERROR: unable to download latest version')
3456
3457         try:
3458                 outf = open(filename, 'wb')
3459                 try:
3460                         outf.write(newcontent)
3461                 finally:
3462                         outf.close()
3463         except (IOError, OSError), err:
3464                 sys.exit('ERROR: unable to overwrite current version')
3465
3466         downloader.to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.')
3467
3468 def parseOpts():
3469         # Deferred imports
3470         import getpass
3471         import optparse
3472
3473         def _format_option_string(option):
3474                 ''' ('-o', '--option') -> -o, --format METAVAR'''
3475
3476                 opts = []
3477
3478                 if option._short_opts: opts.append(option._short_opts[0])
3479                 if option._long_opts: opts.append(option._long_opts[0])
3480                 if len(opts) > 1: opts.insert(1, ', ')
3481
3482                 if option.takes_value(): opts.append(' %s' % option.metavar)
3483
3484                 return "".join(opts)
3485
3486         def _find_term_columns():
3487                 columns = os.environ.get('COLUMNS', None)
3488                 if columns:
3489                         return int(columns)
3490
3491                 try:
3492                         sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
3493                         out,err = sp.communicate()
3494                         return int(out.split()[1])
3495                 except:
3496                         pass
3497                 return None
3498
3499         max_width = 80
3500         max_help_position = 80
3501
3502         # No need to wrap help messages if we're on a wide console
3503         columns = _find_term_columns()
3504         if columns: max_width = columns
3505
3506         fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
3507         fmt.format_option_strings = _format_option_string
3508
3509         kw = {
3510                 'version'   : __version__,
3511                 'formatter' : fmt,
3512                 'usage' : '%prog [options] url [url...]',
3513                 'conflict_handler' : 'resolve',
3514         }
3515
3516         parser = optparse.OptionParser(**kw)
3517
3518         # option groups
3519         general        = optparse.OptionGroup(parser, 'General Options')
3520         selection      = optparse.OptionGroup(parser, 'Video Selection')
3521         authentication = optparse.OptionGroup(parser, 'Authentication Options')
3522         video_format   = optparse.OptionGroup(parser, 'Video Format Options')
3523         postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
3524         filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
3525         verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
3526
3527         general.add_option('-h', '--help',
3528                         action='help', help='print this help text and exit')
3529         general.add_option('-v', '--version',
3530                         action='version', help='print program version and exit')
3531         general.add_option('-U', '--update',
3532                         action='store_true', dest='update_self', help='update this program to latest version')
3533         general.add_option('-i', '--ignore-errors',
3534                         action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
3535         general.add_option('-r', '--rate-limit',
3536                         dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
3537         general.add_option('-R', '--retries',
3538                         dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
3539         general.add_option('--dump-user-agent',
3540                         action='store_true', dest='dump_user_agent',
3541                         help='display the current browser identification', default=False)
3542         general.add_option('--list-extractors',
3543                         action='store_true', dest='list_extractors',
3544                         help='List all supported extractors and the URLs they would handle', default=False)
3545
3546         selection.add_option('--playlist-start',
3547                         dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
3548         selection.add_option('--playlist-end',
3549                         dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
3550         selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
3551         selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
3552
3553         authentication.add_option('-u', '--username',
3554                         dest='username', metavar='USERNAME', help='account username')
3555         authentication.add_option('-p', '--password',
3556                         dest='password', metavar='PASSWORD', help='account password')
3557         authentication.add_option('-n', '--netrc',
3558                         action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
3559
3560
3561         video_format.add_option('-f', '--format',
3562                         action='store', dest='format', metavar='FORMAT', help='video format code')
3563         video_format.add_option('--all-formats',
3564                         action='store_const', dest='format', help='download all available video formats', const='all')
3565         video_format.add_option('--max-quality',
3566                         action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
3567
3568
3569         verbosity.add_option('-q', '--quiet',
3570                         action='store_true', dest='quiet', help='activates quiet mode', default=False)
3571         verbosity.add_option('-s', '--simulate',
3572                         action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
3573         verbosity.add_option('--skip-download',
3574                         action='store_true', dest='skip_download', help='do not download the video', default=False)
3575         verbosity.add_option('-g', '--get-url',
3576                         action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
3577         verbosity.add_option('-e', '--get-title',
3578                         action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
3579         verbosity.add_option('--get-thumbnail',
3580                         action='store_true', dest='getthumbnail',
3581                         help='simulate, quiet but print thumbnail URL', default=False)
3582         verbosity.add_option('--get-description',
3583                         action='store_true', dest='getdescription',
3584                         help='simulate, quiet but print video description', default=False)
3585         verbosity.add_option('--get-filename',
3586                         action='store_true', dest='getfilename',
3587                         help='simulate, quiet but print output filename', default=False)
3588         verbosity.add_option('--get-format',
3589                         action='store_true', dest='getformat',
3590                         help='simulate, quiet but print output format', default=False)
3591         verbosity.add_option('--no-progress',
3592                         action='store_true', dest='noprogress', help='do not print progress bar', default=False)
3593         verbosity.add_option('--console-title',
3594                         action='store_true', dest='consoletitle',
3595                         help='display progress in console titlebar', default=False)
3596
3597
3598         filesystem.add_option('-t', '--title',
3599                         action='store_true', dest='usetitle', help='use title in file name', default=False)
3600         filesystem.add_option('-l', '--literal',
3601                         action='store_true', dest='useliteral', help='use literal title in file name', default=False)
3602         filesystem.add_option('-A', '--auto-number',
3603                         action='store_true', dest='autonumber',
3604                         help='number downloaded files starting from 00000', default=False)
3605         filesystem.add_option('-o', '--output',
3606                         dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, and %% for a literal percent')
3607         filesystem.add_option('-a', '--batch-file',
3608                         dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
3609         filesystem.add_option('-w', '--no-overwrites',
3610                         action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
3611         filesystem.add_option('-c', '--continue',
3612                         action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
3613         filesystem.add_option('--no-continue',
3614                         action='store_false', dest='continue_dl',
3615                         help='do not resume partially downloaded files (restart from beginning)')
3616         filesystem.add_option('--cookies',
3617                         dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
3618         filesystem.add_option('--no-part',
3619                         action='store_true', dest='nopart', help='do not use .part files', default=False)
3620         filesystem.add_option('--no-mtime',
3621                         action='store_false', dest='updatetime',
3622                         help='do not use the Last-modified header to set the file modification time', default=True)
3623         filesystem.add_option('--write-description',
3624                         action='store_true', dest='writedescription',
3625                         help='write video description to a .description file', default=False)
3626         filesystem.add_option('--write-info-json',
3627                         action='store_true', dest='writeinfojson',
3628                         help='write video metadata to a .info.json file', default=False)
3629
3630
3631         postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
3632                         help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
3633         postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
3634                         help='"best", "aac", "vorbis" or "mp3"; best by default')
3635         postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
3636                         help='ffmpeg audio bitrate specification, 128k by default')
3637         postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
3638                         help='keeps the video file on disk after the post-processing; the video is erased by default')
3639
3640
3641         parser.add_option_group(general)
3642         parser.add_option_group(selection)
3643         parser.add_option_group(filesystem)
3644         parser.add_option_group(verbosity)
3645         parser.add_option_group(video_format)
3646         parser.add_option_group(authentication)
3647         parser.add_option_group(postproc)
3648
3649         opts, args = parser.parse_args()
3650
3651         return parser, opts, args
3652
def gen_extractors():
	""" Return a list of an instance of every supported extractor.
	The order does matter; the first extractor matched is the one handling the URL.
	"""
	# A few extractors are shared: the search/playlist/user variants delegate
	# actual downloading to the corresponding plain extractor instance.
	youtube = YoutubeIE()
	google = GoogleIE()
	yahoo = YahooIE()

	extractors = [
		YoutubePlaylistIE(youtube),
		YoutubeUserIE(youtube),
		YoutubeSearchIE(youtube),
		youtube,
		MetacafeIE(youtube),
		DailymotionIE(),
		google,
		GoogleSearchIE(google),
		PhotobucketIE(),
		yahoo,
		YahooSearchIE(yahoo),
		DepositFilesIE(),
		FacebookIE(),
		BlipTVIE(),
		VimeoIE(),
		MyVideoIE(),
		ComedyCentralIE(),
		EscapistIE(),
	]
	# The generic extractor is a catch-all and therefore must come last.
	extractors.append(GenericIE())
	return extractors
3682
3683 def main():
3684         parser, opts, args = parseOpts()
3685
3686         # Open appropriate CookieJar
3687         if opts.cookiefile is None:
3688                 jar = cookielib.CookieJar()
3689         else:
3690                 try:
3691                         jar = cookielib.MozillaCookieJar(opts.cookiefile)
3692                         if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3693                                 jar.load()
3694                 except (IOError, OSError), err:
3695                         sys.exit(u'ERROR: unable to open cookie file')
3696
3697         # Dump user agent
3698         if opts.dump_user_agent:
3699                 print std_headers['User-Agent']
3700                 sys.exit(0)
3701
3702         # Batch file verification
3703         batchurls = []
3704         if opts.batchfile is not None:
3705                 try:
3706                         if opts.batchfile == '-':
3707                                 batchfd = sys.stdin
3708                         else:
3709                                 batchfd = open(opts.batchfile, 'r')
3710                         batchurls = batchfd.readlines()
3711                         batchurls = [x.strip() for x in batchurls]
3712                         batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3713                 except IOError:
3714                         sys.exit(u'ERROR: batch file could not be read')
3715         all_urls = batchurls + args
3716
3717         # General configuration
3718         cookie_processor = urllib2.HTTPCookieProcessor(jar)
3719         opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
3720         urllib2.install_opener(opener)
3721         socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3722
3723         extractors = gen_extractors()
3724
3725         if opts.list_extractors:
3726                 for ie in extractors:
3727                         print(ie.IE_NAME)
3728                         matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
3729                         all_urls = filter(lambda url: url not in matchedUrls, all_urls)
3730                         for mu in matchedUrls:
3731                                 print(u'  ' + mu)
3732                 sys.exit(0)
3733
3734         # Conflicting, missing and erroneous options
3735         if opts.usenetrc and (opts.username is not None or opts.password is not None):
3736                 parser.error(u'using .netrc conflicts with giving username/password')
3737         if opts.password is not None and opts.username is None:
3738                 parser.error(u'account username missing')
3739         if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3740                 parser.error(u'using output template conflicts with using title, literal title or auto number')
3741         if opts.usetitle and opts.useliteral:
3742                 parser.error(u'using title conflicts with using literal title')
3743         if opts.username is not None and opts.password is None:
3744                 opts.password = getpass.getpass(u'Type account password and press return:')
3745         if opts.ratelimit is not None:
3746                 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3747                 if numeric_limit is None:
3748                         parser.error(u'invalid rate limit specified')
3749                 opts.ratelimit = numeric_limit
3750         if opts.retries is not None:
3751                 try:
3752                         opts.retries = long(opts.retries)
3753                 except (TypeError, ValueError), err:
3754                         parser.error(u'invalid retry count specified')
3755         try:
3756                 opts.playliststart = int(opts.playliststart)
3757                 if opts.playliststart <= 0:
3758                         raise ValueError(u'Playlist start must be positive')
3759         except (TypeError, ValueError), err:
3760                 parser.error(u'invalid playlist start number specified')
3761         try:
3762                 opts.playlistend = int(opts.playlistend)
3763                 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3764                         raise ValueError(u'Playlist end must be greater than playlist start')
3765         except (TypeError, ValueError), err:
3766                 parser.error(u'invalid playlist end number specified')
3767         if opts.extractaudio:
3768                 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis']:
3769                         parser.error(u'invalid audio format specified')
3770
3771         # File downloader
3772         fd = FileDownloader({
3773                 'usenetrc': opts.usenetrc,
3774                 'username': opts.username,
3775                 'password': opts.password,
3776                 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
3777                 'forceurl': opts.geturl,
3778                 'forcetitle': opts.gettitle,
3779                 'forcethumbnail': opts.getthumbnail,
3780                 'forcedescription': opts.getdescription,
3781                 'forcefilename': opts.getfilename,
3782                 'forceformat': opts.getformat,
3783                 'simulate': opts.simulate,
3784                 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
3785                 'format': opts.format,
3786                 'format_limit': opts.format_limit,
3787                 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3788                         or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3789                         or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3790                         or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3791                         or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3792                         or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3793                         or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3794                         or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3795                         or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3796                         or u'%(id)s.%(ext)s'),
3797                 'ignoreerrors': opts.ignoreerrors,
3798                 'ratelimit': opts.ratelimit,
3799                 'nooverwrites': opts.nooverwrites,
3800                 'retries': opts.retries,
3801                 'continuedl': opts.continue_dl,
3802                 'noprogress': opts.noprogress,
3803                 'playliststart': opts.playliststart,
3804                 'playlistend': opts.playlistend,
3805                 'logtostderr': opts.outtmpl == '-',
3806                 'consoletitle': opts.consoletitle,
3807                 'nopart': opts.nopart,
3808                 'updatetime': opts.updatetime,
3809                 'writedescription': opts.writedescription,
3810                 'writeinfojson': opts.writeinfojson,
3811                 'matchtitle': opts.matchtitle,
3812                 'rejecttitle': opts.rejecttitle,
3813                 })
3814         for extractor in extractors:
3815                 fd.add_info_extractor(extractor)
3816
3817         # PostProcessors
3818         if opts.extractaudio:
3819                 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
3820
3821         # Update version
3822         if opts.update_self:
3823                 updateSelf(fd, sys.argv[0])
3824
3825         # Maybe do nothing
3826         if len(all_urls) < 1:
3827                 if not opts.update_self:
3828                         parser.error(u'you must provide at least one URL')
3829                 else:
3830                         sys.exit()
3831         retcode = fd.download(all_urls)
3832
3833         # Dump cookie jar if requested
3834         if opts.cookiefile is not None:
3835                 try:
3836                         jar.save()
3837                 except (IOError, OSError), err:
3838                         sys.exit(u'ERROR: unable to save cookie jar')
3839
3840         sys.exit(retcode)
3841
3842
# Script entry point: run main() and translate known failure modes into
# process exit codes / messages.
if __name__ == '__main__':
	try:
		main()
	except DownloadError:
		# main()/FileDownloader already reported the error; just signal failure.
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		# Ctrl-C: leave a clean message instead of a traceback.
		sys.exit(u'\nERROR: Interrupted by user')
3852
3853 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: