youtube-dl

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
# People credited as authors of this script.
__author__  = (
        'Ricardo Garcia Gonzalez',
        'Danny Colligan',
        'Benjamin Johnson',
        'Vasyl\' Vavrychuk',
        'Witold Baryluk',
        'Paweł Paprota',
        'Gergely Imreh',
        'Rogério Brito',
        'Philipp Hagemeister',
        )

__license__ = 'Public Domain'
__version__ = '2011.08.28-phihag'

# URL of the master copy of this script. NOTE(review): presumably fetched by
# the self-update feature; the code that uses it is not in this section.
UPDATE_URL = 'https://raw.github.com/phihag/youtube-dl/master/youtube-dl'
  20
  21 import cookielib
  22 import datetime
  23 import gzip
  24 import htmlentitydefs
  25 import httplib
  26 import locale
  27 import math
  28 import netrc
  29 import os
  30 import os.path
  31 import re
  32 import socket
  33 import string
  34 import subprocess
  35 import sys
  36 import time
  37 import urllib
  38 import urllib2
  39 import warnings
  40 import zlib
  41
  42 if os.name == 'nt':
  43         import ctypes
  44
  45 try:
  46         import email.utils
  47 except ImportError: # Python 2.4
  48         import email.Utils
  49 try:
  50         import cStringIO as StringIO
  51 except ImportError:
  52         import StringIO
  53
  54 # parse_qs was moved from the cgi module to the urlparse module recently.
  55 try:
  56         from urlparse import parse_qs
  57 except ImportError:
  58         from cgi import parse_qs
  59
  60 try:
  61         import lxml.etree
  62 except ImportError:
  63         pass # Handled below
  64
# Default HTTP headers. YoutubeDLHandler.http_request() sets every one of
# them on each request it processes, replacing any same-named header.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-us,en;q=0.5',
}

# Unicode string made of the ASCII letters followed by the ASCII digits.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
  74
# Use the standard json module when it exists; otherwise define a minimal
# replacement class that offers only json.loads().
try:
        import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
        import re
        class json(object):
                # Stand-in for the json module: only loads() is implemented.
                @staticmethod
                def loads(s):
                        # Decodes the UTF-8 byte string s and parses it as a JSON
                        # document, returning the corresponding Python value.
                        # Problems detected by the parser are reported as ValueError.
                        #
                        # Every parseXxx(i) helper below takes the index where its
                        # value starts and returns the tuple (index_after_value, value).
                        s = s.decode('UTF-8')
                        def raiseError(msg, i):
                                # Always raises; the message includes the position and the
                                # remaining input.
                                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
                        def skipSpace(i, expectMore=True):
                                # Advance past whitespace. With expectMore, reaching the end
                                # of the input is an error.
                                while i < len(s) and s[i] in ' \t\r\n':
                                        i += 1
                                if expectMore:
                                        if i >= len(s):
                                                raiseError('Premature end', i)
                                return i
                        def decodeEscape(match):
                                # re.sub() callback: translate one backslash escape (without
                                # the leading backslash) into the character it stands for.
                                esc = match.group(1)
                                _STATIC = {
                                        '"': '"',
                                        '\\': '\\',
                                        '/': '/',
                                        'b': unichr(0x8),
                                        'f': unichr(0xc),
                                        'n': '\n',
                                        'r': '\r',
                                        't': '\t',
                                }
                                if esc in _STATIC:
                                        return _STATIC[esc]
                                if esc[0] == 'u':
                                        # Plain \uXXXX escape
                                        if len(esc) == 1+4:
                                                return unichr(int(esc[1:5], 16))
                                        # Surrogate pair \uXXXX\uXXXX: combine both halves into
                                        # a single code point above U+FFFF.
                                        if len(esc) == 5+6 and esc[5:7] == '\\u':
                                                hi = int(esc[1:5], 16)
                                                low = int(esc[7:11], 16)
                                                return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                                raise ValueError('Unknown escape ' + str(esc))
                        def parseString(i):
                                # i is the index of the opening quote.
                                i += 1
                                e = i
                                # Find the closing quote: a '"' preceded by an even number
                                # of backslashes (an odd number means the quote is escaped).
                                while True:
                                        e = s.index('"', e)
                                        bslashes = 0
                                        while s[e-bslashes-1] == '\\':
                                                bslashes += 1
                                        if bslashes % 2 == 1:
                                                e += 1
                                                continue
                                        break
                                # Alternatives, in order: surrogate pair, single \uXXXX,
                                # any other escaped character, or a backslash at the end.
                                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                                stri = rexp.sub(decodeEscape, s[i:e])
                                return (e+1,stri)
                        def parseObj(i):
                                # i is the index of the opening curly brace.
                                i += 1
                                res = {}
                                i = skipSpace(i)
                                if s[i] == '}': # Empty dictionary
                                        return (i+1,res)
                                while True:
                                        if s[i] != '"':
                                                raiseError('Expected a string object key', i)
                                        i,key = parseString(i)
                                        i = skipSpace(i)
                                        if i >= len(s) or s[i] != ':':
                                                raiseError('Expected a colon', i)
                                        i,val = parse(i+1)
                                        res[key] = val
                                        i = skipSpace(i)
                                        if s[i] == '}':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected comma or closing curly brace', i)
                                        i = skipSpace(i+1)
                        def parseArray(i):
                                # i is the index of the opening bracket.
                                res = []
                                i = skipSpace(i+1)
                                if s[i] == ']': # Empty array
                                        return (i+1,res)
                                while True:
                                        i,val = parse(i)
                                        res.append(val)
                                        i = skipSpace(i) # Raise exception if premature end
                                        if s[i] == ']':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected a comma or closing bracket', i)
                                        i = skipSpace(i+1)
                        def parseDiscrete(i):
                                # Literals true, false and null.
                                for k,v in {'true': True, 'false': False, 'null': None}.items():
                                        if s.startswith(k, i):
                                                return (i+len(k), v)
                                raiseError('Not a boolean (or null)', i)
                        def parseNumber(i):
                                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                                if mobj is None:
                                        raiseError('Not a number', i)
                                nums = mobj.group(1)
                                # A fraction or an exponent makes the result a float,
                                # anything else is an int.
                                if '.' in nums or 'e' in nums or 'E' in nums:
                                        return (i+len(nums), float(nums))
                                return (i+len(nums), int(nums))
                        # Dispatch on the first character of a value; any character not
                        # listed here is handed to parseNumber.
                        CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
                        def parse(i):
                                # Parse one value of any type, skipping surrounding whitespace.
                                i = skipSpace(i)
                                i,res = CHARMAP.get(s[i], parseNumber)(i)
                                i = skipSpace(i, False)
                                return (i,res)
                        i,res = parse(0)
                        # Anything left after the top-level value is an error.
                        if i < len(s):
                                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
                        return res
 187
 188 def preferredencoding():
 189         """Get preferred encoding.
 190
 191         Returns the best encoding scheme for the system, based on
 192         locale.getpreferredencoding() and some further tweaks.
 193         """
 194         def yield_preferredencoding():
 195                 try:
 196                         pref = locale.getpreferredencoding()
 197                         u'TEST'.encode(pref)
 198                 except:
 199                         pref = 'UTF-8'
 200                 while True:
 201                         yield pref
 202         return yield_preferredencoding().next()
 203
 204
 205 def htmlentity_transform(matchobj):
 206         """Transforms an HTML entity to a Unicode character.
 207
 208         This function receives a match object and is intended to be used with
 209         the re.sub() function.
 210         """
 211         entity = matchobj.group(1)
 212
 213         # Known non-numeric HTML entity
 214         if entity in htmlentitydefs.name2codepoint:
 215                 return unichr(htmlentitydefs.name2codepoint[entity])
 216
 217         # Unicode character
 218         mobj = re.match(ur'(?u)#(x?\d+)', entity)
 219         if mobj is not None:
 220                 numstr = mobj.group(1)
 221                 if numstr.startswith(u'x'):
 222                         base = 16
 223                         numstr = u'0%s' % numstr
 224                 else:
 225                         base = 10
 226                 return unichr(long(numstr, base))
 227
 228         # Unknown entity in name, return its literal representation
 229         return (u'&%s;' % entity)
 230
 231
 232 def sanitize_title(utitle):
 233         """Sanitizes a video title so it could be used as part of a filename."""
 234         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
 235         return utitle.replace(unicode(os.sep), u'%')
 236
 237
 238 def sanitize_open(filename, open_mode):
 239         """Try to open the given filename, and slightly tweak it if this fails.
 240
 241         Attempts to open the given filename. If this fails, it tries to change
 242         the filename slightly, step by step, until it's either able to open it
 243         or it fails and raises a final exception, like the standard open()
 244         function.
 245
 246         It returns the tuple (stream, definitive_file_name).
 247         """
 248         try:
 249                 if filename == u'-':
 250                         if sys.platform == 'win32':
 251                                 import msvcrt
 252                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 253                         return (sys.stdout, filename)
 254                 stream = open(filename, open_mode)
 255                 return (stream, filename)
 256         except (IOError, OSError), err:
 257                 # In case of error, try to remove win32 forbidden chars
 258                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
 259
 260                 # An exception here should be caught in the caller
 261                 stream = open(filename, open_mode)
 262                 return (stream, filename)
 263
 264
 265 def timeconvert(timestr):
 266         """Convert RFC 2822 defined time string into system timestamp"""
 267         timestamp = None
 268         timetuple = email.utils.parsedate_tz(timestr)
 269         if timetuple is not None:
 270                 timestamp = email.utils.mktime_tz(timetuple)
 271         return timestamp
 272
 273
 274 class DownloadError(Exception):
 275         """Download Error exception.
 276
 277         This exception may be thrown by FileDownloader objects if they are not
 278         configured to continue on errors. They will contain the appropriate
 279         error message.
 280         """
 281         pass
 282
 283
 284 class SameFileError(Exception):
 285         """Same File exception.
 286
 287         This exception will be thrown by FileDownloader objects if they detect
 288         multiple files would have to be downloaded to the same file on disk.
 289         """
 290         pass
 291
 292
 293 class PostProcessingError(Exception):
 294         """Post Processing exception.
 295
 296         This exception may be raised by PostProcessor's .run() method to
 297         indicate an error in the postprocessing task.
 298         """
 299         pass
 300
 301
 302 class UnavailableVideoError(Exception):
 303         """Unavailable Format exception.
 304
 305         This exception will be thrown when a video is requested
 306         in a format that is not available for that video.
 307         """
 308         pass
 309
 310
 311 class ContentTooShortError(Exception):
 312         """Content Too Short exception.
 313
 314         This exception may be raised by FileDownloader objects when a file they
 315         download is too small for what the server announced first, indicating
 316         the connection was probably interrupted.
 317         """
 318         # Both in bytes
 319         downloaded = None
 320         expected = None
 321
 322         def __init__(self, downloaded, expected):
 323                 self.downloaded = downloaded
 324                 self.expected = expected
 325
 326
 327 class YoutubeDLHandler(urllib2.HTTPHandler):
 328         """Handler for HTTP requests and responses.
 329
 330         This class, when installed with an OpenerDirector, automatically adds
 331         the standard headers to every HTTP request and handles gzipped and
 332         deflated responses from web servers. If compression is to be avoided in
 333         a particular request, the original request in the program code only has
 334         to include the HTTP header "Youtubedl-No-Compression", which will be
 335         removed before making the real request.
 336
 337         Part of this code was copied from:
 338
 339         http://techknack.net/python-urllib2-handlers/
 340
 341         Andrew Rowls, the author of that code, agreed to release it to the
 342         public domain.
 343         """
 344
 345         @staticmethod
 346         def deflate(data):
 347                 try:
 348                         return zlib.decompress(data, -zlib.MAX_WBITS)
 349                 except zlib.error:
 350                         return zlib.decompress(data)
 351
 352         @staticmethod
 353         def addinfourl_wrapper(stream, headers, url, code):
 354                 if hasattr(urllib2.addinfourl, 'getcode'):
 355                         return urllib2.addinfourl(stream, headers, url, code)
 356                 ret = urllib2.addinfourl(stream, headers, url)
 357                 ret.code = code
 358                 return ret
 359
 360         def http_request(self, req):
 361                 for h in std_headers:
 362                         if h in req.headers:
 363                                 del req.headers[h]
 364                         req.add_header(h, std_headers[h])
 365                 if 'Youtubedl-no-compression' in req.headers:
 366                         if 'Accept-encoding' in req.headers:
 367                                 del req.headers['Accept-encoding']
 368                         del req.headers['Youtubedl-no-compression']
 369                 return req
 370
 371         def http_response(self, req, resp):
 372                 old_resp = resp
 373                 # gzip
 374                 if resp.headers.get('Content-encoding', '') == 'gzip':
 375                         gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
 376                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 377                         resp.msg = old_resp.msg
 378                 # deflate
 379                 if resp.headers.get('Content-encoding', '') == 'deflate':
 380                         gz = StringIO.StringIO(self.deflate(resp.read()))
 381                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 382                         resp.msg = old_resp.msg
 383                 return resp
 384
 385
class FileDownloader(object):
        """File Downloader class.

        File downloader objects are the ones responsible for downloading the
        actual video file and writing it to disk if the user has requested
        it, among some other tasks. In most cases there should be one per
        program. Given a video URL, the downloader doesn't know how to
        extract all the needed information (that is the task of the
        InfoExtractors), so it has to pass the URL to one of them.

        For this, file downloader objects have a method that allows
        InfoExtractors to be registered in a given order. When it is passed
        a URL, the file downloader hands it to the first InfoExtractor it
        finds that reports being able to handle it. The InfoExtractor extracts
        all the information about the video or videos the URL refers to, and
        asks the FileDownloader to process the video information, possibly
        downloading the video.

        File downloaders accept a lot of parameters. In order not to saturate
        the object constructor with arguments, it receives a dictionary of
        options instead. These options are available through the params
        attribute for the InfoExtractors to use. The FileDownloader also
        registers itself as the downloader in charge for the InfoExtractors
        that are added to it, so this is a "mutual registration".

        Available options:

        username:         Username for authentication purposes.
        password:         Password for authentication purposes.
        usenetrc:         Use netrc for authentication instead.
        quiet:            Do not print messages to stdout.
        forceurl:         Force printing final URL.
        forcetitle:       Force printing title.
        forcethumbnail:   Force printing thumbnail URL.
        forcedescription: Force printing description.
        forcefilename:    Force printing final filename.
        simulate:         Do not download the video files.
        format:           Video format code.
        format_limit:     Highest quality format to try.
        outtmpl:          Template for output names.
        ignoreerrors:     Do not stop on download errors.
        ratelimit:        Download speed limit, in bytes/sec.
        nooverwrites:     Prevent overwriting files.
        retries:          Number of times to retry for HTTP error 5xx
        continuedl:       Try to continue downloads if possible.
        noprogress:       Do not print the progress bar.
        playliststart:    Playlist item to start at.
        playlistend:      Playlist item to end at.
        logtostderr:      Log messages to stderr instead of stdout.
        consoletitle:     Display progress in console window's titlebar.
        nopart:           Do not use temporary .part files.
        updatetime:       Use the Last-modified header to set output file timestamps.
        writedescription: Write the video description to a .description file
        writeinfojson:    Write the video description to a .info.json file
        """

        # Class-level defaults. __init__ rebinds every one of them on the
        # instance, so the two lists here are not shared between instances.
        params = None
        _ies = []
        _pps = []
        _download_retcode = None
        _num_downloads = None
        _screen_file = None
 448
 449         def __init__(self, params):
 450                 """Create a FileDownloader object with the given options."""
 451                 self._ies = []
 452                 self._pps = []
 453                 self._download_retcode = 0
 454                 self._num_downloads = 0
 455                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
 456                 self.params = params
 457
 458         @staticmethod
 459         def format_bytes(bytes):
 460                 if bytes is None:
 461                         return 'N/A'
 462                 if type(bytes) is str:
 463                         bytes = float(bytes)
 464                 if bytes == 0.0:
 465                         exponent = 0
 466                 else:
 467                         exponent = long(math.log(bytes, 1024.0))
 468                 suffix = 'bkMGTPEZY'[exponent]
 469                 converted = float(bytes) / float(1024 ** exponent)
 470                 return '%.2f%s' % (converted, suffix)
 471
 472         @staticmethod
 473         def calc_percent(byte_counter, data_len):
 474                 if data_len is None:
 475                         return '---.-%'
 476                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
 477
 478         @staticmethod
 479         def calc_eta(start, now, total, current):
 480                 if total is None:
 481                         return '--:--'
 482                 dif = now - start
 483                 if current == 0 or dif < 0.001: # One millisecond
 484                         return '--:--'
 485                 rate = float(current) / dif
 486                 eta = long((float(total) - float(current)) / rate)
 487                 (eta_mins, eta_secs) = divmod(eta, 60)
 488                 if eta_mins > 99:
 489                         return '--:--'
 490                 return '%02d:%02d' % (eta_mins, eta_secs)
 491
 492         @staticmethod
 493         def calc_speed(start, now, bytes):
 494                 dif = now - start
 495                 if bytes == 0 or dif < 0.001: # One millisecond
 496                         return '%10s' % '---b/s'
 497                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
 498
 499         @staticmethod
 500         def best_block_size(elapsed_time, bytes):
 501                 new_min = max(bytes / 2.0, 1.0)
 502                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
 503                 if elapsed_time < 0.001:
 504                         return long(new_max)
 505                 rate = bytes / elapsed_time
 506                 if rate > new_max:
 507                         return long(new_max)
 508                 if rate < new_min:
 509                         return long(new_min)
 510                 return long(rate)
 511
 512         @staticmethod
 513         def parse_bytes(bytestr):
 514                 """Parse a string indicating a byte quantity into a long integer."""
 515                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
 516                 if matchobj is None:
 517                         return None
 518                 number = float(matchobj.group(1))
 519                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
 520                 return long(round(number * multiplier))
 521
        def add_info_extractor(self, ie):
                """Add an InfoExtractor object to the end of the list.

                The extractor is also told that this object is its downloader,
                through its set_downloader() method.
                """
                self._ies.append(ie)
                ie.set_downloader(self)
 526
        def add_post_processor(self, pp):
                """Add a PostProcessor object to the end of the chain.

                The postprocessor is also told that this object is its
                downloader, through its set_downloader() method.
                """
                self._pps.append(pp)
                pp.set_downloader(self)
 531
        def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
                """Print message to the screen file if not in quiet mode.

                The screen file is self._screen_file, which __init__ sets to
                stdout or stderr. The message is encoded with
                preferredencoding() before being written.

                skip_eol:               do not terminate the message with a newline.
                ignore_encoding_errors: swallow a UnicodeEncodeError instead of
                                        letting it propagate.
                """
                try:
                        if not self.params.get('quiet', False):
                                # Indexing with the boolean selects u'' when skip_eol is true
                                terminator = [u'\n', u''][skip_eol]
                                # The trailing comma stops print from adding its own
                                # newline; the terminator above supplies it when wanted.
                                print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
                        # The flush is performed in quiet mode too
                        self._screen_file.flush()
                except (UnicodeEncodeError), err:
                        if not ignore_encoding_errors:
                                raise
 542
        def to_stderr(self, message):
                """Print message to stderr.

                The message is encoded with preferredencoding() first. Unlike
                to_screen(), the 'quiet' option is not consulted.
                """
                print >>sys.stderr, message.encode(preferredencoding())
 546
 547         def to_cons_title(self, message):
 548                 """Set console/terminal window title to message."""
 549                 if not self.params.get('consoletitle', False):
 550                         return
 551                 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
 552                         # c_wchar_p() might not be necessary if `message` is
 553                         # already of type unicode()
 554                         ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
 555                 elif 'TERM' in os.environ:
 556                         sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
 557
 558         def fixed_template(self):
 559                 """Checks if the output template is fixed."""
 560                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
 561
 562         def trouble(self, message=None):
 563                 """Determine action to take when a download problem appears.
 564
 565                 Depending on if the downloader has been configured to ignore
 566                 download errors or not, this method may throw an exception or
 567                 not when errors are found, after printing the message.
 568                 """
 569                 if message is not None:
 570                         self.to_stderr(message)
 571                 if not self.params.get('ignoreerrors', False):
 572                         raise DownloadError(message)
 573                 self._download_retcode = 1
 574
 575         def slow_down(self, start_time, byte_counter):
 576                 """Sleep if the download speed is over the rate limit."""
 577                 rate_limit = self.params.get('ratelimit', None)
 578                 if rate_limit is None or byte_counter == 0:
 579                         return
 580                 now = time.time()
 581                 elapsed = now - start_time
 582                 if elapsed <= 0.0:
 583                         return
 584                 speed = float(byte_counter) / elapsed
 585                 if speed > rate_limit:
 586                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
 587
 588         def temp_name(self, filename):
 589                 """Returns a temporary filename for the given filename."""
 590                 if self.params.get('nopart', False) or filename == u'-' or \
 591                                 (os.path.exists(filename) and not os.path.isfile(filename)):
 592                         return filename
 593                 return filename + u'.part'
 594
 595         def undo_temp_name(self, filename):
 596                 if filename.endswith(u'.part'):
 597                         return filename[:-len(u'.part')]
 598                 return filename
 599
 600         def try_rename(self, old_filename, new_filename):
 601                 try:
 602                         if old_filename == new_filename:
 603                                 return
 604                         os.rename(old_filename, new_filename)
 605                 except (IOError, OSError), err:
 606                         self.trouble(u'ERROR: unable to rename file')
 607
 608         def try_utime(self, filename, last_modified_hdr):
 609                 """Try to set the last-modified time of the given file."""
 610                 if last_modified_hdr is None:
 611                         return
 612                 if not os.path.isfile(filename):
 613                         return
 614                 timestr = last_modified_hdr
 615                 if timestr is None:
 616                         return
 617                 filetime = timeconvert(timestr)
 618                 if filetime is None:
 619                         return
 620                 try:
 621                         os.utime(filename, (time.time(), filetime))
 622                 except:
 623                         pass
 624
 625         def report_writedescription(self, descfn):
 626                 """ Report that the description file is being written """
 627                 self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
 628
 629         def report_writeinfojson(self, infofn):
 630                 """ Report that the metadata file has been written """
 631                 self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
 632
 633         def report_destination(self, filename):
 634                 """Report destination filename."""
 635                 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
 636
 637         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
 638                 """Report download progress."""
 639                 if self.params.get('noprogress', False):
 640                         return
 641                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
 642                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
 643                 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
 644                                 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
 645
 646         def report_resuming_byte(self, resume_len):
 647                 """Report attempt to resume at given byte."""
 648                 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
 649
 650         def report_retry(self, count, retries):
 651                 """Report retry in case of HTTP error 5xx"""
 652                 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
 653
 654         def report_file_already_downloaded(self, file_name):
 655                 """Report file has already been fully downloaded."""
 656                 try:
 657                         self.to_screen(u'[download] %s has already been downloaded' % file_name)
 658                 except (UnicodeEncodeError), err:
 659                         self.to_screen(u'[download] The file has already been downloaded')
 660
 661         def report_unable_to_resume(self):
 662                 """Report it was impossible to resume download."""
 663                 self.to_screen(u'[download] Unable to resume')
 664
 665         def report_finish(self):
 666                 """Report download finished."""
 667                 if self.params.get('noprogress', False):
 668                         self.to_screen(u'[download] Download completed')
 669                 else:
 670                         self.to_screen(u'')
 671
 672         def increment_downloads(self):
 673                 """Increment the ordinal that assigns a number to each file."""
 674                 self._num_downloads += 1
 675
 676         def prepare_filename(self, info_dict):
 677                 """Generate the output filename."""
 678                 try:
 679                         template_dict = dict(info_dict)
 680                         template_dict['epoch'] = unicode(long(time.time()))
 681                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
 682                         filename = self.params['outtmpl'] % template_dict
 683                         return filename
 684                 except (ValueError, KeyError), err:
 685                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
 686                         return None
 687
 688         def process_info(self, info_dict):
 689                 """Process a single dictionary returned by an InfoExtractor."""
 690                 filename = self.prepare_filename(info_dict)
 691                 # Do nothing else if in simulate mode
 692                 if self.params.get('simulate', False):
 693                         # Forced printings
 694                         if self.params.get('forcetitle', False):
 695                                 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
 696                         if self.params.get('forceurl', False):
 697                                 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
 698                         if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
 699                                 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
 700                         if self.params.get('forcedescription', False) and 'description' in info_dict:
 701                                 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
 702                         if self.params.get('forcefilename', False) and filename is not None:
 703                                 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
 704
 705                         return
 706
 707                 if filename is None:
 708                         return
 709                 if self.params.get('nooverwrites', False) and os.path.exists(filename):
 710                         self.to_stderr(u'WARNING: file exists and will be skipped')
 711                         return
 712
 713                 try:
 714                         os.makedirs(os.path.dirname(filename))
 715                 except (OSError, IOError), err:
 716                         self.trouble(u'ERROR: unable to create directories: %s' % str(err))
 717                         return
 718
 719                 if self.params.get('writedescription', False):
 720                         try:
 721                                 descfn = filename + '.description'
 722                                 self.report_writedescription(descfn)
 723                                 descfile = open(descfn, 'wb')
 724                                 try:
 725                                         descfile.write(info_dict['description'].encode('utf-8'))
 726                                 finally:
 727                                         descfile.close()
 728                         except (OSError, IOError):
 729                                 self.trouble(u'ERROR: Cannot write description file: %s' % str(descfn))
 730                                 return
 731
 732                 if self.params.get('writeinfojson', False):
 733                         infofn = filename + '.info.json'
 734                         self.report_writeinfojson(infofn)
 735                         try:
 736                                 json.dump
 737                         except (NameError,AttributeError):
 738                                 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
 739                                 return
 740                         try:
 741                                 infof = open(infofn, 'wb')
 742                                 try:
 743                                         json.dump(info_dict, infof)
 744                                 finally:
 745                                         infof.close()
 746                         except (OSError, IOError):
 747                                 self.trouble(u'ERROR: Cannot write metadata to JSON file: %s' % str(infofn))
 748                                 return
 749
 750                 try:
 751                         success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
 752                 except (OSError, IOError), err:
 753                         raise UnavailableVideoError
 754                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 755                         self.trouble(u'ERROR: unable to download video data: %s' % str(err))
 756                         return
 757                 except (ContentTooShortError, ), err:
 758                         self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
 759                         return
 760
 761                 if success:
 762                         try:
 763                                 self.post_process(filename, info_dict)
 764                         except (PostProcessingError), err:
 765                                 self.trouble(u'ERROR: postprocessing: %s' % str(err))
 766                                 return
 767
 768         def download(self, url_list):
 769                 """Download a given list of URLs."""
 770                 if len(url_list) > 1 and self.fixed_template():
 771                         raise SameFileError(self.params['outtmpl'])
 772
 773                 for url in url_list:
 774                         suitable_found = False
 775                         for ie in self._ies:
 776                                 # Go to next InfoExtractor if not suitable
 777                                 if not ie.suitable(url):
 778                                         continue
 779
 780                                 # Suitable InfoExtractor found
 781                                 suitable_found = True
 782
 783                                 # Extract information from URL and process it
 784                                 ie.extract(url)
 785
 786                                 # Suitable InfoExtractor had been found; go to next URL
 787                                 break
 788
 789                         if not suitable_found:
 790                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
 791
 792                 return self._download_retcode
 793
 794         def post_process(self, filename, ie_info):
 795                 """Run the postprocessing chain on the given file."""
 796                 info = dict(ie_info)
 797                 info['filepath'] = filename
 798                 for pp in self._pps:
 799                         info = pp.run(info)
 800                         if info is None:
 801                                 break
 802
 803         def _download_with_rtmpdump(self, filename, url, player_url):
 804                 self.report_destination(filename)
 805                 tmpfilename = self.temp_name(filename)
 806
 807                 # Check for rtmpdump first
 808                 try:
 809                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
 810                 except (OSError, IOError):
 811                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
 812                         return False
 813
 814                 # Download using rtmpdump. rtmpdump returns exit code 2 when
 815                 # the connection was interrumpted and resuming appears to be
 816                 # possible. This is part of rtmpdump's normal usage, AFAIK.
 817                 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
 818                 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
 819                 while retval == 2 or retval == 1:
 820                         prevsize = os.path.getsize(tmpfilename)
 821                         self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
 822                         time.sleep(5.0) # This seems to be needed
 823                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
 824                         cursize = os.path.getsize(tmpfilename)
 825                         if prevsize == cursize and retval == 1:
 826                                 break
 827                 if retval == 0:
 828                         self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
 829                         self.try_rename(tmpfilename, filename)
 830                         return True
 831                 else:
 832                         self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
 833                         return False
 834
 835         def _do_download(self, filename, url, player_url):
 836                 # Check file already present
 837                 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
 838                         self.report_file_already_downloaded(filename)
 839                         return True
 840
 841                 # Attempt to download using rtmpdump
 842                 if url.startswith('rtmp'):
 843                         return self._download_with_rtmpdump(filename, url, player_url)
 844
 845                 tmpfilename = self.temp_name(filename)
 846                 stream = None
 847                 open_mode = 'wb'
 848
 849                 # Do not include the Accept-Encoding header
 850                 headers = {'Youtubedl-no-compression': 'True'}
 851                 basic_request = urllib2.Request(url, None, headers)
 852                 request = urllib2.Request(url, None, headers)
 853
 854                 # Establish possible resume length
 855                 if os.path.isfile(tmpfilename):
 856                         resume_len = os.path.getsize(tmpfilename)
 857                 else:
 858                         resume_len = 0
 859
 860                 # Request parameters in case of being able to resume
 861                 if self.params.get('continuedl', False) and resume_len != 0:
 862                         self.report_resuming_byte(resume_len)
 863                         request.add_header('Range', 'bytes=%d-' % resume_len)
 864                         open_mode = 'ab'
 865
 866                 count = 0
 867                 retries = self.params.get('retries', 0)
 868                 while count <= retries:
 869                         # Establish connection
 870                         try:
 871                                 data = urllib2.urlopen(request)
 872                                 break
 873                         except (urllib2.HTTPError, ), err:
 874                                 if (err.code < 500 or err.code >= 600) and err.code != 416:
 875                                         # Unexpected HTTP error
 876                                         raise
 877                                 elif err.code == 416:
 878                                         # Unable to resume (requested range not satisfiable)
 879                                         try:
 880                                                 # Open the connection again without the range header
 881                                                 data = urllib2.urlopen(basic_request)
 882                                                 content_length = data.info()['Content-Length']
 883                                         except (urllib2.HTTPError, ), err:
 884                                                 if err.code < 500 or err.code >= 600:
 885                                                         raise
 886                                         else:
 887                                                 # Examine the reported length
 888                                                 if (content_length is not None and
 889                                                                 (resume_len - 100 < long(content_length) < resume_len + 100)):
 890                                                         # The file had already been fully downloaded.
 891                                                         # Explanation to the above condition: in issue #175 it was revealed that
 892                                                         # YouTube sometimes adds or removes a few bytes from the end of the file,
 893                                                         # changing the file size slightly and causing problems for some users. So
 894                                                         # I decided to implement a suggested change and consider the file
 895                                                         # completely downloaded if the file size differs less than 100 bytes from
 896                                                         # the one in the hard drive.
 897                                                         self.report_file_already_downloaded(filename)
 898                                                         self.try_rename(tmpfilename, filename)
 899                                                         return True
 900                                                 else:
 901                                                         # The length does not match, we start the download over
 902                                                         self.report_unable_to_resume()
 903                                                         open_mode = 'wb'
 904                                                         break
 905                         # Retry
 906                         count += 1
 907                         if count <= retries:
 908                                 self.report_retry(count, retries)
 909
 910                 if count > retries:
 911                         self.trouble(u'ERROR: giving up after %s retries' % retries)
 912                         return False
 913
 914                 data_len = data.info().get('Content-length', None)
 915                 if data_len is not None:
 916                         data_len = long(data_len) + resume_len
 917                 data_len_str = self.format_bytes(data_len)
 918                 byte_counter = 0 + resume_len
 919                 block_size = 1024
 920                 start = time.time()
 921                 while True:
 922                         # Download and write
 923                         before = time.time()
 924                         data_block = data.read(block_size)
 925                         after = time.time()
 926                         if len(data_block) == 0:
 927                                 break
 928                         byte_counter += len(data_block)
 929
 930                         # Open file just in time
 931                         if stream is None:
 932                                 try:
 933                                         (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
 934                                         assert stream is not None
 935                                         filename = self.undo_temp_name(tmpfilename)
 936                                         self.report_destination(filename)
 937                                 except (OSError, IOError), err:
 938                                         self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
 939                                         return False
 940                         try:
 941                                 stream.write(data_block)
 942                         except (IOError, OSError), err:
 943                                 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
 944                                 return False
 945                         block_size = self.best_block_size(after - before, len(data_block))
 946
 947                         # Progress message
 948                         percent_str = self.calc_percent(byte_counter, data_len)
 949                         eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
 950                         speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
 951                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
 952
 953                         # Apply rate limit
 954                         self.slow_down(start, byte_counter - resume_len)
 955
 956                 if stream is None:
 957                         self.trouble(u'\nERROR: Did not get any data blocks')
 958                         return False
 959                 stream.close()
 960                 self.report_finish()
 961                 if data_len is not None and byte_counter != data_len:
 962                         raise ContentTooShortError(byte_counter, long(data_len))
 963                 self.try_rename(tmpfilename, filename)
 964
 965                 # Update file modification time
 966                 if self.params.get('updatetime', True):
 967                         self.try_utime(filename, data.info().get('last-modified', None))
 968
 969                 return True
 970
 971
 972 class InfoExtractor(object):
 973         """Information Extractor class.
 974
 975         Information extractors are the classes that, given a URL, extract
 976         information from the video (or videos) the URL refers to. This
 977         information includes the real video URL, the video title and simplified
 978         title, author and others. The information is stored in a dictionary
 979         which is then passed to the FileDownloader. The FileDownloader
 980         processes this information possibly downloading the video to the file
 981         system, among other possible outcomes. The dictionaries must include
 982         the following fields:
 983
 984         id:             Video identifier.
 985         url:            Final video URL.
 986         uploader:       Nickname of the video uploader.
 987         title:          Literal title.
 988         stitle:         Simplified title.
 989         ext:            Video filename extension.
 990         format:         Video format.
 991         player_url:     SWF Player URL (may be None).
 992
 993         The following fields are optional. Their primary purpose is to allow
 994         youtube-dl to serve as the backend for a video search function, such
 995         as the one in youtube2mp3.  They are only used when their respective
 996         forced printing functions are called:
 997
 998         thumbnail:      Full URL to a video thumbnail image.
 999         description:    One-line video description.
1000
1001         Subclasses of this one should re-define the _real_initialize() and
1002         _real_extract() methods, as well as the suitable() static method.
1003         Probably, they should also be instantiated and added to the main
1004         downloader.
1005         """
1006
1007         _ready = False
1008         _downloader = None
1009
1010         def __init__(self, downloader=None):
1011                 """Constructor. Receives an optional downloader."""
1012                 self._ready = False
1013                 self.set_downloader(downloader)
1014
1015         @staticmethod
1016         def suitable(url):
1017                 """Receives a URL and returns True if suitable for this IE."""
1018                 return False
1019
1020         def initialize(self):
1021                 """Initializes an instance (authentication, etc)."""
1022                 if not self._ready:
1023                         self._real_initialize()
1024                         self._ready = True
1025
1026         def extract(self, url):
1027                 """Extracts URL information and returns it in list of dicts."""
1028                 self.initialize()
1029                 return self._real_extract(url)
1030
1031         def set_downloader(self, downloader):
1032                 """Sets the downloader for this IE."""
1033                 self._downloader = downloader
1034
1035         def _real_initialize(self):
1036                 """Real initialization process. Redefine in subclasses."""
1037                 pass
1038
1039         def _real_extract(self, url):
1040                 """Real extraction process. Redefine in subclasses."""
1041                 pass
1042
1043
1044 class YoutubeIE(InfoExtractor):
1045         """Information extractor for youtube.com."""
1046
        # Pattern for the URLs this extractor accepts. Group 1 is the optional
        # site prefix (youtu.be or youtube.com address forms) and group 2 the
        # video identifier, so a bare identifier matches as well.
        _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
        # Page requested during initialization to set the site language
        _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
        # Address the login form is posted to
        _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
        # Address the age confirmation form is posted to
        _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
        # Machine name looked up in .netrc when the 'usenetrc' parameter is set
        _NETRC_MACHINE = 'youtube'
        # Listed in order of quality
        _available_formats = ['38', '37', '45', '22', '43', '35', '34', '18', '6', '5', '17', '13']
        # File name extension for each format code listed here
        _video_extensions = {
                '13': '3gp',
                '17': 'mp4',
                '18': 'mp4',
                '22': 'mp4',
                '37': 'mp4',
                '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
                '43': 'webm',
                '45': 'webm',
        }
1064
1065         @staticmethod
1066         def suitable(url):
1067                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
1068
1069         def report_lang(self):
1070                 """Report attempt to set language."""
1071                 self._downloader.to_screen(u'[youtube] Setting language')
1072
1073         def report_login(self):
1074                 """Report attempt to log in."""
1075                 self._downloader.to_screen(u'[youtube] Logging in')
1076
1077         def report_age_confirmation(self):
1078                 """Report attempt to confirm age."""
1079                 self._downloader.to_screen(u'[youtube] Confirming age')
1080
1081         def report_video_webpage_download(self, video_id):
1082                 """Report attempt to download video webpage."""
1083                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1084
1085         def report_video_info_webpage_download(self, video_id):
1086                 """Report attempt to download video info webpage."""
1087                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1088
1089         def report_information_extraction(self, video_id):
1090                 """Report attempt to extract video information."""
1091                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1092
1093         def report_unavailable_format(self, video_id, format):
1094                 """Report extracted video URL."""
1095                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1096
1097         def report_rtmp_download(self):
1098                 """Indicate the download will use the RTMP protocol."""
1099                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1100
1101         def _real_initialize(self):
1102                 if self._downloader is None:
1103                         return
1104
1105                 username = None
1106                 password = None
1107                 downloader_params = self._downloader.params
1108
1109                 # Attempt to use provided username and password or .netrc data
1110                 if downloader_params.get('username', None) is not None:
1111                         username = downloader_params['username']
1112                         password = downloader_params['password']
1113                 elif downloader_params.get('usenetrc', False):
1114                         try:
1115                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1116                                 if info is not None:
1117                                         username = info[0]
1118                                         password = info[2]
1119                                 else:
1120                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1121                         except (IOError, netrc.NetrcParseError), err:
1122                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1123                                 return
1124
1125                 # Set language
1126                 request = urllib2.Request(self._LANG_URL)
1127                 try:
1128                         self.report_lang()
1129                         urllib2.urlopen(request).read()
1130                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1131                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1132                         return
1133
1134                 # No authentication to be performed
1135                 if username is None:
1136                         return
1137
1138                 # Log in
1139                 login_form = {
1140                                 'current_form': 'loginForm',
1141                                 'next':         '/',
1142                                 'action_login': 'Log In',
1143                                 'username':     username,
1144                                 'password':     password,
1145                                 }
1146                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1147                 try:
1148                         self.report_login()
1149                         login_results = urllib2.urlopen(request).read()
1150                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1151                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1152                                 return
1153                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1154                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1155                         return
1156
1157                 # Confirm age
1158                 age_form = {
1159                                 'next_url':             '/',
1160                                 'action_confirm':       'Confirm',
1161                                 }
1162                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1163                 try:
1164                         self.report_age_confirmation()
1165                         age_results = urllib2.urlopen(request).read()
1166                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1167                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1168                         return
1169
1170         def _real_extract(self, url):
1171                 # Extract video id from URL
1172                 mobj = re.match(self._VALID_URL, url)
1173                 if mobj is None:
1174                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1175                         return
1176                 video_id = mobj.group(2)
1177
1178                 # Get video webpage
1179                 self.report_video_webpage_download(video_id)
1180                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id)
1181                 try:
1182                         video_webpage = urllib2.urlopen(request).read()
1183                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1184                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1185                         return
1186
1187                 # Attempt to extract SWF player URL
1188                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1189                 if mobj is not None:
1190                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1191                 else:
1192                         player_url = None
1193
1194                 # Get video info
1195                 self.report_video_info_webpage_download(video_id)
1196                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1197                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1198                                         % (video_id, el_type))
1199                         request = urllib2.Request(video_info_url)
1200                         try:
1201                                 video_info_webpage = urllib2.urlopen(request).read()
1202                                 video_info = parse_qs(video_info_webpage)
1203                                 if 'token' in video_info:
1204                                         break
1205                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1206                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1207                                 return
1208                 if 'token' not in video_info:
1209                         if 'reason' in video_info:
1210                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1211                         else:
1212                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1213                         return
1214
1215                 # Start extracting information
1216                 self.report_information_extraction(video_id)
1217
1218                 # uploader
1219                 if 'author' not in video_info:
1220                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1221                         return
1222                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1223
1224                 # title
1225                 if 'title' not in video_info:
1226                         self._downloader.trouble(u'ERROR: unable to extract video title')
1227                         return
1228                 video_title = urllib.unquote_plus(video_info['title'][0])
1229                 video_title = video_title.decode('utf-8')
1230                 video_title = sanitize_title(video_title)
1231
1232                 # simplified title
1233                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1234                 simple_title = simple_title.strip(ur'_')
1235
1236                 # thumbnail image
1237                 if 'thumbnail_url' not in video_info:
1238                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1239                         video_thumbnail = ''
1240                 else:   # don't panic if we can't find it
1241                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1242
1243                 # upload date
1244                 upload_date = u'NA'
1245                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1246                 if mobj is not None:
1247                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1248                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1249                         for expression in format_expressions:
1250                                 try:
1251                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1252                                 except:
1253                                         pass
1254
1255                 # description
1256                 try:
1257                         lxml.etree
1258                 except NameError:
1259                         video_description = u'No description available.'
1260                         if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1261                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1262                                 if mobj is not None:
1263                                         video_description = mobj.group(1).decode('utf-8')
1264                 else:
1265                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1266                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1267                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1268                         # TODO use another parser
1269
1270                 # token
1271                 video_token = urllib.unquote_plus(video_info['token'][0])
1272
1273                 # Decide which formats to download
1274                 req_format = self._downloader.params.get('format', None)
1275
1276                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1277                         self.report_rtmp_download()
1278                         video_url_list = [(None, video_info['conn'][0])]
1279                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1280                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1281                         url_data = [parse_qs(uds) for uds in url_data_strs]
1282                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1283                         url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1284
1285                         format_limit = self._downloader.params.get('format_limit', None)
1286                         if format_limit is not None and format_limit in self._available_formats:
1287                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1288                         else:
1289                                 format_list = self._available_formats
1290                         existing_formats = [x for x in format_list if x in url_map]
1291                         if len(existing_formats) == 0:
1292                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1293                                 return
1294                         if req_format is None:
1295                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1296                         elif req_format == '-1':
1297                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1298                         else:
1299                                 # Specific format
1300                                 if req_format not in url_map:
1301                                         self._downloader.trouble(u'ERROR: requested format not available')
1302                                         return
1303                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
1304                 else:
1305                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1306                         return
1307
1308                 for format_param, video_real_url in video_url_list:
1309                         # At this point we have a new video
1310                         self._downloader.increment_downloads()
1311
1312                         # Extension
1313                         video_extension = self._video_extensions.get(format_param, 'flv')
1314
1315                         try:
1316                                 # Process video information
1317                                 self._downloader.process_info({
1318                                         'id':           video_id.decode('utf-8'),
1319                                         'url':          video_real_url.decode('utf-8'),
1320                                         'uploader':     video_uploader.decode('utf-8'),
1321                                         'upload_date':  upload_date,
1322                                         'title':        video_title,
1323                                         'stitle':       simple_title,
1324                                         'ext':          video_extension.decode('utf-8'),
1325                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1326                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1327                                         'description':  video_description,
1328                                         'player_url':   player_url,
1329                                 })
1330                         except UnavailableVideoError, err:
1331                                 self._downloader.trouble(u'\nERROR: unable to download video')
1332
1333
1334 class MetacafeIE(InfoExtractor):
1335         """Information Extractor for metacafe.com."""
1336
1337         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1338         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1339         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1340         _youtube_ie = None
1341
1342         def __init__(self, youtube_ie, downloader=None):
1343                 InfoExtractor.__init__(self, downloader)
1344                 self._youtube_ie = youtube_ie
1345
1346         @staticmethod
1347         def suitable(url):
1348                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1349
1350         def report_disclaimer(self):
1351                 """Report disclaimer retrieval."""
1352                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1353
1354         def report_age_confirmation(self):
1355                 """Report attempt to confirm age."""
1356                 self._downloader.to_screen(u'[metacafe] Confirming age')
1357
1358         def report_download_webpage(self, video_id):
1359                 """Report webpage download."""
1360                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1361
1362         def report_extraction(self, video_id):
1363                 """Report information extraction."""
1364                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1365
1366         def _real_initialize(self):
1367                 # Retrieve disclaimer
1368                 request = urllib2.Request(self._DISCLAIMER)
1369                 try:
1370                         self.report_disclaimer()
1371                         disclaimer = urllib2.urlopen(request).read()
1372                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1373                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1374                         return
1375
1376                 # Confirm age
1377                 disclaimer_form = {
1378                         'filters': '0',
1379                         'submit': "Continue - I'm over 18",
1380                         }
1381                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1382                 try:
1383                         self.report_age_confirmation()
1384                         disclaimer = urllib2.urlopen(request).read()
1385                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1386                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1387                         return
1388
1389         def _real_extract(self, url):
1390                 # Extract id and simplified title from URL
1391                 mobj = re.match(self._VALID_URL, url)
1392                 if mobj is None:
1393                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1394                         return
1395
1396                 video_id = mobj.group(1)
1397
1398                 # Check if video comes from YouTube
1399                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1400                 if mobj2 is not None:
1401                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1402                         return
1403
1404                 # At this point we have a new video
1405                 self._downloader.increment_downloads()
1406
1407                 simple_title = mobj.group(2).decode('utf-8')
1408
1409                 # Retrieve video webpage to extract further information
1410                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1411                 try:
1412                         self.report_download_webpage(video_id)
1413                         webpage = urllib2.urlopen(request).read()
1414                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1415                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1416                         return
1417
1418                 # Extract URL, uploader and title from webpage
1419                 self.report_extraction(video_id)
1420                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1421                 if mobj is not None:
1422                         mediaURL = urllib.unquote(mobj.group(1))
1423                         video_extension = mediaURL[-3:]
1424
1425                         # Extract gdaKey if available
1426                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1427                         if mobj is None:
1428                                 video_url = mediaURL
1429                         else:
1430                                 gdaKey = mobj.group(1)
1431                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1432                 else:
1433                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1434                         if mobj is None:
1435                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1436                                 return
1437                         vardict = parse_qs(mobj.group(1))
1438                         if 'mediaData' not in vardict:
1439                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1440                                 return
1441                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1442                         if mobj is None:
1443                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1444                                 return
1445                         mediaURL = mobj.group(1).replace('\\/', '/')
1446                         video_extension = mediaURL[-3:]
1447                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1448
1449                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1450                 if mobj is None:
1451                         self._downloader.trouble(u'ERROR: unable to extract title')
1452                         return
1453                 video_title = mobj.group(1).decode('utf-8')
1454                 video_title = sanitize_title(video_title)
1455
1456                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1457                 if mobj is None:
1458                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1459                         return
1460                 video_uploader = mobj.group(1)
1461
1462                 try:
1463                         # Process video information
1464                         self._downloader.process_info({
1465                                 'id':           video_id.decode('utf-8'),
1466                                 'url':          video_url.decode('utf-8'),
1467                                 'uploader':     video_uploader.decode('utf-8'),
1468                                 'upload_date':  u'NA',
1469                                 'title':        video_title,
1470                                 'stitle':       simple_title,
1471                                 'ext':          video_extension.decode('utf-8'),
1472                                 'format':       u'NA',
1473                                 'player_url':   None,
1474                         })
1475                 except UnavailableVideoError:
1476                         self._downloader.trouble(u'\nERROR: unable to download video')
1477
1478
1479 class DailymotionIE(InfoExtractor):
1480         """Information Extractor for Dailymotion"""
1481
1482         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1483
1484         def __init__(self, downloader=None):
1485                 InfoExtractor.__init__(self, downloader)
1486
1487         @staticmethod
1488         def suitable(url):
1489                 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1490
1491         def report_download_webpage(self, video_id):
1492                 """Report webpage download."""
1493                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1494
1495         def report_extraction(self, video_id):
1496                 """Report information extraction."""
1497                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1498
1499         def _real_initialize(self):
1500                 return
1501
1502         def _real_extract(self, url):
1503                 # Extract id and simplified title from URL
1504                 mobj = re.match(self._VALID_URL, url)
1505                 if mobj is None:
1506                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1507                         return
1508
1509                 # At this point we have a new video
1510                 self._downloader.increment_downloads()
1511                 video_id = mobj.group(1)
1512
1513                 simple_title = mobj.group(2).decode('utf-8')
1514                 video_extension = 'flv'
1515
1516                 # Retrieve video webpage to extract further information
1517                 request = urllib2.Request(url)
1518                 try:
1519                         self.report_download_webpage(video_id)
1520                         webpage = urllib2.urlopen(request).read()
1521                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1522                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1523                         return
1524
1525                 # Extract URL, uploader and title from webpage
1526                 self.report_extraction(video_id)
1527                 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1528                 if mobj is None:
1529                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1530                         return
1531                 mediaURL = urllib.unquote(mobj.group(1))
1532
1533                 # if needed add http://www.dailymotion.com/ if relative URL
1534
1535                 video_url = mediaURL
1536
1537                 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1538                 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1539                 if mobj is None:
1540                         self._downloader.trouble(u'ERROR: unable to extract title')
1541                         return
1542                 video_title = mobj.group(1).decode('utf-8')
1543                 video_title = sanitize_title(video_title)
1544
1545                 mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
1546                 if mobj is None:
1547                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1548                         return
1549                 video_uploader = mobj.group(1)
1550
1551                 try:
1552                         # Process video information
1553                         self._downloader.process_info({
1554                                 'id':           video_id.decode('utf-8'),
1555                                 'url':          video_url.decode('utf-8'),
1556                                 'uploader':     video_uploader.decode('utf-8'),
1557                                 'upload_date':  u'NA',
1558                                 'title':        video_title,
1559                                 'stitle':       simple_title,
1560                                 'ext':          video_extension.decode('utf-8'),
1561                                 'format':       u'NA',
1562                                 'player_url':   None,
1563                         })
1564                 except UnavailableVideoError:
1565                         self._downloader.trouble(u'\nERROR: unable to download video')
1566
1567
1568 class GoogleIE(InfoExtractor):
1569         """Information extractor for video.google.com."""
1570
1571         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1572
1573         def __init__(self, downloader=None):
1574                 InfoExtractor.__init__(self, downloader)
1575
1576         @staticmethod
1577         def suitable(url):
1578                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1579
1580         def report_download_webpage(self, video_id):
1581                 """Report webpage download."""
1582                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1583
1584         def report_extraction(self, video_id):
1585                 """Report information extraction."""
1586                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1587
1588         def _real_initialize(self):
1589                 return
1590
1591         def _real_extract(self, url):
1592                 # Extract id from URL
1593                 mobj = re.match(self._VALID_URL, url)
1594                 if mobj is None:
1595                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1596                         return
1597
1598                 # At this point we have a new video
1599                 self._downloader.increment_downloads()
1600                 video_id = mobj.group(1)
1601
1602                 video_extension = 'mp4'
1603
1604                 # Retrieve video webpage to extract further information
1605                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1606                 try:
1607                         self.report_download_webpage(video_id)
1608                         webpage = urllib2.urlopen(request).read()
1609                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1610                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1611                         return
1612
1613                 # Extract URL, uploader, and title from webpage
1614                 self.report_extraction(video_id)
1615                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1616                 if mobj is None:
1617                         video_extension = 'flv'
1618                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1619                 if mobj is None:
1620                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1621                         return
1622                 mediaURL = urllib.unquote(mobj.group(1))
1623                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1624                 mediaURL = mediaURL.replace('\\x26', '\x26')
1625
1626                 video_url = mediaURL
1627
1628                 mobj = re.search(r'<title>(.*)</title>', webpage)
1629                 if mobj is None:
1630                         self._downloader.trouble(u'ERROR: unable to extract title')
1631                         return
1632                 video_title = mobj.group(1).decode('utf-8')
1633                 video_title = sanitize_title(video_title)
1634                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1635
1636                 # Extract video description
1637                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1638                 if mobj is None:
1639                         self._downloader.trouble(u'ERROR: unable to extract video description')
1640                         return
1641                 video_description = mobj.group(1).decode('utf-8')
1642                 if not video_description:
1643                         video_description = 'No description available.'
1644
1645                 # Extract video thumbnail
1646                 if self._downloader.params.get('forcethumbnail', False):
1647                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1648                         try:
1649                                 webpage = urllib2.urlopen(request).read()
1650                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1651                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1652                                 return
1653                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1654                         if mobj is None:
1655                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1656                                 return
1657                         video_thumbnail = mobj.group(1)
1658                 else:   # we need something to pass to process_info
1659                         video_thumbnail = ''
1660
1661                 try:
1662                         # Process video information
1663                         self._downloader.process_info({
1664                                 'id':           video_id.decode('utf-8'),
1665                                 'url':          video_url.decode('utf-8'),
1666                                 'uploader':     u'NA',
1667                                 'upload_date':  u'NA',
1668                                 'title':        video_title,
1669                                 'stitle':       simple_title,
1670                                 'ext':          video_extension.decode('utf-8'),
1671                                 'format':       u'NA',
1672                                 'player_url':   None,
1673                         })
1674                 except UnavailableVideoError:
1675                         self._downloader.trouble(u'\nERROR: unable to download video')
1676
1677
1678 class PhotobucketIE(InfoExtractor):
1679         """Information extractor for photobucket.com."""
1680
1681         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1682
1683         def __init__(self, downloader=None):
1684                 InfoExtractor.__init__(self, downloader)
1685
1686         @staticmethod
1687         def suitable(url):
1688                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1689
1690         def report_download_webpage(self, video_id):
1691                 """Report webpage download."""
1692                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1693
1694         def report_extraction(self, video_id):
1695                 """Report information extraction."""
1696                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1697
1698         def _real_initialize(self):
1699                 return
1700
1701         def _real_extract(self, url):
1702                 # Extract id from URL
1703                 mobj = re.match(self._VALID_URL, url)
1704                 if mobj is None:
1705                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1706                         return
1707
1708                 # At this point we have a new video
1709                 self._downloader.increment_downloads()
1710                 video_id = mobj.group(1)
1711
1712                 video_extension = 'flv'
1713
1714                 # Retrieve video webpage to extract further information
1715                 request = urllib2.Request(url)
1716                 try:
1717                         self.report_download_webpage(video_id)
1718                         webpage = urllib2.urlopen(request).read()
1719                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1720                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1721                         return
1722
1723                 # Extract URL, uploader, and title from webpage
1724                 self.report_extraction(video_id)
1725                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1726                 if mobj is None:
1727                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1728                         return
1729                 mediaURL = urllib.unquote(mobj.group(1))
1730
1731                 video_url = mediaURL
1732
1733                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1734                 if mobj is None:
1735                         self._downloader.trouble(u'ERROR: unable to extract title')
1736                         return
1737                 video_title = mobj.group(1).decode('utf-8')
1738                 video_title = sanitize_title(video_title)
1739                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1740
1741                 video_uploader = mobj.group(2).decode('utf-8')
1742
1743                 try:
1744                         # Process video information
1745                         self._downloader.process_info({
1746                                 'id':           video_id.decode('utf-8'),
1747                                 'url':          video_url.decode('utf-8'),
1748                                 'uploader':     video_uploader,
1749                                 'upload_date':  u'NA',
1750                                 'title':        video_title,
1751                                 'stitle':       simple_title,
1752                                 'ext':          video_extension.decode('utf-8'),
1753                                 'format':       u'NA',
1754                                 'player_url':   None,
1755                         })
1756                 except UnavailableVideoError:
1757                         self._downloader.trouble(u'\nERROR: unable to download video')
1758
1759
1760 class YahooIE(InfoExtractor):
1761         """Information extractor for video.yahoo.com."""
1762
1763         # _VALID_URL matches all Yahoo! Video URLs
1764         # _VPAGE_URL matches only the extractable '/watch/' URLs
1765         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1766         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1767
1768         def __init__(self, downloader=None):
1769                 InfoExtractor.__init__(self, downloader)
1770
1771         @staticmethod
1772         def suitable(url):
1773                 return (re.match(YahooIE._VALID_URL, url) is not None)
1774
1775         def report_download_webpage(self, video_id):
1776                 """Report webpage download."""
1777                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1778
1779         def report_extraction(self, video_id):
1780                 """Report information extraction."""
1781                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1782
1783         def _real_initialize(self):
1784                 return
1785
1786         def _real_extract(self, url, new_video=True):
1787                 # Extract ID from URL
1788                 mobj = re.match(self._VALID_URL, url)
1789                 if mobj is None:
1790                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1791                         return
1792
1793                 # At this point we have a new video
1794                 self._downloader.increment_downloads()
1795                 video_id = mobj.group(2)
1796                 video_extension = 'flv'
1797
1798                 # Rewrite valid but non-extractable URLs as
1799                 # extractable English language /watch/ URLs
1800                 if re.match(self._VPAGE_URL, url) is None:
1801                         request = urllib2.Request(url)
1802                         try:
1803                                 webpage = urllib2.urlopen(request).read()
1804                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1805                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1806                                 return
1807
1808                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1809                         if mobj is None:
1810                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1811                                 return
1812                         yahoo_id = mobj.group(1)
1813
1814                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1815                         if mobj is None:
1816                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1817                                 return
1818                         yahoo_vid = mobj.group(1)
1819
1820                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1821                         return self._real_extract(url, new_video=False)
1822
1823                 # Retrieve video webpage to extract further information
1824                 request = urllib2.Request(url)
1825                 try:
1826                         self.report_download_webpage(video_id)
1827                         webpage = urllib2.urlopen(request).read()
1828                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1829                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1830                         return
1831
1832                 # Extract uploader and title from webpage
1833                 self.report_extraction(video_id)
1834                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1835                 if mobj is None:
1836                         self._downloader.trouble(u'ERROR: unable to extract video title')
1837                         return
1838                 video_title = mobj.group(1).decode('utf-8')
1839                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1840
1841                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1842                 if mobj is None:
1843                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1844                         return
1845                 video_uploader = mobj.group(1).decode('utf-8')
1846
1847                 # Extract video thumbnail
1848                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1849                 if mobj is None:
1850                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1851                         return
1852                 video_thumbnail = mobj.group(1).decode('utf-8')
1853
1854                 # Extract video description
1855                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1856                 if mobj is None:
1857                         self._downloader.trouble(u'ERROR: unable to extract video description')
1858                         return
1859                 video_description = mobj.group(1).decode('utf-8')
1860                 if not video_description:
1861                         video_description = 'No description available.'
1862
1863                 # Extract video height and width
1864                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1865                 if mobj is None:
1866                         self._downloader.trouble(u'ERROR: unable to extract video height')
1867                         return
1868                 yv_video_height = mobj.group(1)
1869
1870                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1871                 if mobj is None:
1872                         self._downloader.trouble(u'ERROR: unable to extract video width')
1873                         return
1874                 yv_video_width = mobj.group(1)
1875
1876                 # Retrieve video playlist to extract media URL
1877                 # I'm not completely sure what all these options are, but we
1878                 # seem to need most of them, otherwise the server sends a 401.
1879                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1880                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1881                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1882                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1883                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1884                 try:
1885                         self.report_download_webpage(video_id)
1886                         webpage = urllib2.urlopen(request).read()
1887                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1888                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1889                         return
1890
1891                 # Extract media URL from playlist XML
1892                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1893                 if mobj is None:
1894                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1895                         return
1896                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1897                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1898
1899                 try:
1900                         # Process video information
1901                         self._downloader.process_info({
1902                                 'id':           video_id.decode('utf-8'),
1903                                 'url':          video_url,
1904                                 'uploader':     video_uploader,
1905                                 'upload_date':  u'NA',
1906                                 'title':        video_title,
1907                                 'stitle':       simple_title,
1908                                 'ext':          video_extension.decode('utf-8'),
1909                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1910                                 'description':  video_description,
1911                                 'thumbnail':    video_thumbnail,
1912                                 'description':  video_description,
1913                                 'player_url':   None,
1914                         })
1915                 except UnavailableVideoError:
1916                         self._downloader.trouble(u'\nERROR: unable to download video')
1917
1918
1919 class VimeoIE(InfoExtractor):
1920         """Information extractor for vimeo.com."""
1921
1922         # _VALID_URL matches Vimeo URLs
1923         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1924
1925         def __init__(self, downloader=None):
1926                 InfoExtractor.__init__(self, downloader)
1927
1928         @staticmethod
1929         def suitable(url):
1930                 return (re.match(VimeoIE._VALID_URL, url) is not None)
1931
1932         def report_download_webpage(self, video_id):
1933                 """Report webpage download."""
1934                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1935
1936         def report_extraction(self, video_id):
1937                 """Report information extraction."""
1938                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1939
1940         def _real_initialize(self):
1941                 return
1942
1943         def _real_extract(self, url, new_video=True):
1944                 # Extract ID from URL
1945                 mobj = re.match(self._VALID_URL, url)
1946                 if mobj is None:
1947                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1948                         return
1949
1950                 # At this point we have a new video
1951                 self._downloader.increment_downloads()
1952                 video_id = mobj.group(1)
1953
1954                 # Retrieve video webpage to extract further information
1955                 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1956                 try:
1957                         self.report_download_webpage(video_id)
1958                         webpage = urllib2.urlopen(request).read()
1959                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1960                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1961                         return
1962
1963                 # Now we begin extracting as much information as we can from what we
1964                 # retrieved. First we extract the information common to all extractors,
1965                 # and latter we extract those that are Vimeo specific.
1966                 self.report_extraction(video_id)
1967
1968                 # Extract title
1969                 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
1970                 if mobj is None:
1971                         self._downloader.trouble(u'ERROR: unable to extract video title')
1972                         return
1973                 video_title = mobj.group(1).decode('utf-8')
1974                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1975
1976                 # Extract uploader
1977                 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
1978                 if mobj is None:
1979                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1980                         return
1981                 video_uploader = mobj.group(1).decode('utf-8')
1982
1983                 # Extract video thumbnail
1984                 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
1985                 if mobj is None:
1986                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1987                         return
1988                 video_thumbnail = mobj.group(1).decode('utf-8')
1989
1990                 # # Extract video description
1991                 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
1992                 # if mobj is None:
1993                 #       self._downloader.trouble(u'ERROR: unable to extract video description')
1994                 #       return
1995                 # video_description = mobj.group(1).decode('utf-8')
1996                 # if not video_description: video_description = 'No description available.'
1997                 video_description = 'Foo.'
1998
1999                 # Vimeo specific: extract request signature
2000                 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2001                 if mobj is None:
2002                         self._downloader.trouble(u'ERROR: unable to extract request signature')
2003                         return
2004                 sig = mobj.group(1).decode('utf-8')
2005
2006                 # Vimeo specific: Extract request signature expiration
2007                 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2008                 if mobj is None:
2009                         self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2010                         return
2011                 sig_exp = mobj.group(1).decode('utf-8')
2012
2013                 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2014
2015                 try:
2016                         # Process video information
2017                         self._downloader.process_info({
2018                                 'id':           video_id.decode('utf-8'),
2019                                 'url':          video_url,
2020                                 'uploader':     video_uploader,
2021                                 'upload_date':  u'NA',
2022                                 'title':        video_title,
2023                                 'stitle':       simple_title,
2024                                 'ext':          u'mp4',
2025                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2026                                 'description':  video_description,
2027                                 'thumbnail':    video_thumbnail,
2028                                 'description':  video_description,
2029                                 'player_url':   None,
2030                         })
2031                 except UnavailableVideoError:
2032                         self._downloader.trouble(u'ERROR: unable to download video')
2033
2034
2035 class GenericIE(InfoExtractor):
2036         """Generic last-resort information extractor."""
2037
2038         def __init__(self, downloader=None):
2039                 InfoExtractor.__init__(self, downloader)
2040
2041         @staticmethod
2042         def suitable(url):
2043                 return True
2044
2045         def report_download_webpage(self, video_id):
2046                 """Report webpage download."""
2047                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2048                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2049
2050         def report_extraction(self, video_id):
2051                 """Report information extraction."""
2052                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2053
2054         def _real_initialize(self):
2055                 return
2056
2057         def _real_extract(self, url):
2058                 # At this point we have a new video
2059                 self._downloader.increment_downloads()
2060
2061                 video_id = url.split('/')[-1]
2062                 request = urllib2.Request(url)
2063                 try:
2064                         self.report_download_webpage(video_id)
2065                         webpage = urllib2.urlopen(request).read()
2066                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2067                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2068                         return
2069                 except ValueError, err:
2070                         # since this is the last-resort InfoExtractor, if
2071                         # this error is thrown, it'll be thrown here
2072                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2073                         return
2074
2075                 self.report_extraction(video_id)
2076                 # Start with something easy: JW Player in SWFObject
2077                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2078                 if mobj is None:
2079                         # Broaden the search a little bit
2080                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2081                 if mobj is None:
2082                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2083                         return
2084
2085                 # It's possible that one of the regexes
2086                 # matched, but returned an empty group:
2087                 if mobj.group(1) is None:
2088                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2089                         return
2090
2091                 video_url = urllib.unquote(mobj.group(1))
2092                 video_id = os.path.basename(video_url)
2093
2094                 # here's a fun little line of code for you:
2095                 video_extension = os.path.splitext(video_id)[1][1:]
2096                 video_id = os.path.splitext(video_id)[0]
2097
2098                 # it's tempting to parse this further, but you would
2099                 # have to take into account all the variations like
2100                 #   Video Title - Site Name
2101                 #   Site Name | Video Title
2102                 #   Video Title - Tagline | Site Name
2103                 # and so on and so forth; it's just not practical
2104                 mobj = re.search(r'<title>(.*)</title>', webpage)
2105                 if mobj is None:
2106                         self._downloader.trouble(u'ERROR: unable to extract title')
2107                         return
2108                 video_title = mobj.group(1).decode('utf-8')
2109                 video_title = sanitize_title(video_title)
2110                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2111
2112                 # video uploader is domain name
2113                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2114                 if mobj is None:
2115                         self._downloader.trouble(u'ERROR: unable to extract title')
2116                         return
2117                 video_uploader = mobj.group(1).decode('utf-8')
2118
2119                 try:
2120                         # Process video information
2121                         self._downloader.process_info({
2122                                 'id':           video_id.decode('utf-8'),
2123                                 'url':          video_url.decode('utf-8'),
2124                                 'uploader':     video_uploader,
2125                                 'upload_date':  u'NA',
2126                                 'title':        video_title,
2127                                 'stitle':       simple_title,
2128                                 'ext':          video_extension.decode('utf-8'),
2129                                 'format':       u'NA',
2130                                 'player_url':   None,
2131                         })
2132                 except UnavailableVideoError, err:
2133                         self._downloader.trouble(u'\nERROR: unable to download video')
2134
2135
2136 class YoutubeSearchIE(InfoExtractor):
2137         """Information Extractor for YouTube search queries."""
2138         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
2139         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2140         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2141         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2142         _youtube_ie = None
2143         _max_youtube_results = 1000
2144
2145         def __init__(self, youtube_ie, downloader=None):
2146                 InfoExtractor.__init__(self, downloader)
2147                 self._youtube_ie = youtube_ie
2148
2149         @staticmethod
2150         def suitable(url):
2151                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
2152
2153         def report_download_page(self, query, pagenum):
2154                 """Report attempt to download playlist page with given number."""
2155                 query = query.decode(preferredencoding())
2156                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2157
2158         def _real_initialize(self):
2159                 self._youtube_ie.initialize()
2160
2161         def _real_extract(self, query):
2162                 mobj = re.match(self._VALID_QUERY, query)
2163                 if mobj is None:
2164                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2165                         return
2166
2167                 prefix, query = query.split(':')
2168                 prefix = prefix[8:]
2169                 query = query.encode('utf-8')
2170                 if prefix == '':
2171                         self._download_n_results(query, 1)
2172                         return
2173                 elif prefix == 'all':
2174                         self._download_n_results(query, self._max_youtube_results)
2175                         return
2176                 else:
2177                         try:
2178                                 n = long(prefix)
2179                                 if n <= 0:
2180                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2181                                         return
2182                                 elif n > self._max_youtube_results:
2183                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2184                                         n = self._max_youtube_results
2185                                 self._download_n_results(query, n)
2186                                 return
2187                         except ValueError: # parsing prefix as integer fails
2188                                 self._download_n_results(query, 1)
2189                                 return
2190
2191         def _download_n_results(self, query, n):
2192                 """Downloads a specified number of results for a query"""
2193
2194                 video_ids = []
2195                 already_seen = set()
2196                 pagenum = 1
2197
2198                 while True:
2199                         self.report_download_page(query, pagenum)
2200                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2201                         request = urllib2.Request(result_url)
2202                         try:
2203                                 page = urllib2.urlopen(request).read()
2204                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2205                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2206                                 return
2207
2208                         # Extract video identifiers
2209                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2210                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2211                                 if video_id not in already_seen:
2212                                         video_ids.append(video_id)
2213                                         already_seen.add(video_id)
2214                                         if len(video_ids) == n:
2215                                                 # Specified n videos reached
2216                                                 for id in video_ids:
2217                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2218                                                 return
2219
2220                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2221                                 for id in video_ids:
2222                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2223                                 return
2224
2225                         pagenum = pagenum + 1
2226
2227
2228 class GoogleSearchIE(InfoExtractor):
2229         """Information Extractor for Google Video search queries."""
2230         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
2231         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2232         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2233         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2234         _google_ie = None
2235         _max_google_results = 1000
2236
2237         def __init__(self, google_ie, downloader=None):
2238                 InfoExtractor.__init__(self, downloader)
2239                 self._google_ie = google_ie
2240
2241         @staticmethod
2242         def suitable(url):
2243                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
2244
2245         def report_download_page(self, query, pagenum):
2246                 """Report attempt to download playlist page with given number."""
2247                 query = query.decode(preferredencoding())
2248                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2249
2250         def _real_initialize(self):
2251                 self._google_ie.initialize()
2252
2253         def _real_extract(self, query):
2254                 mobj = re.match(self._VALID_QUERY, query)
2255                 if mobj is None:
2256                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2257                         return
2258
2259                 prefix, query = query.split(':')
2260                 prefix = prefix[8:]
2261                 query = query.encode('utf-8')
2262                 if prefix == '':
2263                         self._download_n_results(query, 1)
2264                         return
2265                 elif prefix == 'all':
2266                         self._download_n_results(query, self._max_google_results)
2267                         return
2268                 else:
2269                         try:
2270                                 n = long(prefix)
2271                                 if n <= 0:
2272                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2273                                         return
2274                                 elif n > self._max_google_results:
2275                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2276                                         n = self._max_google_results
2277                                 self._download_n_results(query, n)
2278                                 return
2279                         except ValueError: # parsing prefix as integer fails
2280                                 self._download_n_results(query, 1)
2281                                 return
2282
2283         def _download_n_results(self, query, n):
2284                 """Downloads a specified number of results for a query"""
2285
2286                 video_ids = []
2287                 already_seen = set()
2288                 pagenum = 1
2289
2290                 while True:
2291                         self.report_download_page(query, pagenum)
2292                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2293                         request = urllib2.Request(result_url)
2294                         try:
2295                                 page = urllib2.urlopen(request).read()
2296                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2297                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2298                                 return
2299
2300                         # Extract video identifiers
2301                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2302                                 video_id = mobj.group(1)
2303                                 if video_id not in already_seen:
2304                                         video_ids.append(video_id)
2305                                         already_seen.add(video_id)
2306                                         if len(video_ids) == n:
2307                                                 # Specified n videos reached
2308                                                 for id in video_ids:
2309                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2310                                                 return
2311
2312                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2313                                 for id in video_ids:
2314                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2315                                 return
2316
2317                         pagenum = pagenum + 1
2318
2319
2320 class YahooSearchIE(InfoExtractor):
2321         """Information Extractor for Yahoo! Video search queries."""
2322         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
2323         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2324         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2325         _MORE_PAGES_INDICATOR = r'\s*Next'
2326         _yahoo_ie = None
2327         _max_yahoo_results = 1000
2328
2329         def __init__(self, yahoo_ie, downloader=None):
2330                 InfoExtractor.__init__(self, downloader)
2331                 self._yahoo_ie = yahoo_ie
2332
2333         @staticmethod
2334         def suitable(url):
2335                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2336
2337         def report_download_page(self, query, pagenum):
2338                 """Report attempt to download playlist page with given number."""
2339                 query = query.decode(preferredencoding())
2340                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2341
2342         def _real_initialize(self):
2343                 self._yahoo_ie.initialize()
2344
2345         def _real_extract(self, query):
2346                 mobj = re.match(self._VALID_QUERY, query)
2347                 if mobj is None:
2348                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2349                         return
2350
2351                 prefix, query = query.split(':')
2352                 prefix = prefix[8:]
2353                 query = query.encode('utf-8')
2354                 if prefix == '':
2355                         self._download_n_results(query, 1)
2356                         return
2357                 elif prefix == 'all':
2358                         self._download_n_results(query, self._max_yahoo_results)
2359                         return
2360                 else:
2361                         try:
2362                                 n = long(prefix)
2363                                 if n <= 0:
2364                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2365                                         return
2366                                 elif n > self._max_yahoo_results:
2367                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2368                                         n = self._max_yahoo_results
2369                                 self._download_n_results(query, n)
2370                                 return
2371                         except ValueError: # parsing prefix as integer fails
2372                                 self._download_n_results(query, 1)
2373                                 return
2374
2375         def _download_n_results(self, query, n):
2376                 """Downloads a specified number of results for a query"""
2377
2378                 video_ids = []
2379                 already_seen = set()
2380                 pagenum = 1
2381
2382                 while True:
2383                         self.report_download_page(query, pagenum)
2384                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2385                         request = urllib2.Request(result_url)
2386                         try:
2387                                 page = urllib2.urlopen(request).read()
2388                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2389                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2390                                 return
2391
2392                         # Extract video identifiers
2393                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2394                                 video_id = mobj.group(1)
2395                                 if video_id not in already_seen:
2396                                         video_ids.append(video_id)
2397                                         already_seen.add(video_id)
2398                                         if len(video_ids) == n:
2399                                                 # Specified n videos reached
2400                                                 for id in video_ids:
2401                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2402                                                 return
2403
2404                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2405                                 for id in video_ids:
2406                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2407                                 return
2408
2409                         pagenum = pagenum + 1
2410
2411
2412 class YoutubePlaylistIE(InfoExtractor):
2413         """Information Extractor for YouTube playlists."""
2414
2415         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2416         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2417         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2418         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2419         _youtube_ie = None
2420
2421         def __init__(self, youtube_ie, downloader=None):
2422                 InfoExtractor.__init__(self, downloader)
2423                 self._youtube_ie = youtube_ie
2424
2425         @staticmethod
2426         def suitable(url):
2427                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2428
2429         def report_download_page(self, playlist_id, pagenum):
2430                 """Report attempt to download playlist page with given number."""
2431                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2432
2433         def _real_initialize(self):
2434                 self._youtube_ie.initialize()
2435
2436         def _real_extract(self, url):
2437                 # Extract playlist id
2438                 mobj = re.match(self._VALID_URL, url)
2439                 if mobj is None:
2440                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2441                         return
2442
2443                 # Single video case
2444                 if mobj.group(3) is not None:
2445                         self._youtube_ie.extract(mobj.group(3))
2446                         return
2447
2448                 # Download playlist pages
2449                 # prefix is 'p' as default for playlists but there are other types that need extra care
2450                 playlist_prefix = mobj.group(1)
2451                 if playlist_prefix == 'a':
2452                         playlist_access = 'artist'
2453                 else:
2454                         playlist_prefix = 'p'
2455                         playlist_access = 'view_play_list'
2456                 playlist_id = mobj.group(2)
2457                 video_ids = []
2458                 pagenum = 1
2459
2460                 while True:
2461                         self.report_download_page(playlist_id, pagenum)
2462                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2463                         try:
2464                                 page = urllib2.urlopen(request).read()
2465                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2466                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2467                                 return
2468
2469                         # Extract video identifiers
2470                         ids_in_page = []
2471                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2472                                 if mobj.group(1) not in ids_in_page:
2473                                         ids_in_page.append(mobj.group(1))
2474                         video_ids.extend(ids_in_page)
2475
2476                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2477                                 break
2478                         pagenum = pagenum + 1
2479
2480                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2481                 playlistend = self._downloader.params.get('playlistend', -1)
2482                 video_ids = video_ids[playliststart:playlistend]
2483
2484                 for id in video_ids:
2485                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2486                 return
2487
2488
2489 class YoutubeUserIE(InfoExtractor):
2490         """Information Extractor for YouTube users."""
2491
2492         _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2493         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2494         _GDATA_PAGE_SIZE = 50
2495         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2496         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2497         _youtube_ie = None
2498
2499         def __init__(self, youtube_ie, downloader=None):
2500                 InfoExtractor.__init__(self, downloader)
2501                 self._youtube_ie = youtube_ie
2502
2503         @staticmethod
2504         def suitable(url):
2505                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2506
2507         def report_download_page(self, username, start_index):
2508                 """Report attempt to download user page."""
2509                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2510                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2511
2512         def _real_initialize(self):
2513                 self._youtube_ie.initialize()
2514
2515         def _real_extract(self, url):
2516                 # Extract username
2517                 mobj = re.match(self._VALID_URL, url)
2518                 if mobj is None:
2519                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2520                         return
2521
2522                 username = mobj.group(1)
2523
2524                 # Download video ids using YouTube Data API. Result size per
2525                 # query is limited (currently to 50 videos) so we need to query
2526                 # page by page until there are no video ids - it means we got
2527                 # all of them.
2528
2529                 video_ids = []
2530                 pagenum = 0
2531
2532                 while True:
2533                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2534                         self.report_download_page(username, start_index)
2535
2536                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2537
2538                         try:
2539                                 page = urllib2.urlopen(request).read()
2540                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2541                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2542                                 return
2543
2544                         # Extract video identifiers
2545                         ids_in_page = []
2546
2547                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2548                                 if mobj.group(1) not in ids_in_page:
2549                                         ids_in_page.append(mobj.group(1))
2550
2551                         video_ids.extend(ids_in_page)
2552
2553                         # A little optimization - if current page is not
2554                         # "full", ie. does not contain PAGE_SIZE video ids then
2555                         # we can assume that this page is the last one - there
2556                         # are no more ids on further pages - no need to query
2557                         # again.
2558
2559                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2560                                 break
2561
2562                         pagenum += 1
2563
2564                 all_ids_count = len(video_ids)
2565                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2566                 playlistend = self._downloader.params.get('playlistend', -1)
2567
2568                 if playlistend == -1:
2569                         video_ids = video_ids[playliststart:]
2570                 else:
2571                         video_ids = video_ids[playliststart:playlistend]
2572
2573                 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2574                                 (username, all_ids_count, len(video_ids)))
2575
2576                 for video_id in video_ids:
2577                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2578
2579
2580 class DepositFilesIE(InfoExtractor):
2581         """Information extractor for depositfiles.com"""
2582
2583         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2584
2585         def __init__(self, downloader=None):
2586                 InfoExtractor.__init__(self, downloader)
2587
2588         @staticmethod
2589         def suitable(url):
2590                 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2591
2592         def report_download_webpage(self, file_id):
2593                 """Report webpage download."""
2594                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2595
2596         def report_extraction(self, file_id):
2597                 """Report information extraction."""
2598                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2599
2600         def _real_initialize(self):
2601                 return
2602
2603         def _real_extract(self, url):
2604                 # At this point we have a new file
2605                 self._downloader.increment_downloads()
2606
2607                 file_id = url.split('/')[-1]
2608                 # Rebuild url in english locale
2609                 url = 'http://depositfiles.com/en/files/' + file_id
2610
2611                 # Retrieve file webpage with 'Free download' button pressed
2612                 free_download_indication = { 'gateway_result' : '1' }
2613                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2614                 try:
2615                         self.report_download_webpage(file_id)
2616                         webpage = urllib2.urlopen(request).read()
2617                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2618                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2619                         return
2620
2621                 # Search for the real file URL
2622                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2623                 if (mobj is None) or (mobj.group(1) is None):
2624                         # Try to figure out reason of the error.
2625                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2626                         if (mobj is not None) and (mobj.group(1) is not None):
2627                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2628                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2629                         else:
2630                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2631                         return
2632
2633                 file_url = mobj.group(1)
2634                 file_extension = os.path.splitext(file_url)[1][1:]
2635
2636                 # Search for file title
2637                 mobj = re.search(r'<b title="(.*?)">', webpage)
2638                 if mobj is None:
2639                         self._downloader.trouble(u'ERROR: unable to extract title')
2640                         return
2641                 file_title = mobj.group(1).decode('utf-8')
2642
2643                 try:
2644                         # Process file information
2645                         self._downloader.process_info({
2646                                 'id':           file_id.decode('utf-8'),
2647                                 'url':          file_url.decode('utf-8'),
2648                                 'uploader':     u'NA',
2649                                 'upload_date':  u'NA',
2650                                 'title':        file_title,
2651                                 'stitle':       file_title,
2652                                 'ext':          file_extension.decode('utf-8'),
2653                                 'format':       u'NA',
2654                                 'player_url':   None,
2655                         })
2656                 except UnavailableVideoError, err:
2657                         self._downloader.trouble(u'ERROR: unable to download file')
2658
2659
2660 class FacebookIE(InfoExtractor):
2661         """Information Extractor for Facebook"""
2662
2663         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2664         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2665         _NETRC_MACHINE = 'facebook'
2666         _available_formats = ['highqual', 'lowqual']
2667         _video_extensions = {
2668                 'highqual': 'mp4',
2669                 'lowqual': 'mp4',
2670         }
2671
2672         def __init__(self, downloader=None):
2673                 InfoExtractor.__init__(self, downloader)
2674
2675         @staticmethod
2676         def suitable(url):
2677                 return (re.match(FacebookIE._VALID_URL, url) is not None)
2678
2679         def _reporter(self, message):
2680                 """Add header and report message."""
2681                 self._downloader.to_screen(u'[facebook] %s' % message)
2682
2683         def report_login(self):
2684                 """Report attempt to log in."""
2685                 self._reporter(u'Logging in')
2686
2687         def report_video_webpage_download(self, video_id):
2688                 """Report attempt to download video webpage."""
2689                 self._reporter(u'%s: Downloading video webpage' % video_id)
2690
2691         def report_information_extraction(self, video_id):
2692                 """Report attempt to extract video information."""
2693                 self._reporter(u'%s: Extracting video information' % video_id)
2694
2695         def _parse_page(self, video_webpage):
2696                 """Extract video information from page"""
2697                 # General data
2698                 data = {'title': r'class="video_title datawrap">(.*?)</',
2699                         'description': r'<div class="datawrap">(.*?)</div>',
2700                         'owner': r'\("video_owner_name", "(.*?)"\)',
2701                         'upload_date': r'data-date="(.*?)"',
2702                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2703                         }
2704                 video_info = {}
2705                 for piece in data.keys():
2706                         mobj = re.search(data[piece], video_webpage)
2707                         if mobj is not None:
2708                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2709
2710                 # Video urls
2711                 video_urls = {}
2712                 for fmt in self._available_formats:
2713                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2714                         if mobj is not None:
2715                                 # URL is in a Javascript segment inside an escaped Unicode format within
2716                                 # the generally utf-8 page
2717                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2718                 video_info['video_urls'] = video_urls
2719
2720                 return video_info
2721
2722         def _real_initialize(self):
2723                 if self._downloader is None:
2724                         return
2725
2726                 useremail = None
2727                 password = None
2728                 downloader_params = self._downloader.params
2729
2730                 # Attempt to use provided username and password or .netrc data
2731                 if downloader_params.get('username', None) is not None:
2732                         useremail = downloader_params['username']
2733                         password = downloader_params['password']
2734                 elif downloader_params.get('usenetrc', False):
2735                         try:
2736                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2737                                 if info is not None:
2738                                         useremail = info[0]
2739                                         password = info[2]
2740                                 else:
2741                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2742                         except (IOError, netrc.NetrcParseError), err:
2743                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2744                                 return
2745
2746                 if useremail is None:
2747                         return
2748
2749                 # Log in
2750                 login_form = {
2751                         'email': useremail,
2752                         'pass': password,
2753                         'login': 'Log+In'
2754                         }
2755                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2756                 try:
2757                         self.report_login()
2758                         login_results = urllib2.urlopen(request).read()
2759                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2760                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2761                                 return
2762                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2763                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2764                         return
2765
2766         def _real_extract(self, url):
2767                 mobj = re.match(self._VALID_URL, url)
2768                 if mobj is None:
2769                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2770                         return
2771                 video_id = mobj.group('ID')
2772
2773                 # Get video webpage
2774                 self.report_video_webpage_download(video_id)
2775                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2776                 try:
2777                         page = urllib2.urlopen(request)
2778                         video_webpage = page.read()
2779                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2780                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2781                         return
2782
2783                 # Start extracting information
2784                 self.report_information_extraction(video_id)
2785
2786                 # Extract information
2787                 video_info = self._parse_page(video_webpage)
2788
2789                 # uploader
2790                 if 'owner' not in video_info:
2791                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2792                         return
2793                 video_uploader = video_info['owner']
2794
2795                 # title
2796                 if 'title' not in video_info:
2797                         self._downloader.trouble(u'ERROR: unable to extract video title')
2798                         return
2799                 video_title = video_info['title']
2800                 video_title = video_title.decode('utf-8')
2801                 video_title = sanitize_title(video_title)
2802
2803                 # simplified title
2804                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2805                 simple_title = simple_title.strip(ur'_')
2806
2807                 # thumbnail image
2808                 if 'thumbnail' not in video_info:
2809                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2810                         video_thumbnail = ''
2811                 else:
2812                         video_thumbnail = video_info['thumbnail']
2813
2814                 # upload date
2815                 upload_date = u'NA'
2816                 if 'upload_date' in video_info:
2817                         upload_time = video_info['upload_date']
2818                         timetuple = email.utils.parsedate_tz(upload_time)
2819                         if timetuple is not None:
2820                                 try:
2821                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2822                                 except:
2823                                         pass
2824
2825                 # description
2826                 video_description = video_info.get('description', 'No description available.')
2827
2828                 url_map = video_info['video_urls']
2829                 if len(url_map.keys()) > 0:
2830                         # Decide which formats to download
2831                         req_format = self._downloader.params.get('format', None)
2832                         format_limit = self._downloader.params.get('format_limit', None)
2833
2834                         if format_limit is not None and format_limit in self._available_formats:
2835                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2836                         else:
2837                                 format_list = self._available_formats
2838                         existing_formats = [x for x in format_list if x in url_map]
2839                         if len(existing_formats) == 0:
2840                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2841                                 return
2842                         if req_format is None:
2843                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2844                         elif req_format == '-1':
2845                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2846                         else:
2847                                 # Specific format
2848                                 if req_format not in url_map:
2849                                         self._downloader.trouble(u'ERROR: requested format not available')
2850                                         return
2851                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2852
2853                 for format_param, video_real_url in video_url_list:
2854
2855                         # At this point we have a new video
2856                         self._downloader.increment_downloads()
2857
2858                         # Extension
2859                         video_extension = self._video_extensions.get(format_param, 'mp4')
2860
2861                         try:
2862                                 # Process video information
2863                                 self._downloader.process_info({
2864                                         'id':           video_id.decode('utf-8'),
2865                                         'url':          video_real_url.decode('utf-8'),
2866                                         'uploader':     video_uploader.decode('utf-8'),
2867                                         'upload_date':  upload_date,
2868                                         'title':        video_title,
2869                                         'stitle':       simple_title,
2870                                         'ext':          video_extension.decode('utf-8'),
2871                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2872                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2873                                         'description':  video_description.decode('utf-8'),
2874                                         'player_url':   None,
2875                                 })
2876                         except UnavailableVideoError, err:
2877                                 self._downloader.trouble(u'\nERROR: unable to download video')
2878
2879 class BlipTVIE(InfoExtractor):
2880         """Information extractor for blip.tv"""
2881
2882         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2883         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2884
2885         @staticmethod
2886         def suitable(url):
2887                 return (re.match(BlipTVIE._VALID_URL, url) is not None)
2888
2889         def report_extraction(self, file_id):
2890                 """Report information extraction."""
2891                 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2892
2893         def _simplify_title(self, title):
2894                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2895                 res = res.strip(ur'_')
2896                 return res
2897
2898         def _real_extract(self, url):
2899                 mobj = re.match(self._VALID_URL, url)
2900                 if mobj is None:
2901                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2902                         return
2903
2904                 if '?' in url:
2905                         cchar = '&'
2906                 else:
2907                         cchar = '?'
2908                 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2909                 request = urllib2.Request(json_url)
2910                 self.report_extraction(mobj.group(1))
2911                 try:
2912                         json_code = urllib2.urlopen(request).read()
2913                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2914                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2915                         return
2916                 try:
2917                         json_data = json.loads(json_code)
2918                         if 'Post' in json_data:
2919                                 data = json_data['Post']
2920                         else:
2921                                 data = json_data
2922
2923                         upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2924                         video_url = data['media']['url']
2925                         umobj = re.match(self._URL_EXT, video_url)
2926                         if umobj is None:
2927                                 raise ValueError('Can not determine filename extension')
2928                         ext = umobj.group(1)
2929
2930                         self._downloader.increment_downloads()
2931
2932                         info = {
2933                                 'id': data['item_id'],
2934                                 'url': video_url,
2935                                 'uploader': data['display_name'],
2936                                 'upload_date': upload_date,
2937                                 'title': data['title'],
2938                                 'stitle': self._simplify_title(data['title']),
2939                                 'ext': ext,
2940                                 'format': data['media']['mimeType'],
2941                                 'thumbnail': data['thumbnailUrl'],
2942                                 'description': data['description'],
2943                                 'player_url': data['embedUrl']
2944                         }
2945                 except (ValueError,KeyError), err:
2946                         self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2947                         return
2948
2949                 try:
2950                         self._downloader.process_info(info)
2951                 except UnavailableVideoError, err:
2952                         self._downloader.trouble(u'\nERROR: unable to download video')
2953
2954
class PostProcessor(object):
        """Base class for steps executed after a successful download.

        A downloader keeps a chain of PostProcessor objects, registered
        through its add_post_processor() method. Once a download has finished
        successfully the downloader calls run() on each object in turn: the
        first one receives the initial information dictionary, every later
        one receives whatever the previous run() returned.

        Processing ends when the chain is exhausted or when a run() call
        returns None.

        As with InfoExtractor objects, registration is mutual: the downloader
        knows its post processors and each post processor knows its
        downloader.
        """

        _downloader = None

        def __init__(self, downloader=None):
                self._downloader = downloader

        def set_downloader(self, downloader):
                """Attach this post processor to the given downloader."""
                self._downloader = downloader

        def run(self, information):
                """Process one downloaded file.

                "information" is a dictionary of the kind InfoExtractors
                build, extended with a "filepath" entry naming the downloaded
                file.

                Return None to stop the postprocessing chain, or return an
                information dictionary (the received one, possibly with some
                fields changed) to feed the next object in the chain.

                A PostProcessingError may be raised; the calling downloader
                takes it into account.
                """
                # The base implementation passes its input through untouched
                return information
3000
3001
class FFmpegExtractAudioPP(PostProcessor):
        """Post processor that turns a downloaded video into an audio-only file.

        The audio codec of the file is looked up with the external "ffprobe"
        program and the conversion itself is done by the external "ffmpeg"
        program. On success the original file is deleted and the "filepath"
        entry of the information dictionary points to the new audio file.
        """

        def __init__(self, downloader=None, preferredcodec=None):
                """preferredcodec is 'best' (the default when None), 'aac' or 'mp3'."""
                PostProcessor.__init__(self, downloader)
                if preferredcodec is None:
                        preferredcodec = 'best'
                self._preferredcodec = preferredcodec

        @staticmethod
        def get_audio_codec(path):
                """Return the audio codec name ffprobe reports for path.

                Returns None when ffprobe cannot be started, exits with a
                non-zero status, or reports no audio stream.
                """
                try:
                        devnull = open(os.path.devnull, 'w')
                        try:
                                cmd = ['ffprobe', '-show_streams', '--', path]
                                handle = subprocess.Popen(cmd, stderr=devnull, stdout=subprocess.PIPE)
                                output = handle.communicate()[0]
                                if handle.wait() != 0:
                                        return None
                        finally:
                                # Always release the handle used to discard stderr
                                devnull.close()
                except (IOError, OSError):
                        return None
                # Remember the most recent codec_name and report it once the
                # stream it belongs to turns out to be an audio stream. This
                # assumes codec_name is printed before codec_type for a stream.
                audio_codec = None
                for line in output.split('\n'):
                        if line.startswith('codec_name='):
                                audio_codec = line.split('=')[1].strip()
                        elif line.strip() == 'codec_type=audio' and audio_codec is not None:
                                return audio_codec
                return None

        @staticmethod
        def run_ffmpeg(path, out_path, codec, more_opts):
                """Extract the audio of path into out_path using ffmpeg.

                codec is passed to ffmpeg's -acodec option and more_opts is a
                list of additional arguments inserted before the output file.
                All ffmpeg output is discarded. Returns True if ffmpeg exited
                with status 0, False otherwise (including when it could not be
                started).
                """
                try:
                        devnull = open(os.path.devnull, 'w')
                        try:
                                cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
                                ret = subprocess.call(cmd, stdout=devnull, stderr=subprocess.STDOUT)
                        finally:
                                # Always release the handle used to discard the output
                                devnull.close()
                        return (ret == 0)
                except (IOError, OSError):
                        return False

        def run(self, information):
                """Convert information['filepath'] to an audio-only file.

                Returns the information dictionary with 'filepath' updated to
                the audio file, or None (which stops the post processing chain)
                if the codec could not be determined, ffmpeg failed or the
                original file could not be removed.

                Raises KeyError if the preferred codec is not 'best', 'aac' or
                'mp3' and differs from the codec of the file.
                """
                path = information['filepath']

                filecodec = self.get_audio_codec(path)
                if filecodec is None:
                        self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
                        return None

                more_opts = []
                if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
                        if filecodec == 'aac' or filecodec == 'mp3':
                                # Lossless if possible
                                acodec = 'copy'
                                extension = filecodec
                                if filecodec == 'aac':
                                        more_opts = ['-f', 'adts']
                        else:
                                # MP3 otherwise.
                                acodec = 'libmp3lame'
                                extension = 'mp3'
                                more_opts = ['-ab', '128k']
                else:
                        # We convert the audio (lossy)
                        acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
                        extension = self._preferredcodec
                        more_opts = ['-ab', '128k']
                        if self._preferredcodec == 'aac':
                                more_opts += ['-f', 'adts']

                prefix = os.path.splitext(path)[0]
                new_path = prefix + '.' + extension

                if new_path == path:
                        # Input and output would be the same file: ffmpeg would
                        # overwrite the file it is reading and the os.remove()
                        # below would then delete the result. Keep the download.
                        self._downloader.to_screen(u'[ffmpeg] File %s already has the target name, skipping conversion' % new_path)
                        return information

                self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
                status = self.run_ffmpeg(path, new_path, acodec, more_opts)

                if not status:
                        self._downloader.to_stderr(u'WARNING: error running ffmpeg')
                        return None

                try:
                        os.remove(path)
                except (IOError, OSError):
                        self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
                        return None

                information['filepath'] = new_path
                return information
3083
3084
def updateSelf(downloader, filename):
        ''' Update the program file with the latest version from the repository

        The content found at UPDATE_URL is downloaded and written over
        filename. The process exits through sys.exit() with an error message
        if filename is not writable, if the download fails or yields no data,
        or if the file cannot be overwritten.
        '''
        # Note: downloader is only used to print progress messages
        if not os.access(filename, os.W_OK):
                sys.exit('ERROR: no write permissions on %s' % filename)

        downloader.to_screen('Updating to latest version...')

        try:
                # Open the URL before entering the try/finally: if urlopen()
                # itself fails there is no handle to close, and closing an
                # unbound name would hide the real error.
                urlh = urllib.urlopen(UPDATE_URL)
                try:
                        newcontent = urlh.read()
                finally:
                        urlh.close()
        except (IOError, OSError):
                sys.exit('ERROR: unable to download latest version')

        # Never replace the program with an empty file
        if not newcontent:
                sys.exit('ERROR: unable to download latest version')

        try:
                outf = open(filename, 'wb')
                try:
                        outf.write(newcontent)
                finally:
                        outf.close()
        except (IOError, OSError):
                sys.exit('ERROR: unable to overwrite current version')

        downloader.to_screen('Updated youtube-dl. Restart to use the new version.')
3112
def parseOpts():
        """Build the command line parser and parse the program arguments.

        Returns a (parser, opts, args) tuple: the optparse.OptionParser (so
        the caller can report problems through parser.error()), the parsed
        option values and the list of remaining positional arguments.
        """
        # Deferred import
        import optparse

        def _format_option_string(option):
                ''' ('-o', '--option') -> -o, --format METAVAR'''

                opts = []

                if option._short_opts: opts.append(option._short_opts[0])
                if option._long_opts: opts.append(option._long_opts[0])
                if len(opts) > 1: opts.insert(1, ', ')

                if option.takes_value(): opts.append(' %s' % option.metavar)

                return "".join(opts)

        def _find_term_columns():
                """Return the terminal width in columns, or None if unknown."""
                columns = os.environ.get('COLUMNS', None)
                if columns:
                        try:
                                return int(columns)
                        except ValueError:
                                pass # Malformed COLUMNS value, ask the terminal instead

                try:
                        sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                        out = sp.communicate()[0]
                        return int(out.split()[1])
                except (EnvironmentError, ValueError, IndexError):
                        # stty is missing or printed nothing usable
                        pass
                return None

        max_width = 80
        max_help_position = 80

        # No need to wrap help messages if we're on a wide console
        columns = _find_term_columns()
        if columns: max_width = columns

        fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
        # Replace the formatter's option rendering with the compact one above
        fmt.format_option_strings = _format_option_string

        kw = {
                'version'   : __version__,
                'formatter' : fmt,
                'usage' : '%prog [options] url...',
                'conflict_handler' : 'resolve',
        }

        parser = optparse.OptionParser(**kw)

        # option groups
        general        = optparse.OptionGroup(parser, 'General Options')
        authentication = optparse.OptionGroup(parser, 'Authentication Options')
        video_format   = optparse.OptionGroup(parser, 'Video Format Options')
        postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
        filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
        verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

        general.add_option('-h', '--help',
                        action='help', help='print this help text and exit')
        general.add_option('-v', '--version',
                        action='version', help='print program version and exit')
        general.add_option('-U', '--update',
                        action='store_true', dest='update_self', help='update this program to latest version')
        general.add_option('-i', '--ignore-errors',
                        action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
        general.add_option('-r', '--rate-limit',
                        dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
        general.add_option('-R', '--retries',
                        dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
        general.add_option('--playlist-start',
                        dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
        general.add_option('--playlist-end',
                        dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
        general.add_option('--dump-user-agent',
                        action='store_true', dest='dump_user_agent',
                        help='display the current browser identification', default=False)

        authentication.add_option('-u', '--username',
                        dest='username', metavar='USERNAME', help='account username')
        authentication.add_option('-p', '--password',
                        dest='password', metavar='PASSWORD', help='account password')
        authentication.add_option('-n', '--netrc',
                        action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)


        video_format.add_option('-f', '--format',
                        action='store', dest='format', metavar='FORMAT', help='video format code')
        # --all-formats shares the 'format' destination and stores the special value '-1'
        video_format.add_option('--all-formats',
                        action='store_const', dest='format', help='download all available video formats', const='-1')
        video_format.add_option('--max-quality',
                        action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')


        verbosity.add_option('-q', '--quiet',
                        action='store_true', dest='quiet', help='activates quiet mode', default=False)
        verbosity.add_option('-s', '--simulate',
                        action='store_true', dest='simulate', help='do not download video', default=False)
        verbosity.add_option('-g', '--get-url',
                        action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
        verbosity.add_option('-e', '--get-title',
                        action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
        verbosity.add_option('--get-thumbnail',
                        action='store_true', dest='getthumbnail',
                        help='simulate, quiet but print thumbnail URL', default=False)
        verbosity.add_option('--get-description',
                        action='store_true', dest='getdescription',
                        help='simulate, quiet but print video description', default=False)
        verbosity.add_option('--get-filename',
                        action='store_true', dest='getfilename',
                        help='simulate, quiet but print output filename', default=False)
        verbosity.add_option('--no-progress',
                        action='store_true', dest='noprogress', help='do not print progress bar', default=False)
        verbosity.add_option('--console-title',
                        action='store_true', dest='consoletitle',
                        help='display progress in console titlebar', default=False)


        filesystem.add_option('-t', '--title',
                        action='store_true', dest='usetitle', help='use title in file name', default=False)
        filesystem.add_option('-l', '--literal',
                        action='store_true', dest='useliteral', help='use literal title in file name', default=False)
        filesystem.add_option('-A', '--auto-number',
                        action='store_true', dest='autonumber',
                        help='number downloaded files starting from 00000', default=False)
        filesystem.add_option('-o', '--output',
                        dest='outtmpl', metavar='TEMPLATE', help='output filename template')
        filesystem.add_option('-a', '--batch-file',
                        dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
        filesystem.add_option('-w', '--no-overwrites',
                        action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
        filesystem.add_option('-c', '--continue',
                        action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
        filesystem.add_option('--cookies',
                        dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
        filesystem.add_option('--no-part',
                        action='store_true', dest='nopart', help='do not use .part files', default=False)
        filesystem.add_option('--no-mtime',
                        action='store_false', dest='updatetime',
                        help='do not use the Last-modified header to set the file modification time', default=True)
        filesystem.add_option('--write-description',
                        action='store_true', dest='writedescription',
                        help='write video description to a .description file', default=False)
        filesystem.add_option('--write-info-json',
                        action='store_true', dest='writeinfojson',
                        help='write video metadata to a .info.json file', default=False)


        postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
                        help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
        postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
                        help='"best", "aac" or "mp3"; best by default')


        # The order of these calls is the order of the groups in the help text
        parser.add_option_group(general)
        parser.add_option_group(filesystem)
        parser.add_option_group(verbosity)
        parser.add_option_group(video_format)
        parser.add_option_group(authentication)
        parser.add_option_group(postproc)

        opts, args = parser.parse_args()

        return parser, opts, args
3277
def main():
        """Program entry point: parse the options, set up and run the downloader.

        This function does not return normally: it always ends through
        sys.exit() (or parser.error() for invalid option combinations), using
        the value returned by FileDownloader.download() as exit status after a
        download run.
        """
        # Deferred import. It has to happen in this function: the password
        # prompt below needs the name in scope here.
        import getpass

        parser, opts, args = parseOpts()

        # Open appropriate CookieJar
        if opts.cookiefile is None:
                jar = cookielib.CookieJar()
        else:
                try:
                        jar = cookielib.MozillaCookieJar(opts.cookiefile)
                        if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
                                jar.load()
                except (IOError, OSError):
                        sys.exit(u'ERROR: unable to open cookie file')

        # Dump user agent
        if opts.dump_user_agent:
                print(std_headers['User-Agent'])
                sys.exit(0)

        # General configuration
        cookie_processor = urllib2.HTTPCookieProcessor(jar)
        urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
        socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

        # Batch file verification
        batchurls = []
        if opts.batchfile is not None:
                try:
                        if opts.batchfile == '-':
                                batchfd = sys.stdin
                        else:
                                batchfd = open(opts.batchfile, 'r')
                        try:
                                batchurls = batchfd.readlines()
                        finally:
                                # Close the batch file, but leave stdin alone
                                if batchfd is not sys.stdin:
                                        batchfd.close()
                        batchurls = [x.strip() for x in batchurls]
                        # Skip empty lines and lines starting with '#', '/' or ';'
                        batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
                except IOError:
                        sys.exit(u'ERROR: batch file could not be read')
        all_urls = batchurls + args

        # Conflicting, missing and erroneous options
        if opts.usenetrc and (opts.username is not None or opts.password is not None):
                parser.error(u'using .netrc conflicts with giving username/password')
        if opts.password is not None and opts.username is None:
                parser.error(u'account username missing')
        if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
                parser.error(u'using output template conflicts with using title, literal title or auto number')
        if opts.usetitle and opts.useliteral:
                parser.error(u'using title conflicts with using literal title')
        if opts.username is not None and opts.password is None:
                opts.password = getpass.getpass(u'Type account password and press return:')
        if opts.ratelimit is not None:
                numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
                if numeric_limit is None:
                        parser.error(u'invalid rate limit specified')
                opts.ratelimit = numeric_limit
        if opts.retries is not None:
                try:
                        opts.retries = long(opts.retries)
                except (TypeError, ValueError):
                        parser.error(u'invalid retry count specified')
        try:
                opts.playliststart = int(opts.playliststart)
                if opts.playliststart <= 0:
                        raise ValueError(u'Playlist start must be positive')
        except (TypeError, ValueError):
                parser.error(u'invalid playlist start number specified')
        try:
                opts.playlistend = int(opts.playlistend)
                # -1 is the option's default value ("end at last" per its help text)
                if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
                        raise ValueError(u'Playlist end must be greater than playlist start')
        except (TypeError, ValueError):
                parser.error(u'invalid playlist end number specified')
        if opts.extractaudio:
                if opts.audioformat not in ['best', 'aac', 'mp3']:
                        parser.error(u'invalid audio format specified')

        # Information extractors
        youtube_ie = YoutubeIE()
        metacafe_ie = MetacafeIE(youtube_ie)
        dailymotion_ie = DailymotionIE()
        youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
        youtube_user_ie = YoutubeUserIE(youtube_ie)
        youtube_search_ie = YoutubeSearchIE(youtube_ie)
        google_ie = GoogleIE()
        google_search_ie = GoogleSearchIE(google_ie)
        photobucket_ie = PhotobucketIE()
        yahoo_ie = YahooIE()
        yahoo_search_ie = YahooSearchIE(yahoo_ie)
        deposit_files_ie = DepositFilesIE()
        facebook_ie = FacebookIE()
        bliptv_ie = BlipTVIE()
        vimeo_ie = VimeoIE()
        generic_ie = GenericIE()

        # File downloader
        fd = FileDownloader({
                'usenetrc': opts.usenetrc,
                'username': opts.username,
                'password': opts.password,
                # Every --get-* option implies both quiet and simulate
                'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
                'forceurl': opts.geturl,
                'forcetitle': opts.gettitle,
                'forcethumbnail': opts.getthumbnail,
                'forcedescription': opts.getdescription,
                'forcefilename': opts.getfilename,
                'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
                'format': opts.format,
                'format_limit': opts.format_limit,
                # The first alternative that yields a non-empty template wins:
                # an explicit template, then the all-formats ('-1') variants,
                # then the title / literal title / auto-number variants and
                # finally the plain id based name.
                'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
                        or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
                        or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
                        or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
                        or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
                        or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
                        or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
                        or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
                        or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
                        or u'%(id)s.%(ext)s'),
                'ignoreerrors': opts.ignoreerrors,
                'ratelimit': opts.ratelimit,
                'nooverwrites': opts.nooverwrites,
                'retries': opts.retries,
                'continuedl': opts.continue_dl,
                'noprogress': opts.noprogress,
                'playliststart': opts.playliststart,
                'playlistend': opts.playlistend,
                'logtostderr': opts.outtmpl == '-',
                'consoletitle': opts.consoletitle,
                'nopart': opts.nopart,
                'updatetime': opts.updatetime,
                'writedescription': opts.writedescription,
                'writeinfojson': opts.writeinfojson,
                })
        fd.add_info_extractor(youtube_search_ie)
        fd.add_info_extractor(youtube_pl_ie)
        fd.add_info_extractor(youtube_user_ie)
        fd.add_info_extractor(metacafe_ie)
        fd.add_info_extractor(dailymotion_ie)
        fd.add_info_extractor(youtube_ie)
        fd.add_info_extractor(google_ie)
        fd.add_info_extractor(google_search_ie)
        fd.add_info_extractor(photobucket_ie)
        fd.add_info_extractor(yahoo_ie)
        fd.add_info_extractor(yahoo_search_ie)
        fd.add_info_extractor(deposit_files_ie)
        fd.add_info_extractor(facebook_ie)
        fd.add_info_extractor(bliptv_ie)
        fd.add_info_extractor(vimeo_ie)

        # This must come last since it's the
        # fallback if none of the others work
        fd.add_info_extractor(generic_ie)

        # PostProcessors
        if opts.extractaudio:
                fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))

        # Update version
        if opts.update_self:
                updateSelf(fd, sys.argv[0])

        # Maybe do nothing
        if len(all_urls) < 1:
                if not opts.update_self:
                        parser.error(u'you must provide at least one URL')
                else:
                        sys.exit()
        retcode = fd.download(all_urls)

        # Dump cookie jar if requested
        if opts.cookiefile is not None:
                try:
                        jar.save()
                except (IOError, OSError):
                        sys.exit(u'ERROR: unable to save cookie jar')

        sys.exit(retcode)
3455
3456
if __name__ == '__main__':
        # Script entry point: run main() and turn the exceptions that may
        # escape from it into an exit status or an error message.
        try:
                main()
        except DownloadError:
                # No message is printed here, only the failure status is set
                sys.exit(1)
        except SameFileError:
                # sys.exit() with a string prints it to stderr and exits with status 1
                sys.exit(u'ERROR: fixed output name but more than one file to download')
        except KeyboardInterrupt:
                sys.exit(u'\nERROR: Interrupted by user')
3466
3467 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: