added option to get the available formats for a video (Closes #106)
[youtube-dl.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 __author__  = (
5         'Ricardo Garcia Gonzalez',
6         'Danny Colligan',
7         'Benjamin Johnson',
8         'Vasyl\' Vavrychuk',
9         'Witold Baryluk',
10         'Paweł Paprota',
11         'Gergely Imreh',
12         'Rogério Brito',
13         'Philipp Hagemeister',
14         'Sören Schulze',
15         )
16
17 __license__ = 'Public Domain'
18 __version__ = '2011.09.15'
19
20 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
21
22 import cookielib
23 import datetime
24 import gzip
25 import htmlentitydefs
26 import HTMLParser
27 import httplib
28 import locale
29 import math
30 import netrc
31 import os
32 import os.path
33 import re
34 import socket
35 import string
36 import subprocess
37 import sys
38 import time
39 import urllib
40 import urllib2
41 import warnings
42 import zlib
43
44 if os.name == 'nt':
45         import ctypes
46
47 try:
48         import email.utils
49 except ImportError: # Python 2.4
50         import email.Utils
51 try:
52         import cStringIO as StringIO
53 except ImportError:
54         import StringIO
55
56 # parse_qs was moved from the cgi module to the urlparse module recently.
57 try:
58         from urlparse import parse_qs
59 except ImportError:
60         from cgi import parse_qs
61
62 try:
63         import lxml.etree
64 except ImportError:
65         pass # Handled below
66
67 try:
68         import xml.etree.ElementTree
69 except ImportError: # Python<2.5: Not officially supported, but let it slip
70         warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
71
# HTTP headers sent with every request, imitating a desktop Firefox browser
# so that sites serve the same content they would to a regular user.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	'Accept-Encoding': 'gzip, deflate',
	'Accept-Language': 'en-us,en;q=0.5',
}

# Characters considered "simple" for title sanitization: ASCII letters and
# digits only, decoded to unicode for consistent text handling.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
81
82 try:
83         import json
84 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
85         import re
	class json(object):
		"""Minimal stand-in for the stdlib json module (Python < 2.6).

		Implements only loads(); code comes from trivialjson
		(https://github.com/phihag/trivialjson).
		"""
		@staticmethod
		def loads(s):
			# Input is expected to be UTF-8 encoded bytes.
			s = s.decode('UTF-8')
			def raiseError(msg, i):
				# All parse errors surface as ValueError with position context.
				raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
			def skipSpace(i, expectMore=True):
				# Advance past JSON whitespace; optionally fail on premature end.
				while i < len(s) and s[i] in ' \t\r\n':
					i += 1
				if expectMore:
					if i >= len(s):
						raiseError('Premature end', i)
				return i
			def decodeEscape(match):
				# Decode one backslash escape captured inside a JSON string.
				esc = match.group(1)
				_STATIC = {
					'"': '"',
					'\\': '\\',
					'/': '/',
					'b': unichr(0x8),
					'f': unichr(0xc),
					'n': '\n',
					'r': '\r',
					't': '\t',
				}
				if esc in _STATIC:
					return _STATIC[esc]
				if esc[0] == 'u':
					if len(esc) == 1+4:
						return unichr(int(esc[1:5], 16))
					# Surrogate pair \uD8xx\uDCxx -> single non-BMP code point.
					if len(esc) == 5+6 and esc[5:7] == '\\u':
						hi = int(esc[1:5], 16)
						low = int(esc[7:11], 16)
						return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
				raise ValueError('Unknown escape ' + str(esc))
			def parseString(i):
				i += 1
				e = i
				# Find the closing quote; an odd number of preceding
				# backslashes means the quote itself is escaped.
				while True:
					e = s.index('"', e)
					bslashes = 0
					while s[e-bslashes-1] == '\\':
						bslashes += 1
					if bslashes % 2 == 1:
						e += 1
						continue
					break
				rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
				stri = rexp.sub(decodeEscape, s[i:e])
				return (e+1,stri)
			def parseObj(i):
				# Parse a {...} object starting at the opening brace.
				i += 1
				res = {}
				i = skipSpace(i)
				if s[i] == '}': # Empty dictionary
					return (i+1,res)
				while True:
					if s[i] != '"':
						raiseError('Expected a string object key', i)
					i,key = parseString(i)
					i = skipSpace(i)
					if i >= len(s) or s[i] != ':':
						raiseError('Expected a colon', i)
					i,val = parse(i+1)
					res[key] = val
					i = skipSpace(i)
					if s[i] == '}':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected comma or closing curly brace', i)
					i = skipSpace(i+1)
			def parseArray(i):
				# Parse a [...] array starting at the opening bracket.
				res = []
				i = skipSpace(i+1)
				if s[i] == ']': # Empty array
					return (i+1,res)
				while True:
					i,val = parse(i)
					res.append(val)
					i = skipSpace(i) # Raise exception if premature end
					if s[i] == ']':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected a comma or closing bracket', i)
					i = skipSpace(i+1)
			def parseDiscrete(i):
				# Parse the literals true/false/null.
				for k,v in {'true': True, 'false': False, 'null': None}.items():
					if s.startswith(k, i):
						return (i+len(k), v)
				raiseError('Not a boolean (or null)', i)
			def parseNumber(i):
				mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
				if mobj is None:
					raiseError('Not a number', i)
				nums = mobj.group(1)
				# Any fraction or exponent marker makes it a float.
				if '.' in nums or 'e' in nums or 'E' in nums:
					return (i+len(nums), float(nums))
				return (i+len(nums), int(nums))
			# Dispatch on the first significant character; numbers are the fallback.
			CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
			def parse(i):
				i = skipSpace(i)
				i,res = CHARMAP.get(s[i], parseNumber)(i)
				i = skipSpace(i, False)
				return (i,res)
			i,res = parse(0)
			if i < len(s):
				raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
			return res
194
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks: if the
	reported encoding cannot actually encode text, fall back to UTF-8.
	"""
	try:
		pref = locale.getpreferredencoding()
		# Sanity-check that the codec machinery accepts the reported name;
		# some platforms report encodings that cannot be used for output.
		u'TEST'.encode(pref)
	except Exception:
		# Narrowed from a bare except: so Ctrl-C/SystemExit still propagate.
		pref = 'UTF-8'
	return pref
210
211
212 def htmlentity_transform(matchobj):
213         """Transforms an HTML entity to a Unicode character.
214
215         This function receives a match object and is intended to be used with
216         the re.sub() function.
217         """
218         entity = matchobj.group(1)
219
220         # Known non-numeric HTML entity
221         if entity in htmlentitydefs.name2codepoint:
222                 return unichr(htmlentitydefs.name2codepoint[entity])
223
224         # Unicode character
225         mobj = re.match(ur'(?u)#(x?\d+)', entity)
226         if mobj is not None:
227                 numstr = mobj.group(1)
228                 if numstr.startswith(u'x'):
229                         base = 16
230                         numstr = u'0%s' % numstr
231                 else:
232                         base = 10
233                 return unichr(long(numstr, base))
234
235         # Unknown entity in name, return its literal representation
236         return (u'&%s;' % entity)
237
238
239 def sanitize_title(utitle):
240         """Sanitizes a video title so it could be used as part of a filename."""
241         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
242         return utitle.replace(unicode(os.sep), u'%')
243
244
245 def sanitize_open(filename, open_mode):
246         """Try to open the given filename, and slightly tweak it if this fails.
247
248         Attempts to open the given filename. If this fails, it tries to change
249         the filename slightly, step by step, until it's either able to open it
250         or it fails and raises a final exception, like the standard open()
251         function.
252
253         It returns the tuple (stream, definitive_file_name).
254         """
255         try:
256                 if filename == u'-':
257                         if sys.platform == 'win32':
258                                 import msvcrt
259                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
260                         return (sys.stdout, filename)
261                 stream = open(filename, open_mode)
262                 return (stream, filename)
263         except (IOError, OSError), err:
264                 # In case of error, try to remove win32 forbidden chars
265                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
266
267                 # An exception here should be caught in the caller
268                 stream = open(filename, open_mode)
269                 return (stream, filename)
270
271
def timeconvert(timestr):
	"""Convert RFC 2822 defined time string into system timestamp.

	Returns None when the string cannot be parsed.
	"""
	parsed = email.utils.parsedate_tz(timestr)
	if parsed is None:
		return None
	return email.utils.mktime_tz(parsed)
279
280
class DownloadError(Exception):
	"""Download Error exception.

	This exception may be thrown by FileDownloader objects if they are not
	configured to continue on errors. They will contain the appropriate
	error message.
	"""
	# Raised by FileDownloader.trouble() unless 'ignoreerrors' is set.
	pass
289
290
class SameFileError(Exception):
	"""Same File exception.

	This exception will be thrown by FileDownloader objects if they detect
	multiple files would have to be downloaded to the same file on disk
	(e.g. a fixed output template used for several videos).
	"""
	pass
298
299
class PostProcessingError(Exception):
	"""Post Processing exception.

	This exception may be raised by PostProcessor's .run() method to
	indicate an error in the postprocessing task.
	"""
	pass
307
308
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	This exception will be thrown when a video is requested
	in a format that is not available for that video.
	"""
	pass
316
317
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	This exception may be raised by FileDownloader objects when a file they
	download is too small for what the server announced first, indicating
	the connection was probably interrupted.
	"""
	# Both in bytes
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		# downloaded: number of bytes actually received
		self.downloaded = downloaded
		# expected: number of bytes announced by the server
		self.expected = expected
332
333
class YoutubeDLHandler(urllib2.HTTPHandler):
	"""Handler for HTTP requests and responses.

	This class, when installed with an OpenerDirector, automatically adds
	the standard headers to every HTTP request and handles gzipped and
	deflated responses from web servers. If compression is to be avoided in
	a particular request, the original request in the program code only has
	to include the HTTP header "Youtubedl-No-Compression", which will be
	removed before making the real request.

	Part of this code was copied from:

	http://techknack.net/python-urllib2-handlers/

	Andrew Rowls, the author of that code, agreed to release it to the
	public domain.
	"""

	@staticmethod
	def deflate(data):
		# Some servers send raw DEFLATE data without the zlib header; try
		# that interpretation first, then plain zlib decompression.
		try:
			return zlib.decompress(data, -zlib.MAX_WBITS)
		except zlib.error:
			return zlib.decompress(data)

	@staticmethod
	def addinfourl_wrapper(stream, headers, url, code):
		# Newer addinfourl accepts the response code in the constructor
		# (detected via the getcode attribute); older versions need it
		# assigned afterwards.
		if hasattr(urllib2.addinfourl, 'getcode'):
			return urllib2.addinfourl(stream, headers, url, code)
		ret = urllib2.addinfourl(stream, headers, url)
		ret.code = code
		return ret

	def http_request(self, req):
		# Force the standard headers, overriding any same-named header
		# already present on the request.
		for h in std_headers:
			if h in req.headers:
				del req.headers[h]
			req.add_header(h, std_headers[h])
		# NOTE(review): the odd casing ('Youtubedl-no-compression') is
		# presumably how urllib2 normalizes header names — verify.
		if 'Youtubedl-no-compression' in req.headers:
			if 'Accept-encoding' in req.headers:
				del req.headers['Accept-encoding']
			del req.headers['Youtubedl-no-compression']
		return req

	def http_response(self, req, resp):
		old_resp = resp
		# gzip: wrap the raw body in a decompressing file object, keeping
		# the original headers, URL, code and msg.
		if resp.headers.get('Content-encoding', '') == 'gzip':
			gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		# deflate
		if resp.headers.get('Content-encoding', '') == 'deflate':
			gz = StringIO.StringIO(self.deflate(resp.read()))
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		return resp
391
392
393 class FileDownloader(object):
394         """File Downloader class.
395
396         File downloader objects are the ones responsible of downloading the
397         actual video file and writing it to disk if the user has requested
398         it, among some other tasks. In most cases there should be one per
399         program. As, given a video URL, the downloader doesn't know how to
400         extract all the needed information, task that InfoExtractors do, it
401         has to pass the URL to one of them.
402
403         For this, file downloader objects have a method that allows
404         InfoExtractors to be registered in a given order. When it is passed
405         a URL, the file downloader handles it to the first InfoExtractor it
406         finds that reports being able to handle it. The InfoExtractor extracts
407         all the information about the video or videos the URL refers to, and
408         asks the FileDownloader to process the video information, possibly
409         downloading the video.
410
411         File downloaders accept a lot of parameters. In order not to saturate
412         the object constructor with arguments, it receives a dictionary of
413         options instead. These options are available through the params
414         attribute for the InfoExtractors to use. The FileDownloader also
415         registers itself as the downloader in charge for the InfoExtractors
416         that are added to it, so this is a "mutual registration".
417
418         Available options:
419
420         username:         Username for authentication purposes.
421         password:         Password for authentication purposes.
422         usenetrc:         Use netrc for authentication instead.
423         quiet:            Do not print messages to stdout.
424         forceurl:         Force printing final URL.
425         forcetitle:       Force printing title.
426         forcethumbnail:   Force printing thumbnail URL.
427         forcedescription: Force printing description.
428         forcefilename:    Force printing final filename.
429         simulate:         Do not download the video files.
430         format:           Video format code.
431         format_limit:     Highest quality format to try.
432         outtmpl:          Template for output names.
433         ignoreerrors:     Do not stop on download errors.
434         ratelimit:        Download speed limit, in bytes/sec.
435         nooverwrites:     Prevent overwriting files.
436         retries:          Number of times to retry for HTTP error 5xx
437         continuedl:       Try to continue downloads if possible.
438         noprogress:       Do not print the progress bar.
439         playliststart:    Playlist item to start at.
440         playlistend:      Playlist item to end at.
441         matchtitle:       Download only matching titles.
442         rejecttitle:      Reject downloads for matching titles.
443         logtostderr:      Log messages to stderr instead of stdout.
444         consoletitle:     Display progress in console window's titlebar.
445         nopart:           Do not use temporary .part files.
446         updatetime:       Use the Last-modified header to set output file timestamps.
447         writedescription: Write the video description to a .description file
448         writeinfojson:    Write the video description to a .info.json file
449         """
450
451         params = None
452         _ies = []
453         _pps = []
454         _download_retcode = None
455         _num_downloads = None
456         _screen_file = None
457
	def __init__(self, params):
		"""Create a FileDownloader object with the given options."""
		self._ies = []
		self._pps = []
		self._download_retcode = 0
		self._num_downloads = 0
		# Status messages go to stderr when 'logtostderr' is set, keeping
		# stdout clean (e.g. when the video itself is written to stdout).
		self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
		self.params = params
466
467         @staticmethod
468         def format_bytes(bytes):
469                 if bytes is None:
470                         return 'N/A'
471                 if type(bytes) is str:
472                         bytes = float(bytes)
473                 if bytes == 0.0:
474                         exponent = 0
475                 else:
476                         exponent = long(math.log(bytes, 1024.0))
477                 suffix = 'bkMGTPEZY'[exponent]
478                 converted = float(bytes) / float(1024 ** exponent)
479                 return '%.2f%s' % (converted, suffix)
480
481         @staticmethod
482         def calc_percent(byte_counter, data_len):
483                 if data_len is None:
484                         return '---.-%'
485                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
486
487         @staticmethod
488         def calc_eta(start, now, total, current):
489                 if total is None:
490                         return '--:--'
491                 dif = now - start
492                 if current == 0 or dif < 0.001: # One millisecond
493                         return '--:--'
494                 rate = float(current) / dif
495                 eta = long((float(total) - float(current)) / rate)
496                 (eta_mins, eta_secs) = divmod(eta, 60)
497                 if eta_mins > 99:
498                         return '--:--'
499                 return '%02d:%02d' % (eta_mins, eta_secs)
500
501         @staticmethod
502         def calc_speed(start, now, bytes):
503                 dif = now - start
504                 if bytes == 0 or dif < 0.001: # One millisecond
505                         return '%10s' % '---b/s'
506                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
507
508         @staticmethod
509         def best_block_size(elapsed_time, bytes):
510                 new_min = max(bytes / 2.0, 1.0)
511                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
512                 if elapsed_time < 0.001:
513                         return long(new_max)
514                 rate = bytes / elapsed_time
515                 if rate > new_max:
516                         return long(new_max)
517                 if rate < new_min:
518                         return long(new_min)
519                 return long(rate)
520
	@staticmethod
	def parse_bytes(bytestr):
		"""Parse a string indicating a byte quantity into a long integer.

		Accepts an optional 1024-based suffix (k, M, G, ...); returns None
		when the string does not look like a byte quantity.
		"""
		matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
		if matchobj is None:
			return None
		number = float(matchobj.group(1))
		# A missing suffix yields '', and 'bkmgtpezy'.index('') == 0, so a
		# bare number gets multiplier 1024**0 == 1.
		multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
		return long(round(number * multiplier))
530
	def add_info_extractor(self, ie):
		"""Add an InfoExtractor object to the end of the list."""
		self._ies.append(ie)
		# Mutual registration: the IE keeps a back-reference to report
		# results and read download parameters.
		ie.set_downloader(self)
535
	def add_post_processor(self, pp):
		"""Add a PostProcessor object to the end of the chain."""
		self._pps.append(pp)
		# Mutual registration, mirroring add_info_extractor().
		pp.set_downloader(self)
540
	def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode.

		skip_eol suppresses the trailing newline (used for progress lines);
		ignore_encoding_errors drops messages the terminal encoding cannot
		represent instead of raising.
		"""
		try:
			if not self.params.get('quiet', False):
				terminator = [u'\n', u''][skip_eol]
				print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
			# Flush even in quiet mode so earlier output is not held back.
			self._screen_file.flush()
		except (UnicodeEncodeError), err:
			if not ignore_encoding_errors:
				raise
551
	def to_stderr(self, message):
		"""Print message to stderr, encoded for the current locale."""
		print >>sys.stderr, message.encode(preferredencoding())
555
	def to_cons_title(self, message):
		"""Set console/terminal window title to message (if enabled)."""
		if not self.params.get('consoletitle', False):
			return
		if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
			# c_wchar_p() might not be necessary if `message` is
			# already of type unicode()
			ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
		elif 'TERM' in os.environ:
			# xterm-style escape sequence for setting the window title.
			sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
566
567         def fixed_template(self):
568                 """Checks if the output template is fixed."""
569                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
570
	def trouble(self, message=None):
		"""Determine action to take when a download problem appears.

		Depending on if the downloader has been configured to ignore
		download errors or not, this method may throw an exception or
		not when errors are found, after printing the message.
		"""
		if message is not None:
			self.to_stderr(message)
		if not self.params.get('ignoreerrors', False):
			raise DownloadError(message)
		# Only reached when errors are ignored: remember that something
		# went wrong so the process can exit non-zero at the end.
		self._download_retcode = 1
583
584         def slow_down(self, start_time, byte_counter):
585                 """Sleep if the download speed is over the rate limit."""
586                 rate_limit = self.params.get('ratelimit', None)
587                 if rate_limit is None or byte_counter == 0:
588                         return
589                 now = time.time()
590                 elapsed = now - start_time
591                 if elapsed <= 0.0:
592                         return
593                 speed = float(byte_counter) / elapsed
594                 if speed > rate_limit:
595                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
596
	def temp_name(self, filename):
		"""Returns a temporary filename for the given filename."""
		# Keep the real name when 'nopart' is set, when writing to stdout,
		# or when the target exists but is not a regular file (e.g. a named
		# pipe), where a '.part' rename would not make sense.
		if self.params.get('nopart', False) or filename == u'-' or \
				(os.path.exists(filename) and not os.path.isfile(filename)):
			return filename
		return filename + u'.part'
603
604         def undo_temp_name(self, filename):
605                 if filename.endswith(u'.part'):
606                         return filename[:-len(u'.part')]
607                 return filename
608
	def try_rename(self, old_filename, new_filename):
		"""Rename old_filename to new_filename, routing failures to trouble()."""
		try:
			# Renaming onto itself is a no-op (nopart / special-file cases).
			if old_filename == new_filename:
				return
			os.rename(old_filename, new_filename)
		except (IOError, OSError), err:
			self.trouble(u'ERROR: unable to rename file')
616
617         def try_utime(self, filename, last_modified_hdr):
618                 """Try to set the last-modified time of the given file."""
619                 if last_modified_hdr is None:
620                         return
621                 if not os.path.isfile(filename):
622                         return
623                 timestr = last_modified_hdr
624                 if timestr is None:
625                         return
626                 filetime = timeconvert(timestr)
627                 if filetime is None:
628                         return filetime
629                 try:
630                         os.utime(filename, (time.time(), filetime))
631                 except:
632                         pass
633                 return filetime
634
	def report_writedescription(self, descfn):
		"""Report that the description file is being written."""
		# Encoding errors are ignored: the filename may not be printable.
		self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
638
	def report_writeinfojson(self, infofn):
		"""Report that the .info.json metadata file is being written."""
		self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
642
	def report_destination(self, filename):
		"""Report destination filename."""
		self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
646
	def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
		"""Report download progress."""
		if self.params.get('noprogress', False):
			return
		# Leading \r rewrites the current line in place; skip_eol keeps the
		# cursor on it so the next update overwrites it again.
		self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
				(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
		self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
				(percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
655
	def report_resuming_byte(self, resume_len):
		"""Report attempt to resume at given byte."""
		self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
659
	def report_retry(self, count, retries):
		"""Report retry in case of HTTP error 5xx."""
		self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
663
	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		try:
			self.to_screen(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			# Fall back to a message without the (unprintable) filename.
			self.to_screen(u'[download] The file has already been downloaded')
670
	def report_unable_to_resume(self):
		"""Report it was impossible to resume download."""
		self.to_screen(u'[download] Unable to resume')
674
675         def report_finish(self):
676                 """Report download finished."""
677                 if self.params.get('noprogress', False):
678                         self.to_screen(u'[download] Download completed')
679                 else:
680                         self.to_screen(u'')
681
682         def increment_downloads(self):
683                 """Increment the ordinal that assigns a number to each file."""
684                 self._num_downloads += 1
685
686         def prepare_filename(self, info_dict):
687                 """Generate the output filename."""
688                 try:
689                         template_dict = dict(info_dict)
690                         template_dict['epoch'] = unicode(long(time.time()))
691                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
692                         filename = self.params['outtmpl'] % template_dict
693                         return filename
694                 except (ValueError, KeyError), err:
695                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
696                         return None
697
	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor.

		Handles the forced-printing options, simulation mode, the title
		match/reject filters, writing of the ".description" and
		".info.json" side files and, finally, the actual download plus
		postprocessing.
		"""
		# May be None if the output template is invalid; only checked
		# after the forced printings, which do not need a filename.
		filename = self.prepare_filename(info_dict)
		
		# Forced printings
		if self.params.get('forcetitle', False):
			print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forceurl', False):
			print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
			print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcedescription', False) and 'description' in info_dict:
			print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcefilename', False) and filename is not None:
			print filename.encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forceformat', False):
			print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')

		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			return

		if filename is None:
			return

		# Title-based filtering (--match-title / --reject-title).
		matchtitle=self.params.get('matchtitle',False)
		rejecttitle=self.params.get('rejecttitle',False)
		title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
		if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
			self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
			return
		if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
			self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
			return
			
		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists and will be skipped')
			return

		# Create the destination directory if it does not exist yet.
		try:
			dn = os.path.dirname(filename)
			if dn != '' and not os.path.exists(dn):
				os.makedirs(dn)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to create directory ' + unicode(err))
			return

		# Optionally store the description in a ".description" side file.
		if self.params.get('writedescription', False):
			try:
				descfn = filename + '.description'
				self.report_writedescription(descfn)
				descfile = open(descfn, 'wb')
				try:
					descfile.write(info_dict['description'].encode('utf-8'))
				finally:
					descfile.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write description file ' + descfn)
				return

		# Optionally store the metadata in a ".info.json" side file.
		if self.params.get('writeinfojson', False):
			infofn = filename + '.info.json'
			self.report_writeinfojson(infofn)
			# Probe for a usable json module (absent on Python < 2.6
			# unless a backport is installed).
			try:
				json.dump
			except (NameError,AttributeError):
				self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
				return
			try:
				infof = open(infofn, 'wb')
				try:
					json.dump(info_dict, infof)
				finally:
					infof.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
				return

		if not self.params.get('skip_download', False):
			try:
				# NOTE(review): confirm _do_download returns a
				# (success, dict) pair on every code path.
				success,add_data = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
				info_dict.update(add_data)
			except (OSError, IOError), err:
				raise UnavailableVideoError
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self.trouble(u'ERROR: unable to download video data: %s' % str(err))
				return
			except (ContentTooShortError, ), err:
				self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
				return

			if success:
				try:
					self.post_process(filename, info_dict)
				except (PostProcessingError), err:
					self.trouble(u'ERROR: postprocessing: %s' % str(err))
					return
795
796         def download(self, url_list):
797                 """Download a given list of URLs."""
798                 if len(url_list) > 1 and self.fixed_template():
799                         raise SameFileError(self.params['outtmpl'])
800
801                 for url in url_list:
802                         suitable_found = False
803                         for ie in self._ies:
804                                 # Go to next InfoExtractor if not suitable
805                                 if not ie.suitable(url):
806                                         continue
807
808                                 # Suitable InfoExtractor found
809                                 suitable_found = True
810
811                                 # Extract information from URL and process it
812                                 ie.extract(url)
813
814                                 # Suitable InfoExtractor had been found; go to next URL
815                                 break
816
817                         if not suitable_found:
818                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
819
820                 return self._download_retcode
821
822         def post_process(self, filename, ie_info):
823                 """Run the postprocessing chain on the given file."""
824                 info = dict(ie_info)
825                 info['filepath'] = filename
826                 for pp in self._pps:
827                         info = pp.run(info)
828                         if info is None:
829                                 break
830
831         def _download_with_rtmpdump(self, filename, url, player_url):
832                 self.report_destination(filename)
833                 tmpfilename = self.temp_name(filename)
834
835                 # Check for rtmpdump first
836                 try:
837                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
838                 except (OSError, IOError):
839                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
840                         return False
841
842                 # Download using rtmpdump. rtmpdump returns exit code 2 when
843                 # the connection was interrumpted and resuming appears to be
844                 # possible. This is part of rtmpdump's normal usage, AFAIK.
845                 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
846                 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
847                 while retval == 2 or retval == 1:
848                         prevsize = os.path.getsize(tmpfilename)
849                         self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
850                         time.sleep(5.0) # This seems to be needed
851                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
852                         cursize = os.path.getsize(tmpfilename)
853                         if prevsize == cursize and retval == 1:
854                                 break
855                          # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
856                         if prevsize == cursize and retval == 2 and cursize > 1024:
857                                 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
858                                 retval = 0
859                                 break
860                 if retval == 0:
861                         self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
862                         self.try_rename(tmpfilename, filename)
863                         return True
864                 else:
865                         self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
866                         return False
867
868         def _do_download(self, filename, url, player_url):
869                 # Check file already present
870                 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
871                         self.report_file_already_downloaded(filename)
872                         return True
873
874                 # Attempt to download using rtmpdump
875                 if url.startswith('rtmp'):
876                         return self._download_with_rtmpdump(filename, url, player_url)
877
878                 tmpfilename = self.temp_name(filename)
879                 stream = None
880                 open_mode = 'wb'
881
882                 # Do not include the Accept-Encoding header
883                 headers = {'Youtubedl-no-compression': 'True'}
884                 basic_request = urllib2.Request(url, None, headers)
885                 request = urllib2.Request(url, None, headers)
886
887                 # Establish possible resume length
888                 if os.path.isfile(tmpfilename):
889                         resume_len = os.path.getsize(tmpfilename)
890                 else:
891                         resume_len = 0
892
893                 # Request parameters in case of being able to resume
894                 if self.params.get('continuedl', False) and resume_len != 0:
895                         self.report_resuming_byte(resume_len)
896                         request.add_header('Range', 'bytes=%d-' % resume_len)
897                         open_mode = 'ab'
898
899                 count = 0
900                 retries = self.params.get('retries', 0)
901                 while count <= retries:
902                         # Establish connection
903                         try:
904                                 data = urllib2.urlopen(request)
905                                 break
906                         except (urllib2.HTTPError, ), err:
907                                 if (err.code < 500 or err.code >= 600) and err.code != 416:
908                                         # Unexpected HTTP error
909                                         raise
910                                 elif err.code == 416:
911                                         # Unable to resume (requested range not satisfiable)
912                                         try:
913                                                 # Open the connection again without the range header
914                                                 data = urllib2.urlopen(basic_request)
915                                                 content_length = data.info()['Content-Length']
916                                         except (urllib2.HTTPError, ), err:
917                                                 if err.code < 500 or err.code >= 600:
918                                                         raise
919                                         else:
920                                                 # Examine the reported length
921                                                 if (content_length is not None and
922                                                                 (resume_len - 100 < long(content_length) < resume_len + 100)):
923                                                         # The file had already been fully downloaded.
924                                                         # Explanation to the above condition: in issue #175 it was revealed that
925                                                         # YouTube sometimes adds or removes a few bytes from the end of the file,
926                                                         # changing the file size slightly and causing problems for some users. So
927                                                         # I decided to implement a suggested change and consider the file
928                                                         # completely downloaded if the file size differs less than 100 bytes from
929                                                         # the one in the hard drive.
930                                                         self.report_file_already_downloaded(filename)
931                                                         self.try_rename(tmpfilename, filename)
932                                                         return True
933                                                 else:
934                                                         # The length does not match, we start the download over
935                                                         self.report_unable_to_resume()
936                                                         open_mode = 'wb'
937                                                         break
938                         # Retry
939                         count += 1
940                         if count <= retries:
941                                 self.report_retry(count, retries)
942
943                 if count > retries:
944                         self.trouble(u'ERROR: giving up after %s retries' % retries)
945                         return False
946
947                 data_len = data.info().get('Content-length', None)
948                 if data_len is not None:
949                         data_len = long(data_len) + resume_len
950                 data_len_str = self.format_bytes(data_len)
951                 byte_counter = 0 + resume_len
952                 block_size = 1024
953                 start = time.time()
954                 while True:
955                         # Download and write
956                         before = time.time()
957                         data_block = data.read(block_size)
958                         after = time.time()
959                         if len(data_block) == 0:
960                                 break
961                         byte_counter += len(data_block)
962
963                         # Open file just in time
964                         if stream is None:
965                                 try:
966                                         (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
967                                         assert stream is not None
968                                         filename = self.undo_temp_name(tmpfilename)
969                                         self.report_destination(filename)
970                                 except (OSError, IOError), err:
971                                         self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
972                                         return False
973                         try:
974                                 stream.write(data_block)
975                         except (IOError, OSError), err:
976                                 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
977                                 return False
978                         block_size = self.best_block_size(after - before, len(data_block))
979
980                         # Progress message
981                         percent_str = self.calc_percent(byte_counter, data_len)
982                         eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
983                         speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
984                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
985
986                         # Apply rate limit
987                         self.slow_down(start, byte_counter - resume_len)
988
989                 if stream is None:
990                         self.trouble(u'\nERROR: Did not get any data blocks')
991                         return False
992                 stream.close()
993                 self.report_finish()
994                 if data_len is not None and byte_counter != data_len:
995                         raise ContentTooShortError(byte_counter, long(data_len))
996                 self.try_rename(tmpfilename, filename)
997
998                 # Update file modification time
999                 filetime = None
1000                 if self.params.get('updatetime', True):
1001                         filetime = self.try_utime(filename, data.info().get('last-modified', None))
1002
1003                 return True, {'filetime': filetime}
1004
1005
class InfoExtractor(object):
	"""Base class for all information extractors.

	An information extractor turns a URL into one or more dictionaries of
	video metadata, which are handed to the FileDownloader for further
	processing (typically downloading the video to the file system).
	Every dictionary must contain the following fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	Optional fields, consumed only by the forced-printing options (and by
	front ends such as youtube2mp3 that use youtube-dl as a backend for
	video search):

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Subclasses should override _real_initialize() and _real_extract(),
	define a _VALID_URL regular expression and, usually, be registered in
	the list of extractors.
	"""

	# Whether _real_initialize() has already run for this instance.
	_ready = False
	# The FileDownloader this extractor reports to (may be None).
	_downloader = None

	def __init__(self, downloader=None):
		"""Create the extractor, optionally attaching a downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Return True when this extractor can handle the given URL."""
		return re.match(self._VALID_URL, url) is not None

	def initialize(self):
		"""Run one-time initialization (authentication, etc.)."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Initialize if needed, then extract and return the URL's info."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Attach the downloader this extractor reports to."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
1074
1075
1076 class YoutubeIE(InfoExtractor):
1077         """Information extractor for youtube.com."""
1078
1079         _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1080         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1081         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1082         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1083         _NETRC_MACHINE = 'youtube'
1084         # Listed in order of quality
1085         _available_formats = ['38', '37', '45', '22', '43', '35', '34', '18', '6', '5', '17', '13']
1086         _video_extensions = {
1087                 '13': '3gp',
1088                 '17': 'mp4',
1089                 '18': 'mp4',
1090                 '22': 'mp4',
1091                 '37': 'mp4',
1092                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1093                 '43': 'webm',
1094                 '45': 'webm',
1095         }
1096         IE_NAME = u'youtube'
1097
1098         def report_lang(self):
1099                 """Report attempt to set language."""
1100                 self._downloader.to_screen(u'[youtube] Setting language')
1101
1102         def report_login(self):
1103                 """Report attempt to log in."""
1104                 self._downloader.to_screen(u'[youtube] Logging in')
1105
1106         def report_age_confirmation(self):
1107                 """Report attempt to confirm age."""
1108                 self._downloader.to_screen(u'[youtube] Confirming age')
1109
1110         def report_video_webpage_download(self, video_id):
1111                 """Report attempt to download video webpage."""
1112                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1113
1114         def report_video_info_webpage_download(self, video_id):
1115                 """Report attempt to download video info webpage."""
1116                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1117
1118         def report_information_extraction(self, video_id):
1119                 """Report attempt to extract video information."""
1120                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1121
	def report_unavailable_format(self, video_id, format):
		"""Report that the requested video format is not available."""
		self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1125
1126         def report_rtmp_download(self):
1127                 """Indicate the download will use the RTMP protocol."""
1128                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1129
	def _real_initialize(self):
		"""Prepare the YouTube session: set the site language and, when
		credentials are available, log in and confirm the age gate.

		Credentials come from --username/--password or, with --netrc,
		from the 'youtube' machine entry in ~/.netrc. All failures here
		are soft (warnings) except the age confirmation, which is an
		error.
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language (forces English pages so the regexps below and in
		# _real_extract match reliably).
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':		'/',
				'action_login':	'Log In',
				'username':	username,
				'password':	password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the response still contains the login form, the
			# credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':		'/',
				'action_confirm':	'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return
1198
1199         def _real_extract(self, url):
1200                 # Extract video id from URL
1201                 mobj = re.match(self._VALID_URL, url)
1202                 if mobj is None:
1203                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1204                         return
1205                 video_id = mobj.group(2)
1206
1207                 # Get video webpage
1208                 self.report_video_webpage_download(video_id)
1209                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id)
1210                 try:
1211                         video_webpage = urllib2.urlopen(request).read()
1212                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1213                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1214                         return
1215
1216                 # Attempt to extract SWF player URL
1217                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1218                 if mobj is not None:
1219                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1220                 else:
1221                         player_url = None
1222
1223                 # Get video info
1224                 self.report_video_info_webpage_download(video_id)
1225                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1226                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1227                                         % (video_id, el_type))
1228                         request = urllib2.Request(video_info_url)
1229                         try:
1230                                 video_info_webpage = urllib2.urlopen(request).read()
1231                                 video_info = parse_qs(video_info_webpage)
1232                                 if 'token' in video_info:
1233                                         break
1234                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1235                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1236                                 return
1237                 if 'token' not in video_info:
1238                         if 'reason' in video_info:
1239                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1240                         else:
1241                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1242                         return
1243
1244                 # Start extracting information
1245                 self.report_information_extraction(video_id)
1246
1247                 # uploader
1248                 if 'author' not in video_info:
1249                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1250                         return
1251                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1252
1253                 # title
1254                 if 'title' not in video_info:
1255                         self._downloader.trouble(u'ERROR: unable to extract video title')
1256                         return
1257                 video_title = urllib.unquote_plus(video_info['title'][0])
1258                 video_title = video_title.decode('utf-8')
1259                 video_title = sanitize_title(video_title)
1260
1261                 # simplified title
1262                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1263                 simple_title = simple_title.strip(ur'_')
1264
1265                 # thumbnail image
1266                 if 'thumbnail_url' not in video_info:
1267                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1268                         video_thumbnail = ''
1269                 else:   # don't panic if we can't find it
1270                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1271
1272                 # upload date
1273                 upload_date = u'NA'
1274                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1275                 if mobj is not None:
1276                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1277                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1278                         for expression in format_expressions:
1279                                 try:
1280                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1281                                 except:
1282                                         pass
1283
1284                 # description
1285                 try:
1286                         lxml.etree
1287                 except NameError:
1288                         video_description = u'No description available.'
1289                         if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1290                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1291                                 if mobj is not None:
1292                                         video_description = mobj.group(1).decode('utf-8')
1293                 else:
1294                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1295                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1296                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1297                         # TODO use another parser
1298
1299                 # token
1300                 video_token = urllib.unquote_plus(video_info['token'][0])
1301
1302                 # Decide which formats to download
1303                 req_format = self._downloader.params.get('format', None)
1304
1305                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1306                         self.report_rtmp_download()
1307                         video_url_list = [(None, video_info['conn'][0])]
1308                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1309                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1310                         url_data = [parse_qs(uds) for uds in url_data_strs]
1311                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1312                         url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1313
1314                         format_limit = self._downloader.params.get('format_limit', None)
1315                         if format_limit is not None and format_limit in self._available_formats:
1316                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1317                         else:
1318                                 format_list = self._available_formats
1319                         existing_formats = [x for x in format_list if x in url_map]
1320                         if len(existing_formats) == 0:
1321                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1322                                 return
1323                         if req_format is None:
1324                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1325                         elif req_format == 'worst':
1326                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1327                         elif req_format == '-1':
1328                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1329                         else:
1330                                 # Specific format
1331                                 if req_format not in url_map:
1332                                         self._downloader.trouble(u'ERROR: requested format not available')
1333                                         return
1334                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
1335                 else:
1336                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1337                         return
1338
1339                 for format_param, video_real_url in video_url_list:
1340                         # At this point we have a new video
1341                         self._downloader.increment_downloads()
1342
1343                         # Extension
1344                         video_extension = self._video_extensions.get(format_param, 'flv')
1345
1346                         try:
1347                                 # Process video information
1348                                 self._downloader.process_info({
1349                                         'id':           video_id.decode('utf-8'),
1350                                         'url':          video_real_url.decode('utf-8'),
1351                                         'uploader':     video_uploader.decode('utf-8'),
1352                                         'upload_date':  upload_date,
1353                                         'title':        video_title,
1354                                         'stitle':       simple_title,
1355                                         'ext':          video_extension.decode('utf-8'),
1356                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1357                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1358                                         'description':  video_description,
1359                                         'player_url':   player_url,
1360                                 })
1361                         except UnavailableVideoError, err:
1362                                 self._downloader.trouble(u'\nERROR: unable to download video')
1363
1364
1365 class MetacafeIE(InfoExtractor):
1366         """Information Extractor for metacafe.com."""
1367
1368         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1369         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1370         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1371         _youtube_ie = None
1372         IE_NAME = u'metacafe'
1373
1374         def __init__(self, youtube_ie, downloader=None):
1375                 InfoExtractor.__init__(self, downloader)
1376                 self._youtube_ie = youtube_ie
1377
1378         def report_disclaimer(self):
1379                 """Report disclaimer retrieval."""
1380                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1381
1382         def report_age_confirmation(self):
1383                 """Report attempt to confirm age."""
1384                 self._downloader.to_screen(u'[metacafe] Confirming age')
1385
1386         def report_download_webpage(self, video_id):
1387                 """Report webpage download."""
1388                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1389
1390         def report_extraction(self, video_id):
1391                 """Report information extraction."""
1392                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1393
1394         def _real_initialize(self):
1395                 # Retrieve disclaimer
1396                 request = urllib2.Request(self._DISCLAIMER)
1397                 try:
1398                         self.report_disclaimer()
1399                         disclaimer = urllib2.urlopen(request).read()
1400                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1401                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1402                         return
1403
1404                 # Confirm age
1405                 disclaimer_form = {
1406                         'filters': '0',
1407                         'submit': "Continue - I'm over 18",
1408                         }
1409                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1410                 try:
1411                         self.report_age_confirmation()
1412                         disclaimer = urllib2.urlopen(request).read()
1413                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1414                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1415                         return
1416
1417         def _real_extract(self, url):
1418                 # Extract id and simplified title from URL
1419                 mobj = re.match(self._VALID_URL, url)
1420                 if mobj is None:
1421                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1422                         return
1423
1424                 video_id = mobj.group(1)
1425
1426                 # Check if video comes from YouTube
1427                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1428                 if mobj2 is not None:
1429                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1430                         return
1431
1432                 # At this point we have a new video
1433                 self._downloader.increment_downloads()
1434
1435                 simple_title = mobj.group(2).decode('utf-8')
1436
1437                 # Retrieve video webpage to extract further information
1438                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1439                 try:
1440                         self.report_download_webpage(video_id)
1441                         webpage = urllib2.urlopen(request).read()
1442                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1443                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1444                         return
1445
1446                 # Extract URL, uploader and title from webpage
1447                 self.report_extraction(video_id)
1448                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1449                 if mobj is not None:
1450                         mediaURL = urllib.unquote(mobj.group(1))
1451                         video_extension = mediaURL[-3:]
1452
1453                         # Extract gdaKey if available
1454                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1455                         if mobj is None:
1456                                 video_url = mediaURL
1457                         else:
1458                                 gdaKey = mobj.group(1)
1459                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1460                 else:
1461                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1462                         if mobj is None:
1463                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1464                                 return
1465                         vardict = parse_qs(mobj.group(1))
1466                         if 'mediaData' not in vardict:
1467                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1468                                 return
1469                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1470                         if mobj is None:
1471                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1472                                 return
1473                         mediaURL = mobj.group(1).replace('\\/', '/')
1474                         video_extension = mediaURL[-3:]
1475                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1476
1477                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1478                 if mobj is None:
1479                         self._downloader.trouble(u'ERROR: unable to extract title')
1480                         return
1481                 video_title = mobj.group(1).decode('utf-8')
1482                 video_title = sanitize_title(video_title)
1483
1484                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1485                 if mobj is None:
1486                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1487                         return
1488                 video_uploader = mobj.group(1)
1489
1490                 try:
1491                         # Process video information
1492                         self._downloader.process_info({
1493                                 'id':           video_id.decode('utf-8'),
1494                                 'url':          video_url.decode('utf-8'),
1495                                 'uploader':     video_uploader.decode('utf-8'),
1496                                 'upload_date':  u'NA',
1497                                 'title':        video_title,
1498                                 'stitle':       simple_title,
1499                                 'ext':          video_extension.decode('utf-8'),
1500                                 'format':       u'NA',
1501                                 'player_url':   None,
1502                         })
1503                 except UnavailableVideoError:
1504                         self._downloader.trouble(u'\nERROR: unable to download video')
1505
1506
1507 class DailymotionIE(InfoExtractor):
1508         """Information Extractor for Dailymotion"""
1509
1510         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1511         IE_NAME = u'dailymotion'
1512
1513         def __init__(self, downloader=None):
1514                 InfoExtractor.__init__(self, downloader)
1515
1516         def report_download_webpage(self, video_id):
1517                 """Report webpage download."""
1518                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1519
1520         def report_extraction(self, video_id):
1521                 """Report information extraction."""
1522                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1523
1524         def _real_initialize(self):
1525                 return
1526
1527         def _real_extract(self, url):
1528                 # Extract id and simplified title from URL
1529                 mobj = re.match(self._VALID_URL, url)
1530                 if mobj is None:
1531                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1532                         return
1533
1534                 # At this point we have a new video
1535                 self._downloader.increment_downloads()
1536                 video_id = mobj.group(1)
1537
1538                 simple_title = mobj.group(2).decode('utf-8')
1539                 video_extension = 'flv'
1540
1541                 # Retrieve video webpage to extract further information
1542                 request = urllib2.Request(url)
1543                 request.add_header('Cookie', 'family_filter=off')
1544                 try:
1545                         self.report_download_webpage(video_id)
1546                         webpage = urllib2.urlopen(request).read()
1547                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1548                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1549                         return
1550
1551                 # Extract URL, uploader and title from webpage
1552                 self.report_extraction(video_id)
1553                 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1554                 if mobj is None:
1555                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1556                         return
1557                 sequence = urllib.unquote(mobj.group(1))
1558                 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1559                 if mobj is None:
1560                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1561                         return
1562                 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1563
1564                 # if needed add http://www.dailymotion.com/ if relative URL
1565
1566                 video_url = mediaURL
1567
1568                 mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
1569                 if mobj is None:
1570                         self._downloader.trouble(u'ERROR: unable to extract title')
1571                         return
1572                 video_title = mobj.group(1).decode('utf-8')
1573                 video_title = sanitize_title(video_title)
1574
1575                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1576                 if mobj is None:
1577                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1578                         return
1579                 video_uploader = mobj.group(1)
1580
1581                 try:
1582                         # Process video information
1583                         self._downloader.process_info({
1584                                 'id':           video_id.decode('utf-8'),
1585                                 'url':          video_url.decode('utf-8'),
1586                                 'uploader':     video_uploader.decode('utf-8'),
1587                                 'upload_date':  u'NA',
1588                                 'title':        video_title,
1589                                 'stitle':       simple_title,
1590                                 'ext':          video_extension.decode('utf-8'),
1591                                 'format':       u'NA',
1592                                 'player_url':   None,
1593                         })
1594                 except UnavailableVideoError:
1595                         self._downloader.trouble(u'\nERROR: unable to download video')
1596
1597
1598 class GoogleIE(InfoExtractor):
1599         """Information extractor for video.google.com."""
1600
1601         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1602         IE_NAME = u'video.google'
1603
1604         def __init__(self, downloader=None):
1605                 InfoExtractor.__init__(self, downloader)
1606
1607         def report_download_webpage(self, video_id):
1608                 """Report webpage download."""
1609                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1610
1611         def report_extraction(self, video_id):
1612                 """Report information extraction."""
1613                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1614
1615         def _real_initialize(self):
1616                 return
1617
1618         def _real_extract(self, url):
1619                 # Extract id from URL
1620                 mobj = re.match(self._VALID_URL, url)
1621                 if mobj is None:
1622                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1623                         return
1624
1625                 # At this point we have a new video
1626                 self._downloader.increment_downloads()
1627                 video_id = mobj.group(1)
1628
1629                 video_extension = 'mp4'
1630
1631                 # Retrieve video webpage to extract further information
1632                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1633                 try:
1634                         self.report_download_webpage(video_id)
1635                         webpage = urllib2.urlopen(request).read()
1636                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1637                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1638                         return
1639
1640                 # Extract URL, uploader, and title from webpage
1641                 self.report_extraction(video_id)
1642                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1643                 if mobj is None:
1644                         video_extension = 'flv'
1645                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1646                 if mobj is None:
1647                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1648                         return
1649                 mediaURL = urllib.unquote(mobj.group(1))
1650                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1651                 mediaURL = mediaURL.replace('\\x26', '\x26')
1652
1653                 video_url = mediaURL
1654
1655                 mobj = re.search(r'<title>(.*)</title>', webpage)
1656                 if mobj is None:
1657                         self._downloader.trouble(u'ERROR: unable to extract title')
1658                         return
1659                 video_title = mobj.group(1).decode('utf-8')
1660                 video_title = sanitize_title(video_title)
1661                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1662
1663                 # Extract video description
1664                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1665                 if mobj is None:
1666                         self._downloader.trouble(u'ERROR: unable to extract video description')
1667                         return
1668                 video_description = mobj.group(1).decode('utf-8')
1669                 if not video_description:
1670                         video_description = 'No description available.'
1671
1672                 # Extract video thumbnail
1673                 if self._downloader.params.get('forcethumbnail', False):
1674                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1675                         try:
1676                                 webpage = urllib2.urlopen(request).read()
1677                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1678                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1679                                 return
1680                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1681                         if mobj is None:
1682                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1683                                 return
1684                         video_thumbnail = mobj.group(1)
1685                 else:   # we need something to pass to process_info
1686                         video_thumbnail = ''
1687
1688                 try:
1689                         # Process video information
1690                         self._downloader.process_info({
1691                                 'id':           video_id.decode('utf-8'),
1692                                 'url':          video_url.decode('utf-8'),
1693                                 'uploader':     u'NA',
1694                                 'upload_date':  u'NA',
1695                                 'title':        video_title,
1696                                 'stitle':       simple_title,
1697                                 'ext':          video_extension.decode('utf-8'),
1698                                 'format':       u'NA',
1699                                 'player_url':   None,
1700                         })
1701                 except UnavailableVideoError:
1702                         self._downloader.trouble(u'\nERROR: unable to download video')
1703
1704
1705 class PhotobucketIE(InfoExtractor):
1706         """Information extractor for photobucket.com."""
1707
1708         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1709         IE_NAME = u'photobucket'
1710
1711         def __init__(self, downloader=None):
1712                 InfoExtractor.__init__(self, downloader)
1713
1714         def report_download_webpage(self, video_id):
1715                 """Report webpage download."""
1716                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1717
1718         def report_extraction(self, video_id):
1719                 """Report information extraction."""
1720                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1721
1722         def _real_initialize(self):
1723                 return
1724
1725         def _real_extract(self, url):
1726                 # Extract id from URL
1727                 mobj = re.match(self._VALID_URL, url)
1728                 if mobj is None:
1729                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1730                         return
1731
1732                 # At this point we have a new video
1733                 self._downloader.increment_downloads()
1734                 video_id = mobj.group(1)
1735
1736                 video_extension = 'flv'
1737
1738                 # Retrieve video webpage to extract further information
1739                 request = urllib2.Request(url)
1740                 try:
1741                         self.report_download_webpage(video_id)
1742                         webpage = urllib2.urlopen(request).read()
1743                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1744                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1745                         return
1746
1747                 # Extract URL, uploader, and title from webpage
1748                 self.report_extraction(video_id)
1749                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1750                 if mobj is None:
1751                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1752                         return
1753                 mediaURL = urllib.unquote(mobj.group(1))
1754
1755                 video_url = mediaURL
1756
1757                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1758                 if mobj is None:
1759                         self._downloader.trouble(u'ERROR: unable to extract title')
1760                         return
1761                 video_title = mobj.group(1).decode('utf-8')
1762                 video_title = sanitize_title(video_title)
1763                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1764
1765                 video_uploader = mobj.group(2).decode('utf-8')
1766
1767                 try:
1768                         # Process video information
1769                         self._downloader.process_info({
1770                                 'id':           video_id.decode('utf-8'),
1771                                 'url':          video_url.decode('utf-8'),
1772                                 'uploader':     video_uploader,
1773                                 'upload_date':  u'NA',
1774                                 'title':        video_title,
1775                                 'stitle':       simple_title,
1776                                 'ext':          video_extension.decode('utf-8'),
1777                                 'format':       u'NA',
1778                                 'player_url':   None,
1779                         })
1780                 except UnavailableVideoError:
1781                         self._downloader.trouble(u'\nERROR: unable to download video')
1782
1783
1784 class YahooIE(InfoExtractor):
1785         """Information extractor for video.yahoo.com."""
1786
1787         # _VALID_URL matches all Yahoo! Video URLs
1788         # _VPAGE_URL matches only the extractable '/watch/' URLs
1789         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1790         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1791         IE_NAME = u'video.yahoo'
1792
        def __init__(self, downloader=None):
                """Create the extractor, optionally attaching a downloader."""
                InfoExtractor.__init__(self, downloader)
1795
        def report_download_webpage(self, video_id):
                """Report webpage download."""
                # Route the status line through the shared downloader output.
                self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1799
        def report_extraction(self, video_id):
                """Report information extraction."""
                # Route the status line through the shared downloader output.
                self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1803
        def _real_initialize(self):
                """No pre-extraction setup is performed for Yahoo! Video."""
                return
1806
1807         def _real_extract(self, url, new_video=True):
1808                 # Extract ID from URL
1809                 mobj = re.match(self._VALID_URL, url)
1810                 if mobj is None:
1811                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1812                         return
1813
1814                 # At this point we have a new video
1815                 self._downloader.increment_downloads()
1816                 video_id = mobj.group(2)
1817                 video_extension = 'flv'
1818
1819                 # Rewrite valid but non-extractable URLs as
1820                 # extractable English language /watch/ URLs
1821                 if re.match(self._VPAGE_URL, url) is None:
1822                         request = urllib2.Request(url)
1823                         try:
1824                                 webpage = urllib2.urlopen(request).read()
1825                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1826                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1827                                 return
1828
1829                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1830                         if mobj is None:
1831                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1832                                 return
1833                         yahoo_id = mobj.group(1)
1834
1835                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1836                         if mobj is None:
1837                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1838                                 return
1839                         yahoo_vid = mobj.group(1)
1840
1841                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1842                         return self._real_extract(url, new_video=False)
1843
1844                 # Retrieve video webpage to extract further information
1845                 request = urllib2.Request(url)
1846                 try:
1847                         self.report_download_webpage(video_id)
1848                         webpage = urllib2.urlopen(request).read()
1849                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1850                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1851                         return
1852
1853                 # Extract uploader and title from webpage
1854                 self.report_extraction(video_id)
1855                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1856                 if mobj is None:
1857                         self._downloader.trouble(u'ERROR: unable to extract video title')
1858                         return
1859                 video_title = mobj.group(1).decode('utf-8')
1860                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1861
1862                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1863                 if mobj is None:
1864                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1865                         return
1866                 video_uploader = mobj.group(1).decode('utf-8')
1867
1868                 # Extract video thumbnail
1869                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1870                 if mobj is None:
1871                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1872                         return
1873                 video_thumbnail = mobj.group(1).decode('utf-8')
1874
1875                 # Extract video description
1876                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1877                 if mobj is None:
1878                         self._downloader.trouble(u'ERROR: unable to extract video description')
1879                         return
1880                 video_description = mobj.group(1).decode('utf-8')
1881                 if not video_description:
1882                         video_description = 'No description available.'
1883
1884                 # Extract video height and width
1885                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1886                 if mobj is None:
1887                         self._downloader.trouble(u'ERROR: unable to extract video height')
1888                         return
1889                 yv_video_height = mobj.group(1)
1890
1891                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1892                 if mobj is None:
1893                         self._downloader.trouble(u'ERROR: unable to extract video width')
1894                         return
1895                 yv_video_width = mobj.group(1)
1896
1897                 # Retrieve video playlist to extract media URL
1898                 # I'm not completely sure what all these options are, but we
1899                 # seem to need most of them, otherwise the server sends a 401.
1900                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1901                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1902                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1903                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1904                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1905                 try:
1906                         self.report_download_webpage(video_id)
1907                         webpage = urllib2.urlopen(request).read()
1908                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1909                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1910                         return
1911
1912                 # Extract media URL from playlist XML
1913                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1914                 if mobj is None:
1915                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1916                         return
1917                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1918                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1919
1920                 try:
1921                         # Process video information
1922                         self._downloader.process_info({
1923                                 'id':           video_id.decode('utf-8'),
1924                                 'url':          video_url,
1925                                 'uploader':     video_uploader,
1926                                 'upload_date':  u'NA',
1927                                 'title':        video_title,
1928                                 'stitle':       simple_title,
1929                                 'ext':          video_extension.decode('utf-8'),
1930                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1931                                 'description':  video_description,
1932                                 'thumbnail':    video_thumbnail,
1933                                 'player_url':   None,
1934                         })
1935                 except UnavailableVideoError:
1936                         self._downloader.trouble(u'\nERROR: unable to download video')
1937
1938
1939 class VimeoIE(InfoExtractor):
1940         """Information extractor for vimeo.com."""
1941
1942         # _VALID_URL matches Vimeo URLs
1943         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1944         IE_NAME = u'vimeo'
1945
1946         def __init__(self, downloader=None):
1947                 InfoExtractor.__init__(self, downloader)
1948
1949         def report_download_webpage(self, video_id):
1950                 """Report webpage download."""
1951                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1952
1953         def report_extraction(self, video_id):
1954                 """Report information extraction."""
1955                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1956
1957         def _real_initialize(self):
1958                 return
1959
1960         def _real_extract(self, url, new_video=True):
1961                 # Extract ID from URL
1962                 mobj = re.match(self._VALID_URL, url)
1963                 if mobj is None:
1964                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1965                         return
1966
1967                 # At this point we have a new video
1968                 self._downloader.increment_downloads()
1969                 video_id = mobj.group(1)
1970
1971                 # Retrieve video webpage to extract further information
1972                 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1973                 try:
1974                         self.report_download_webpage(video_id)
1975                         webpage = urllib2.urlopen(request).read()
1976                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1977                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1978                         return
1979
1980                 # Now we begin extracting as much information as we can from what we
1981                 # retrieved. First we extract the information common to all extractors,
1982                 # and latter we extract those that are Vimeo specific.
1983                 self.report_extraction(video_id)
1984
1985                 # Extract title
1986                 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
1987                 if mobj is None:
1988                         self._downloader.trouble(u'ERROR: unable to extract video title')
1989                         return
1990                 video_title = mobj.group(1).decode('utf-8')
1991                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1992
1993                 # Extract uploader
1994                 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
1995                 if mobj is None:
1996                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1997                         return
1998                 video_uploader = mobj.group(1).decode('utf-8')
1999
2000                 # Extract video thumbnail
2001                 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2002                 if mobj is None:
2003                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2004                         return
2005                 video_thumbnail = mobj.group(1).decode('utf-8')
2006
2007                 # # Extract video description
2008                 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2009                 # if mobj is None:
2010                 #       self._downloader.trouble(u'ERROR: unable to extract video description')
2011                 #       return
2012                 # video_description = mobj.group(1).decode('utf-8')
2013                 # if not video_description: video_description = 'No description available.'
2014                 video_description = 'Foo.'
2015
2016                 # Vimeo specific: extract request signature
2017                 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2018                 if mobj is None:
2019                         self._downloader.trouble(u'ERROR: unable to extract request signature')
2020                         return
2021                 sig = mobj.group(1).decode('utf-8')
2022
2023                 # Vimeo specific: Extract request signature expiration
2024                 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2025                 if mobj is None:
2026                         self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2027                         return
2028                 sig_exp = mobj.group(1).decode('utf-8')
2029
2030                 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2031
2032                 try:
2033                         # Process video information
2034                         self._downloader.process_info({
2035                                 'id':           video_id.decode('utf-8'),
2036                                 'url':          video_url,
2037                                 'uploader':     video_uploader,
2038                                 'upload_date':  u'NA',
2039                                 'title':        video_title,
2040                                 'stitle':       simple_title,
2041                                 'ext':          u'mp4',
2042                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2043                                 'description':  video_description,
2044                                 'thumbnail':    video_thumbnail,
2045                                 'description':  video_description,
2046                                 'player_url':   None,
2047                         })
2048                 except UnavailableVideoError:
2049                         self._downloader.trouble(u'ERROR: unable to download video')
2050
2051
2052 class GenericIE(InfoExtractor):
2053         """Generic last-resort information extractor."""
2054
2055         _VALID_URL = r'.*'
2056         IE_NAME = u'generic'
2057
2058         def __init__(self, downloader=None):
2059                 InfoExtractor.__init__(self, downloader)
2060
2061         def report_download_webpage(self, video_id):
2062                 """Report webpage download."""
2063                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2064                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2065
2066         def report_extraction(self, video_id):
2067                 """Report information extraction."""
2068                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2069
2070         def _real_initialize(self):
2071                 return
2072
2073         def _real_extract(self, url):
2074                 # At this point we have a new video
2075                 self._downloader.increment_downloads()
2076
2077                 video_id = url.split('/')[-1]
2078                 request = urllib2.Request(url)
2079                 try:
2080                         self.report_download_webpage(video_id)
2081                         webpage = urllib2.urlopen(request).read()
2082                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2083                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2084                         return
2085                 except ValueError, err:
2086                         # since this is the last-resort InfoExtractor, if
2087                         # this error is thrown, it'll be thrown here
2088                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2089                         return
2090
2091                 self.report_extraction(video_id)
2092                 # Start with something easy: JW Player in SWFObject
2093                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2094                 if mobj is None:
2095                         # Broaden the search a little bit
2096                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2097                 if mobj is None:
2098                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2099                         return
2100
2101                 # It's possible that one of the regexes
2102                 # matched, but returned an empty group:
2103                 if mobj.group(1) is None:
2104                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2105                         return
2106
2107                 video_url = urllib.unquote(mobj.group(1))
2108                 video_id = os.path.basename(video_url)
2109
2110                 # here's a fun little line of code for you:
2111                 video_extension = os.path.splitext(video_id)[1][1:]
2112                 video_id = os.path.splitext(video_id)[0]
2113
2114                 # it's tempting to parse this further, but you would
2115                 # have to take into account all the variations like
2116                 #   Video Title - Site Name
2117                 #   Site Name | Video Title
2118                 #   Video Title - Tagline | Site Name
2119                 # and so on and so forth; it's just not practical
2120                 mobj = re.search(r'<title>(.*)</title>', webpage)
2121                 if mobj is None:
2122                         self._downloader.trouble(u'ERROR: unable to extract title')
2123                         return
2124                 video_title = mobj.group(1).decode('utf-8')
2125                 video_title = sanitize_title(video_title)
2126                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2127
2128                 # video uploader is domain name
2129                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2130                 if mobj is None:
2131                         self._downloader.trouble(u'ERROR: unable to extract title')
2132                         return
2133                 video_uploader = mobj.group(1).decode('utf-8')
2134
2135                 try:
2136                         # Process video information
2137                         self._downloader.process_info({
2138                                 'id':           video_id.decode('utf-8'),
2139                                 'url':          video_url.decode('utf-8'),
2140                                 'uploader':     video_uploader,
2141                                 'upload_date':  u'NA',
2142                                 'title':        video_title,
2143                                 'stitle':       simple_title,
2144                                 'ext':          video_extension.decode('utf-8'),
2145                                 'format':       u'NA',
2146                                 'player_url':   None,
2147                         })
2148                 except UnavailableVideoError, err:
2149                         self._downloader.trouble(u'\nERROR: unable to download video')
2150
2151
2152 class YoutubeSearchIE(InfoExtractor):
2153         """Information Extractor for YouTube search queries."""
2154         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2155         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2156         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2157         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2158         _youtube_ie = None
2159         _max_youtube_results = 1000
2160         IE_NAME = u'youtube:search'
2161
2162         def __init__(self, youtube_ie, downloader=None):
2163                 InfoExtractor.__init__(self, downloader)
2164                 self._youtube_ie = youtube_ie
2165
2166         def report_download_page(self, query, pagenum):
2167                 """Report attempt to download playlist page with given number."""
2168                 query = query.decode(preferredencoding())
2169                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2170
2171         def _real_initialize(self):
2172                 self._youtube_ie.initialize()
2173
2174         def _real_extract(self, query):
2175                 mobj = re.match(self._VALID_URL, query)
2176                 if mobj is None:
2177                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2178                         return
2179
2180                 prefix, query = query.split(':')
2181                 prefix = prefix[8:]
2182                 query = query.encode('utf-8')
2183                 if prefix == '':
2184                         self._download_n_results(query, 1)
2185                         return
2186                 elif prefix == 'all':
2187                         self._download_n_results(query, self._max_youtube_results)
2188                         return
2189                 else:
2190                         try:
2191                                 n = long(prefix)
2192                                 if n <= 0:
2193                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2194                                         return
2195                                 elif n > self._max_youtube_results:
2196                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2197                                         n = self._max_youtube_results
2198                                 self._download_n_results(query, n)
2199                                 return
2200                         except ValueError: # parsing prefix as integer fails
2201                                 self._download_n_results(query, 1)
2202                                 return
2203
2204         def _download_n_results(self, query, n):
2205                 """Downloads a specified number of results for a query"""
2206
2207                 video_ids = []
2208                 already_seen = set()
2209                 pagenum = 1
2210
2211                 while True:
2212                         self.report_download_page(query, pagenum)
2213                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2214                         request = urllib2.Request(result_url)
2215                         try:
2216                                 page = urllib2.urlopen(request).read()
2217                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2218                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2219                                 return
2220
2221                         # Extract video identifiers
2222                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2223                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2224                                 if video_id not in already_seen:
2225                                         video_ids.append(video_id)
2226                                         already_seen.add(video_id)
2227                                         if len(video_ids) == n:
2228                                                 # Specified n videos reached
2229                                                 for id in video_ids:
2230                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2231                                                 return
2232
2233                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2234                                 for id in video_ids:
2235                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2236                                 return
2237
2238                         pagenum = pagenum + 1
2239
2240
2241 class GoogleSearchIE(InfoExtractor):
2242         """Information Extractor for Google Video search queries."""
2243         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2244         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2245         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2246         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2247         _google_ie = None
2248         _max_google_results = 1000
2249         IE_NAME = u'video.google:search'
2250
2251         def __init__(self, google_ie, downloader=None):
2252                 InfoExtractor.__init__(self, downloader)
2253                 self._google_ie = google_ie
2254
2255         def report_download_page(self, query, pagenum):
2256                 """Report attempt to download playlist page with given number."""
2257                 query = query.decode(preferredencoding())
2258                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2259
2260         def _real_initialize(self):
2261                 self._google_ie.initialize()
2262
2263         def _real_extract(self, query):
2264                 mobj = re.match(self._VALID_URL, query)
2265                 if mobj is None:
2266                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2267                         return
2268
2269                 prefix, query = query.split(':')
2270                 prefix = prefix[8:]
2271                 query = query.encode('utf-8')
2272                 if prefix == '':
2273                         self._download_n_results(query, 1)
2274                         return
2275                 elif prefix == 'all':
2276                         self._download_n_results(query, self._max_google_results)
2277                         return
2278                 else:
2279                         try:
2280                                 n = long(prefix)
2281                                 if n <= 0:
2282                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2283                                         return
2284                                 elif n > self._max_google_results:
2285                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2286                                         n = self._max_google_results
2287                                 self._download_n_results(query, n)
2288                                 return
2289                         except ValueError: # parsing prefix as integer fails
2290                                 self._download_n_results(query, 1)
2291                                 return
2292
2293         def _download_n_results(self, query, n):
2294                 """Downloads a specified number of results for a query"""
2295
2296                 video_ids = []
2297                 already_seen = set()
2298                 pagenum = 1
2299
2300                 while True:
2301                         self.report_download_page(query, pagenum)
2302                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2303                         request = urllib2.Request(result_url)
2304                         try:
2305                                 page = urllib2.urlopen(request).read()
2306                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2307                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2308                                 return
2309
2310                         # Extract video identifiers
2311                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2312                                 video_id = mobj.group(1)
2313                                 if video_id not in already_seen:
2314                                         video_ids.append(video_id)
2315                                         already_seen.add(video_id)
2316                                         if len(video_ids) == n:
2317                                                 # Specified n videos reached
2318                                                 for id in video_ids:
2319                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2320                                                 return
2321
2322                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2323                                 for id in video_ids:
2324                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2325                                 return
2326
2327                         pagenum = pagenum + 1
2328
2329
2330 class YahooSearchIE(InfoExtractor):
2331         """Information Extractor for Yahoo! Video search queries."""
2332         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2333         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2334         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2335         _MORE_PAGES_INDICATOR = r'\s*Next'
2336         _yahoo_ie = None
2337         _max_yahoo_results = 1000
2338         IE_NAME = u'video.yahoo:search'
2339
2340         def __init__(self, yahoo_ie, downloader=None):
2341                 InfoExtractor.__init__(self, downloader)
2342                 self._yahoo_ie = yahoo_ie
2343
2344         def report_download_page(self, query, pagenum):
2345                 """Report attempt to download playlist page with given number."""
2346                 query = query.decode(preferredencoding())
2347                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2348
2349         def _real_initialize(self):
2350                 self._yahoo_ie.initialize()
2351
2352         def _real_extract(self, query):
2353                 mobj = re.match(self._VALID_URL, query)
2354                 if mobj is None:
2355                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2356                         return
2357
2358                 prefix, query = query.split(':')
2359                 prefix = prefix[8:]
2360                 query = query.encode('utf-8')
2361                 if prefix == '':
2362                         self._download_n_results(query, 1)
2363                         return
2364                 elif prefix == 'all':
2365                         self._download_n_results(query, self._max_yahoo_results)
2366                         return
2367                 else:
2368                         try:
2369                                 n = long(prefix)
2370                                 if n <= 0:
2371                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2372                                         return
2373                                 elif n > self._max_yahoo_results:
2374                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2375                                         n = self._max_yahoo_results
2376                                 self._download_n_results(query, n)
2377                                 return
2378                         except ValueError: # parsing prefix as integer fails
2379                                 self._download_n_results(query, 1)
2380                                 return
2381
2382         def _download_n_results(self, query, n):
2383                 """Downloads a specified number of results for a query"""
2384
2385                 video_ids = []
2386                 already_seen = set()
2387                 pagenum = 1
2388
2389                 while True:
2390                         self.report_download_page(query, pagenum)
2391                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2392                         request = urllib2.Request(result_url)
2393                         try:
2394                                 page = urllib2.urlopen(request).read()
2395                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2396                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2397                                 return
2398
2399                         # Extract video identifiers
2400                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2401                                 video_id = mobj.group(1)
2402                                 if video_id not in already_seen:
2403                                         video_ids.append(video_id)
2404                                         already_seen.add(video_id)
2405                                         if len(video_ids) == n:
2406                                                 # Specified n videos reached
2407                                                 for id in video_ids:
2408                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2409                                                 return
2410
2411                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2412                                 for id in video_ids:
2413                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2414                                 return
2415
2416                         pagenum = pagenum + 1
2417
2418
2419 class YoutubePlaylistIE(InfoExtractor):
2420         """Information Extractor for YouTube playlists."""
2421
2422         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2423         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2424         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2425         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2426         _youtube_ie = None
2427         IE_NAME = u'youtube:playlist'
2428
2429         def __init__(self, youtube_ie, downloader=None):
2430                 InfoExtractor.__init__(self, downloader)
2431                 self._youtube_ie = youtube_ie
2432
2433         def report_download_page(self, playlist_id, pagenum):
2434                 """Report attempt to download playlist page with given number."""
2435                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2436
2437         def _real_initialize(self):
2438                 self._youtube_ie.initialize()
2439
2440         def _real_extract(self, url):
2441                 # Extract playlist id
2442                 mobj = re.match(self._VALID_URL, url)
2443                 if mobj is None:
2444                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2445                         return
2446
2447                 # Single video case
2448                 if mobj.group(3) is not None:
2449                         self._youtube_ie.extract(mobj.group(3))
2450                         return
2451
2452                 # Download playlist pages
2453                 # prefix is 'p' as default for playlists but there are other types that need extra care
2454                 playlist_prefix = mobj.group(1)
2455                 if playlist_prefix == 'a':
2456                         playlist_access = 'artist'
2457                 else:
2458                         playlist_prefix = 'p'
2459                         playlist_access = 'view_play_list'
2460                 playlist_id = mobj.group(2)
2461                 video_ids = []
2462                 pagenum = 1
2463
2464                 while True:
2465                         self.report_download_page(playlist_id, pagenum)
2466                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2467                         try:
2468                                 page = urllib2.urlopen(request).read()
2469                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2470                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2471                                 return
2472
2473                         # Extract video identifiers
2474                         ids_in_page = []
2475                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2476                                 if mobj.group(1) not in ids_in_page:
2477                                         ids_in_page.append(mobj.group(1))
2478                         video_ids.extend(ids_in_page)
2479
2480                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2481                                 break
2482                         pagenum = pagenum + 1
2483
2484                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2485                 playlistend = self._downloader.params.get('playlistend', -1)
2486                 video_ids = video_ids[playliststart:playlistend]
2487
2488                 for id in video_ids:
2489                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2490                 return
2491
2492
2493 class YoutubeUserIE(InfoExtractor):
2494         """Information Extractor for YouTube users."""
2495
2496         _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2497         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2498         _GDATA_PAGE_SIZE = 50
2499         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2500         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2501         _youtube_ie = None
2502         IE_NAME = u'youtube:user'
2503
2504         def __init__(self, youtube_ie, downloader=None):
2505                 InfoExtractor.__init__(self, downloader)
2506                 self._youtube_ie = youtube_ie
2507
2508         def report_download_page(self, username, start_index):
2509                 """Report attempt to download user page."""
2510                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2511                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2512
2513         def _real_initialize(self):
2514                 self._youtube_ie.initialize()
2515
2516         def _real_extract(self, url):
2517                 # Extract username
2518                 mobj = re.match(self._VALID_URL, url)
2519                 if mobj is None:
2520                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2521                         return
2522
2523                 username = mobj.group(1)
2524
2525                 # Download video ids using YouTube Data API. Result size per
2526                 # query is limited (currently to 50 videos) so we need to query
2527                 # page by page until there are no video ids - it means we got
2528                 # all of them.
2529
2530                 video_ids = []
2531                 pagenum = 0
2532
2533                 while True:
2534                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2535                         self.report_download_page(username, start_index)
2536
2537                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2538
2539                         try:
2540                                 page = urllib2.urlopen(request).read()
2541                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2542                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2543                                 return
2544
2545                         # Extract video identifiers
2546                         ids_in_page = []
2547
2548                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2549                                 if mobj.group(1) not in ids_in_page:
2550                                         ids_in_page.append(mobj.group(1))
2551
2552                         video_ids.extend(ids_in_page)
2553
2554                         # A little optimization - if current page is not
2555                         # "full", ie. does not contain PAGE_SIZE video ids then
2556                         # we can assume that this page is the last one - there
2557                         # are no more ids on further pages - no need to query
2558                         # again.
2559
2560                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2561                                 break
2562
2563                         pagenum += 1
2564
2565                 all_ids_count = len(video_ids)
2566                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2567                 playlistend = self._downloader.params.get('playlistend', -1)
2568
2569                 if playlistend == -1:
2570                         video_ids = video_ids[playliststart:]
2571                 else:
2572                         video_ids = video_ids[playliststart:playlistend]
2573
2574                 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2575                                 (username, all_ids_count, len(video_ids)))
2576
2577                 for video_id in video_ids:
2578                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2579
2580
2581 class DepositFilesIE(InfoExtractor):
2582         """Information extractor for depositfiles.com"""
2583
2584         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2585         IE_NAME = u'DepositFiles'
2586
2587         def __init__(self, downloader=None):
2588                 InfoExtractor.__init__(self, downloader)
2589
2590         def report_download_webpage(self, file_id):
2591                 """Report webpage download."""
2592                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2593
2594         def report_extraction(self, file_id):
2595                 """Report information extraction."""
2596                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2597
2598         def _real_initialize(self):
2599                 return
2600
2601         def _real_extract(self, url):
2602                 # At this point we have a new file
2603                 self._downloader.increment_downloads()
2604
2605                 file_id = url.split('/')[-1]
2606                 # Rebuild url in english locale
2607                 url = 'http://depositfiles.com/en/files/' + file_id
2608
2609                 # Retrieve file webpage with 'Free download' button pressed
2610                 free_download_indication = { 'gateway_result' : '1' }
2611                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2612                 try:
2613                         self.report_download_webpage(file_id)
2614                         webpage = urllib2.urlopen(request).read()
2615                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2616                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2617                         return
2618
2619                 # Search for the real file URL
2620                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2621                 if (mobj is None) or (mobj.group(1) is None):
2622                         # Try to figure out reason of the error.
2623                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2624                         if (mobj is not None) and (mobj.group(1) is not None):
2625                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2626                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2627                         else:
2628                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2629                         return
2630
2631                 file_url = mobj.group(1)
2632                 file_extension = os.path.splitext(file_url)[1][1:]
2633
2634                 # Search for file title
2635                 mobj = re.search(r'<b title="(.*?)">', webpage)
2636                 if mobj is None:
2637                         self._downloader.trouble(u'ERROR: unable to extract title')
2638                         return
2639                 file_title = mobj.group(1).decode('utf-8')
2640
2641                 try:
2642                         # Process file information
2643                         self._downloader.process_info({
2644                                 'id':           file_id.decode('utf-8'),
2645                                 'url':          file_url.decode('utf-8'),
2646                                 'uploader':     u'NA',
2647                                 'upload_date':  u'NA',
2648                                 'title':        file_title,
2649                                 'stitle':       file_title,
2650                                 'ext':          file_extension.decode('utf-8'),
2651                                 'format':       u'NA',
2652                                 'player_url':   None,
2653                         })
2654                 except UnavailableVideoError, err:
2655                         self._downloader.trouble(u'ERROR: unable to download file')
2656
2657
2658 class FacebookIE(InfoExtractor):
2659         """Information Extractor for Facebook"""
2660
2661         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2662         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2663         _NETRC_MACHINE = 'facebook'
2664         _available_formats = ['highqual', 'lowqual']
2665         _video_extensions = {
2666                 'highqual': 'mp4',
2667                 'lowqual': 'mp4',
2668         }
2669         IE_NAME = u'facebook'
2670
2671         def __init__(self, downloader=None):
2672                 InfoExtractor.__init__(self, downloader)
2673
2674         def _reporter(self, message):
2675                 """Add header and report message."""
2676                 self._downloader.to_screen(u'[facebook] %s' % message)
2677
2678         def report_login(self):
2679                 """Report attempt to log in."""
2680                 self._reporter(u'Logging in')
2681
2682         def report_video_webpage_download(self, video_id):
2683                 """Report attempt to download video webpage."""
2684                 self._reporter(u'%s: Downloading video webpage' % video_id)
2685
2686         def report_information_extraction(self, video_id):
2687                 """Report attempt to extract video information."""
2688                 self._reporter(u'%s: Extracting video information' % video_id)
2689
2690         def _parse_page(self, video_webpage):
2691                 """Extract video information from page"""
2692                 # General data
2693                 data = {'title': r'class="video_title datawrap">(.*?)</',
2694                         'description': r'<div class="datawrap">(.*?)</div>',
2695                         'owner': r'\("video_owner_name", "(.*?)"\)',
2696                         'upload_date': r'data-date="(.*?)"',
2697                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2698                         }
2699                 video_info = {}
2700                 for piece in data.keys():
2701                         mobj = re.search(data[piece], video_webpage)
2702                         if mobj is not None:
2703                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2704
2705                 # Video urls
2706                 video_urls = {}
2707                 for fmt in self._available_formats:
2708                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2709                         if mobj is not None:
2710                                 # URL is in a Javascript segment inside an escaped Unicode format within
2711                                 # the generally utf-8 page
2712                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2713                 video_info['video_urls'] = video_urls
2714
2715                 return video_info
2716
2717         def _real_initialize(self):
2718                 if self._downloader is None:
2719                         return
2720
2721                 useremail = None
2722                 password = None
2723                 downloader_params = self._downloader.params
2724
2725                 # Attempt to use provided username and password or .netrc data
2726                 if downloader_params.get('username', None) is not None:
2727                         useremail = downloader_params['username']
2728                         password = downloader_params['password']
2729                 elif downloader_params.get('usenetrc', False):
2730                         try:
2731                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2732                                 if info is not None:
2733                                         useremail = info[0]
2734                                         password = info[2]
2735                                 else:
2736                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2737                         except (IOError, netrc.NetrcParseError), err:
2738                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2739                                 return
2740
2741                 if useremail is None:
2742                         return
2743
2744                 # Log in
2745                 login_form = {
2746                         'email': useremail,
2747                         'pass': password,
2748                         'login': 'Log+In'
2749                         }
2750                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2751                 try:
2752                         self.report_login()
2753                         login_results = urllib2.urlopen(request).read()
2754                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2755                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2756                                 return
2757                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2758                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2759                         return
2760
2761         def _real_extract(self, url):
2762                 mobj = re.match(self._VALID_URL, url)
2763                 if mobj is None:
2764                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2765                         return
2766                 video_id = mobj.group('ID')
2767
2768                 # Get video webpage
2769                 self.report_video_webpage_download(video_id)
2770                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2771                 try:
2772                         page = urllib2.urlopen(request)
2773                         video_webpage = page.read()
2774                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2775                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2776                         return
2777
2778                 # Start extracting information
2779                 self.report_information_extraction(video_id)
2780
2781                 # Extract information
2782                 video_info = self._parse_page(video_webpage)
2783
2784                 # uploader
2785                 if 'owner' not in video_info:
2786                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2787                         return
2788                 video_uploader = video_info['owner']
2789
2790                 # title
2791                 if 'title' not in video_info:
2792                         self._downloader.trouble(u'ERROR: unable to extract video title')
2793                         return
2794                 video_title = video_info['title']
2795                 video_title = video_title.decode('utf-8')
2796                 video_title = sanitize_title(video_title)
2797
2798                 # simplified title
2799                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2800                 simple_title = simple_title.strip(ur'_')
2801
2802                 # thumbnail image
2803                 if 'thumbnail' not in video_info:
2804                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2805                         video_thumbnail = ''
2806                 else:
2807                         video_thumbnail = video_info['thumbnail']
2808
2809                 # upload date
2810                 upload_date = u'NA'
2811                 if 'upload_date' in video_info:
2812                         upload_time = video_info['upload_date']
2813                         timetuple = email.utils.parsedate_tz(upload_time)
2814                         if timetuple is not None:
2815                                 try:
2816                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2817                                 except:
2818                                         pass
2819
2820                 # description
2821                 video_description = video_info.get('description', 'No description available.')
2822
2823                 url_map = video_info['video_urls']
2824                 if len(url_map.keys()) > 0:
2825                         # Decide which formats to download
2826                         req_format = self._downloader.params.get('format', None)
2827                         format_limit = self._downloader.params.get('format_limit', None)
2828
2829                         if format_limit is not None and format_limit in self._available_formats:
2830                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2831                         else:
2832                                 format_list = self._available_formats
2833                         existing_formats = [x for x in format_list if x in url_map]
2834                         if len(existing_formats) == 0:
2835                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2836                                 return
2837                         if req_format is None:
2838                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2839                         elif req_format == 'worst':
2840                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2841                         elif req_format == '-1':
2842                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2843                         else:
2844                                 # Specific format
2845                                 if req_format not in url_map:
2846                                         self._downloader.trouble(u'ERROR: requested format not available')
2847                                         return
2848                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2849
2850                 for format_param, video_real_url in video_url_list:
2851
2852                         # At this point we have a new video
2853                         self._downloader.increment_downloads()
2854
2855                         # Extension
2856                         video_extension = self._video_extensions.get(format_param, 'mp4')
2857
2858                         try:
2859                                 # Process video information
2860                                 self._downloader.process_info({
2861                                         'id':           video_id.decode('utf-8'),
2862                                         'url':          video_real_url.decode('utf-8'),
2863                                         'uploader':     video_uploader.decode('utf-8'),
2864                                         'upload_date':  upload_date,
2865                                         'title':        video_title,
2866                                         'stitle':       simple_title,
2867                                         'ext':          video_extension.decode('utf-8'),
2868                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2869                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2870                                         'description':  video_description.decode('utf-8'),
2871                                         'player_url':   None,
2872                                 })
2873                         except UnavailableVideoError, err:
2874                                 self._downloader.trouble(u'\nERROR: unable to download video')
2875
class BlipTVIE(InfoExtractor):
	"""Information extractor for blip.tv"""

	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
	_URL_EXT = r'^.*\.([a-z0-9]+)$'
	IE_NAME = u'blip.tv'

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)

	def _simplify_title(self, title):
		"""Collapse characters outside simple_title_chars into underscores
		and strip leading/trailing underscores."""
		res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
		res = res.strip(ur'_')
		return res

	def _real_extract(self, url):
		"""Fetch the page's JSON description (skin=json) and build the
		info dict for the downloader."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Append the JSON-skin parameters with the correct separator
		if '?' in url:
			cchar = '&'
		else:
			cchar = '?'
		json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
		request = urllib2.Request(json_url)
		self.report_extraction(mobj.group(1))
		try:
			json_code = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
			return
		try:
			# NOTE(review): relies on a module-level `json` import that is
			# not visible in this part of the file — confirm it exists
			json_data = json.loads(json_code)
			# The response may or may not be wrapped in a 'Post' envelope
			if 'Post' in json_data:
				data = json_data['Post']
			else:
				data = json_data

			# Convert blip.tv's '%m-%d-%y %H:%M%p' datestamp to YYYYMMDD;
			# a mismatch raises ValueError, caught below
			upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
			video_url = data['media']['url']
			umobj = re.match(self._URL_EXT, video_url)
			if umobj is None:
				raise ValueError('Can not determine filename extension')
			ext = umobj.group(1)

			self._downloader.increment_downloads()

			info = {
				'id': data['item_id'],
				'url': video_url,
				'uploader': data['display_name'],
				'upload_date': upload_date,
				'title': data['title'],
				'stitle': self._simplify_title(data['title']),
				'ext': ext,
				'format': data['media']['mimeType'],
				'thumbnail': data['thumbnailUrl'],
				'description': data['description'],
				'player_url': data['embedUrl']
			}
		except (ValueError,KeyError), err:
			# Any missing key or bad date in the JSON ends extraction here
			self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
			return

		try:
			self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download video')
2947
2948
2949 class MyVideoIE(InfoExtractor):
2950         """Information Extractor for myvideo.de."""
2951
2952         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2953         IE_NAME = u'myvideo'
2954
	def __init__(self, downloader=None):
		# Delegate to the common InfoExtractor constructor
		InfoExtractor.__init__(self, downloader)
2957         
	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2961
	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2965
	def _real_initialize(self):
		# Nothing to set up for myvideo.de
		return
2968
2969         def _real_extract(self,url):
2970                 mobj = re.match(self._VALID_URL, url)
2971                 if mobj is None:
2972                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
2973                         return
2974
2975                 video_id = mobj.group(1)
2976                 simple_title = mobj.group(2).decode('utf-8')
2977                 # should actually not be necessary
2978                 simple_title = sanitize_title(simple_title)
2979                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title)
2980
2981                 # Get video webpage
2982                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2983                 try:
2984                         self.report_download_webpage(video_id)
2985                         webpage = urllib2.urlopen(request).read()
2986                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2987                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2988                         return
2989
2990                 self.report_extraction(video_id)
2991                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2992                                  webpage)
2993                 if mobj is None:
2994                         self._downloader.trouble(u'ERROR: unable to extract media URL')
2995                         return
2996                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2997
2998                 mobj = re.search('<title>([^<]+)</title>', webpage)
2999                 if mobj is None:
3000                         self._downloader.trouble(u'ERROR: unable to extract title')
3001                         return
3002
3003                 video_title = mobj.group(1)
3004                 video_title = sanitize_title(video_title)
3005
3006                 try:
3007                         print(video_url)
3008                         self._downloader.process_info({
3009                                 'id':           video_id,
3010                                 'url':          video_url,
3011                                 'uploader':     u'NA',
3012                                 'upload_date':  u'NA',
3013                                 'title':        video_title,
3014                                 'stitle':       simple_title,
3015                                 'ext':          u'flv',
3016                                 'format':       u'NA',
3017                                 'player_url':   None,
3018                         })
3019                 except UnavailableVideoError:
3020                         self._downloader.trouble(u'\nERROR: Unable to download video')
3021
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Accepts either a full-episode page URL on thedailyshow.com /
	# colbertnation.com, or a shorthand like ":tds" / ":colbert" meaning
	# "the newest full episode of that show".
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	def report_extraction(self, episode_id):
		"""Report the start of information extraction."""
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

	def report_config_download(self, episode_id):
		"""Report the download of a media-generation config file."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		"""Report the download of the episode's MRSS index."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		"""Report the resolution of the Flash player URL."""
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _simplify_title(self, title):
		# Collapse any run of non-filename-safe characters to '_' and
		# strip leading/trailing underscores.
		res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
		res = res.strip(ur'_')
		return res

	def _real_extract(self, url):
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Shorthand forms (":tds" etc.) are rewritten to the show's
		# full-episodes landing page, which redirects to the newest episode.
		if mobj.group('shortname'):
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = 'http://www.thedailyshow.com/full-episodes/'
			else:
				url = 'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# No explicit episode in the URL means "download the newest one".
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			return
		if dlNewest:
			# Follow the server-side redirect to discover which concrete
			# episode "newest" resolved to.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		# The Flash <param> carries both the player URL and the mtvn URI
		# that identifies the episode's media feed.
		mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
			return

		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			# Resolve redirects to get the canonical player URL.
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
			return

		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
			return

		# One episode is split into several media items (acts); each <item>
		# in the MRSS index is downloaded separately below.
		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
						urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
				return

			# Collect all available (bitrate, url) renditions for this item.
			cdoc = xml.etree.ElementTree.fromstring(configXml)
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			# For now, just pick the highest bitrate
			# NOTE(review): assumes renditions are listed in ascending
			# bitrate order — the bitrate attribute itself is not compared.
			format,video_url = turls[-1]

			self._downloader.increment_downloads()

			effTitle = showId + '-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'stitle': self._simplify_title(effTitle),
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': playerUrl
			}

			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
				continue
3161
3162
3163 class EscapistIE(InfoExtractor):
3164         """Information extractor for The Escapist """
3165
3166         _VALID_URL = r'^(https?://)?(www\.)escapistmagazine.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?].*$'
3167         IE_NAME = u'escapist'
3168
3169         def report_extraction(self, showName):
3170                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3171
3172         def report_config_download(self, showName):
3173                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3174
3175         def _simplify_title(self, title):
3176                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3177                 res = res.strip(ur'_')
3178                 return res
3179
3180         def _real_extract(self, url):
3181                 htmlParser = HTMLParser.HTMLParser()
3182
3183                 mobj = re.match(self._VALID_URL, url)
3184                 if mobj is None:
3185                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3186                         return
3187                 showName = mobj.group('showname')
3188                 videoId = mobj.group('episode')
3189
3190                 self.report_extraction(showName)
3191                 try:
3192                         webPage = urllib2.urlopen(url).read()
3193                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3194                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3195                         return
3196
3197                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3198                 description = htmlParser.unescape(descMatch.group(1))
3199                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3200                 imgUrl = htmlParser.unescape(imgMatch.group(1))
3201                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3202                 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3203                 configUrlMatch = re.search('config=(.*)$', playerUrl)
3204                 configUrl = urllib2.unquote(configUrlMatch.group(1))
3205
3206                 self.report_config_download(showName)
3207                 try:
3208                         configJSON = urllib2.urlopen(configUrl).read()
3209                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3210                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3211                         return
3212
3213                 # Technically, it's JavaScript, not JSON
3214                 configJSON = configJSON.replace("'", '"')
3215
3216                 try:
3217                         config = json.loads(configJSON)
3218                 except (ValueError,), err:
3219                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3220                         return
3221
3222                 playlist = config['playlist']
3223                 videoUrl = playlist[1]['url']
3224
3225                 self._downloader.increment_downloads()
3226                 info = {
3227                         'id': videoId,
3228                         'url': videoUrl,
3229                         'uploader': showName,
3230                         'upload_date': None,
3231                         'title': showName,
3232                         'stitle': self._simplify_title(showName),
3233                         'ext': 'flv',
3234                         'format': 'flv',
3235                         'thumbnail': imgUrl,
3236                         'description': description,
3237                         'player_url': playerUrl,
3238                 }
3239
3240                 try:
3241                         self._downloader.process_info(info)
3242                 except UnavailableVideoError, err:
3243                         self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3244
3245
3246
class PostProcessor(object):
	"""Base class for post-download processing steps.

	A PostProcessor is attached to a downloader through its
	add_post_processor() method.  Once a download completes
	successfully, the downloader pushes the resulting information
	dictionary through its chain of PostProcessors: each one receives
	the value returned by the previous one.  A return value of None
	stops the chain; otherwise processing continues until the chain is
	exhausted.

	As with InfoExtractor objects, registration is mutual: the
	downloader holds its postprocessors and each postprocessor holds a
	reference back to its downloader.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Bind this postprocessor to the given downloader."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file.

		The "information" argument is an InfoExtractor-style
		dictionary with one extra key, "filepath", naming the file
		that was downloaded.

		Return a (possibly modified) dictionary to hand to the next
		postprocessor in the chain, or None to stop the chain.
		Implementations may also raise a PostProcessingError, which
		the calling downloader handles.

		This default implementation passes the data along unchanged.
		"""
		return information
3292
3293
class FFmpegExtractAudioPP(PostProcessor):
	"""Post processor that extracts the audio track of a downloaded
	video into a standalone audio file using ffprobe/ffmpeg.

	preferredcodec may be 'best' (default), 'aac' or 'mp3'.
	"""

	def __init__(self, downloader=None, preferredcodec=None):
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec

	@staticmethod
	def get_audio_codec(path):
		"""Return the audio codec name of the file at path, or None if
		ffprobe is unavailable/fails or no audio stream is found."""
		try:
			cmd = ['ffprobe', '-show_streams', '--', path]
			handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
			output = handle.communicate()[0]
			if handle.wait() != 0:
				return None
		except (IOError, OSError):
			return None
		audio_codec = None
		# ffprobe prints codec_name= before codec_type= within a stream
		# section: remember the last codec name seen and report it once
		# the stream turns out to be an audio stream.
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Run ffmpeg to copy/transcode the audio; True on success."""
		try:
			cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
			ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
			return (ret == 0)
		except (IOError, OSError):
			return False

	def run(self, information):
		"""Extract audio from information['filepath'], delete the video
		and return the information dict pointing at the audio file.
		Returns None (stopping the PP chain) on any failure."""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
			if filecodec == 'aac' or filecodec == 'mp3':
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					# Raw AAC needs an ADTS container to be playable
					more_opts = ['-f', 'adts']
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = ['-ab', '128k']
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = ['-ab', '128k']
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']

		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		status = self.run_ffmpeg(path, new_path, acodec, more_opts)

		if not status:
			self._downloader.to_stderr(u'WARNING: error running ffmpeg')
			return None

		# Try to update the date time for extracted audio file.
		if information.get('filetime') is not None:
			try:
				os.utime(new_path, (time.time(), information['filetime']))
			except (IOError, OSError):
				# Fixed: was a bare "except:", which also swallowed
				# KeyboardInterrupt/SystemExit; only filesystem errors
				# are expected from os.utime.
				self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

		try:
			os.remove(path)
		except (IOError, OSError):
			self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
			return None

		information['filepath'] = new_path
		return information
3382
3383
def updateSelf(downloader, filename):
	''' Update the program file with the latest version from the repository '''
	# Note: downloader only used for options
	if not os.access(filename, os.W_OK):
		sys.exit('ERROR: no write permissions on %s' % filename)

	downloader.to_screen('Updating to latest version...')

	try:
		# Fixed: urlopen() used to be inside the try/finally, so a failed
		# connection left "urlh" unbound and the finally clause raised a
		# NameError that masked the real error instead of exiting cleanly.
		urlh = urllib.urlopen(UPDATE_URL)
		try:
			newcontent = urlh.read()
		finally:
			urlh.close()
	except (IOError, OSError):
		sys.exit('ERROR: unable to download latest version')

	try:
		outf = open(filename, 'wb')
		try:
			outf.write(newcontent)
		finally:
			outf.close()
	except (IOError, OSError):
		sys.exit('ERROR: unable to overwrite current version')

	downloader.to_screen('Updated youtube-dl. Restart to use the new version.')
3411
def parseOpts():
	"""Build the optparse command-line parser and parse sys.argv.

	Returns a (parser, opts, args) triple so callers can both read the
	options and report usage errors through the parser.
	"""
	# Deferred import: optparse is only needed when parsing options.
	# (An unused "import getpass" was removed.)
	import optparse

	def _format_option_string(option):
		''' ('-o', '--option') -> -o, --format METAVAR'''

		opts = []

		if option._short_opts: opts.append(option._short_opts[0])
		if option._long_opts: opts.append(option._long_opts[0])
		if len(opts) > 1: opts.insert(1, ', ')

		if option.takes_value(): opts.append(' %s' % option.metavar)

		return "".join(opts)

	def _find_term_columns():
		# Width of the terminal, from $COLUMNS or "stty size"; None if
		# it cannot be determined.
		columns = os.environ.get('COLUMNS', None)
		if columns:
			try:
				return int(columns)
			except ValueError:
				# Fixed: a malformed COLUMNS value used to crash the
				# program at startup; fall through to stty instead.
				pass

		try:
			sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			out,err = sp.communicate()
			return int(out.split()[1])
		except (IOError, OSError, ValueError, IndexError):
			# Fixed: was a bare "except:", which also swallowed
			# KeyboardInterrupt/SystemExit; catch only the failures
			# the stty probe can actually produce.
			pass
		return None

	max_width = 80
	max_help_position = 80

	# No need to wrap help messages if we're on a wide console
	columns = _find_term_columns()
	if columns: max_width = columns

	fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
	fmt.format_option_strings = _format_option_string

	kw = {
		'version'   : __version__,
		'formatter' : fmt,
		'usage' : '%prog [options] url [url...]',
		'conflict_handler' : 'resolve',
	}

	parser = optparse.OptionParser(**kw)

	# option groups
	general        = optparse.OptionGroup(parser, 'General Options')
	selection      = optparse.OptionGroup(parser, 'Video Selection')
	authentication = optparse.OptionGroup(parser, 'Authentication Options')
	video_format   = optparse.OptionGroup(parser, 'Video Format Options')
	postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
	filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
	verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

	general.add_option('-h', '--help',
			action='help', help='print this help text and exit')
	general.add_option('-v', '--version',
			action='version', help='print program version and exit')
	general.add_option('-U', '--update',
			action='store_true', dest='update_self', help='update this program to latest version')
	general.add_option('-i', '--ignore-errors',
			action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
	general.add_option('-r', '--rate-limit',
			dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
	general.add_option('-R', '--retries',
			dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
	general.add_option('--dump-user-agent',
			action='store_true', dest='dump_user_agent',
			help='display the current browser identification', default=False)
	general.add_option('--list-extractors',
			action='store_true', dest='list_extractors',
			help='List all supported extractors and the URLs they would handle', default=False)

	selection.add_option('--playlist-start',
			dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
	selection.add_option('--playlist-end',
			dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
	selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
	selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')

	authentication.add_option('-u', '--username',
			dest='username', metavar='USERNAME', help='account username')
	authentication.add_option('-p', '--password',
			dest='password', metavar='PASSWORD', help='account password')
	authentication.add_option('-n', '--netrc',
			action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)


	video_format.add_option('-f', '--format',
			action='store', dest='format', metavar='FORMAT', help='video format code')
	video_format.add_option('--all-formats',
			action='store_const', dest='format', help='download all available video formats', const='-1')
	video_format.add_option('--max-quality',
			action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')


	verbosity.add_option('-q', '--quiet',
			action='store_true', dest='quiet', help='activates quiet mode', default=False)
	verbosity.add_option('-s', '--simulate',
			action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
	verbosity.add_option('--skip-download',
			action='store_true', dest='skip_download', help='do not download the video', default=False)
	verbosity.add_option('-g', '--get-url',
			action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
	verbosity.add_option('-e', '--get-title',
			action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
	verbosity.add_option('--get-thumbnail',
			action='store_true', dest='getthumbnail',
			help='simulate, quiet but print thumbnail URL', default=False)
	verbosity.add_option('--get-description',
			action='store_true', dest='getdescription',
			help='simulate, quiet but print video description', default=False)
	verbosity.add_option('--get-filename',
			action='store_true', dest='getfilename',
			help='simulate, quiet but print output filename', default=False)
	verbosity.add_option('--get-format',
			action='store_true', dest='getformat',
			help='simulate, quiet but print output format', default=False)
	verbosity.add_option('--no-progress',
			action='store_true', dest='noprogress', help='do not print progress bar', default=False)
	verbosity.add_option('--console-title',
			action='store_true', dest='consoletitle',
			help='display progress in console titlebar', default=False)


	filesystem.add_option('-t', '--title',
			action='store_true', dest='usetitle', help='use title in file name', default=False)
	filesystem.add_option('-l', '--literal',
			action='store_true', dest='useliteral', help='use literal title in file name', default=False)
	filesystem.add_option('-A', '--auto-number',
			action='store_true', dest='autonumber',
			help='number downloaded files starting from 00000', default=False)
	filesystem.add_option('-o', '--output',
			dest='outtmpl', metavar='TEMPLATE', help='output filename template')
	filesystem.add_option('-a', '--batch-file',
			dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
	filesystem.add_option('-w', '--no-overwrites',
			action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
	filesystem.add_option('-c', '--continue',
			action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
	filesystem.add_option('--cookies',
			dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
	filesystem.add_option('--no-part',
			action='store_true', dest='nopart', help='do not use .part files', default=False)
	filesystem.add_option('--no-mtime',
			action='store_false', dest='updatetime',
			help='do not use the Last-modified header to set the file modification time', default=True)
	filesystem.add_option('--write-description',
			action='store_true', dest='writedescription',
			help='write video description to a .description file', default=False)
	filesystem.add_option('--write-info-json',
			action='store_true', dest='writeinfojson',
			help='write video metadata to a .info.json file', default=False)


	postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
			help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
	postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
			help='"best", "aac" or "mp3"; best by default')


	parser.add_option_group(general)
	parser.add_option_group(selection)
	parser.add_option_group(filesystem)
	parser.add_option_group(verbosity)
	parser.add_option_group(video_format)
	parser.add_option_group(authentication)
	parser.add_option_group(postproc)

	opts, args = parser.parse_args()

	return parser, opts, args
3589
def gen_extractors():
	"""Instantiate every supported information extractor and return them as a list.

	Ordering is significant: a URL is handed to the first extractor in the
	list that declares itself suitable, so the catch-all GenericIE must stay
	at the very end. The YouTube/Google/Yahoo instances are created first
	because their search/playlist/user extractors wrap them.
	"""
	yt = YoutubeIE()
	goog = GoogleIE()
	yah = YahooIE()

	extractors = [
		yt,
		MetacafeIE(yt),
		DailymotionIE(),
		YoutubePlaylistIE(yt),
		YoutubeUserIE(yt),
		YoutubeSearchIE(yt),
		goog,
		GoogleSearchIE(goog),
		PhotobucketIE(),
		yah,
		YahooSearchIE(yah),
		DepositFilesIE(),
		FacebookIE(),
		BlipTVIE(),
		VimeoIE(),
		MyVideoIE(),
		ComedyCentralIE(),
		EscapistIE(),

		GenericIE()
	]
	return extractors
3619
3620 def main():
3621         parser, opts, args = parseOpts()
3622
3623         # Open appropriate CookieJar
3624         if opts.cookiefile is None:
3625                 jar = cookielib.CookieJar()
3626         else:
3627                 try:
3628                         jar = cookielib.MozillaCookieJar(opts.cookiefile)
3629                         if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3630                                 jar.load()
3631                 except (IOError, OSError), err:
3632                         sys.exit(u'ERROR: unable to open cookie file')
3633
3634         # Dump user agent
3635         if opts.dump_user_agent:
3636                 print std_headers['User-Agent']
3637                 sys.exit(0)
3638
3639         # Batch file verification
3640         batchurls = []
3641         if opts.batchfile is not None:
3642                 try:
3643                         if opts.batchfile == '-':
3644                                 batchfd = sys.stdin
3645                         else:
3646                                 batchfd = open(opts.batchfile, 'r')
3647                         batchurls = batchfd.readlines()
3648                         batchurls = [x.strip() for x in batchurls]
3649                         batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3650                 except IOError:
3651                         sys.exit(u'ERROR: batch file could not be read')
3652         all_urls = batchurls + args
3653
3654         # General configuration
3655         cookie_processor = urllib2.HTTPCookieProcessor(jar)
3656         opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
3657         urllib2.install_opener(opener)
3658         socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3659
3660         extractors = gen_extractors()
3661
3662         if opts.list_extractors:
3663                 for ie in extractors:
3664                         print(ie.IE_NAME)
3665                         matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
3666                         all_urls = filter(lambda url: url not in matchedUrls, all_urls)
3667                         for mu in matchedUrls:
3668                                 print(u'  ' + mu)
3669                 sys.exit(0)
3670
3671         # Conflicting, missing and erroneous options
3672         if opts.usenetrc and (opts.username is not None or opts.password is not None):
3673                 parser.error(u'using .netrc conflicts with giving username/password')
3674         if opts.password is not None and opts.username is None:
3675                 parser.error(u'account username missing')
3676         if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3677                 parser.error(u'using output template conflicts with using title, literal title or auto number')
3678         if opts.usetitle and opts.useliteral:
3679                 parser.error(u'using title conflicts with using literal title')
3680         if opts.username is not None and opts.password is None:
3681                 opts.password = getpass.getpass(u'Type account password and press return:')
3682         if opts.ratelimit is not None:
3683                 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3684                 if numeric_limit is None:
3685                         parser.error(u'invalid rate limit specified')
3686                 opts.ratelimit = numeric_limit
3687         if opts.retries is not None:
3688                 try:
3689                         opts.retries = long(opts.retries)
3690                 except (TypeError, ValueError), err:
3691                         parser.error(u'invalid retry count specified')
3692         try:
3693                 opts.playliststart = int(opts.playliststart)
3694                 if opts.playliststart <= 0:
3695                         raise ValueError(u'Playlist start must be positive')
3696         except (TypeError, ValueError), err:
3697                 parser.error(u'invalid playlist start number specified')
3698         try:
3699                 opts.playlistend = int(opts.playlistend)
3700                 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3701                         raise ValueError(u'Playlist end must be greater than playlist start')
3702         except (TypeError, ValueError), err:
3703                 parser.error(u'invalid playlist end number specified')
3704         if opts.extractaudio:
3705                 if opts.audioformat not in ['best', 'aac', 'mp3']:
3706                         parser.error(u'invalid audio format specified')
3707
3708         # File downloader
3709         fd = FileDownloader({
3710                 'usenetrc': opts.usenetrc,
3711                 'username': opts.username,
3712                 'password': opts.password,
3713                 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
3714                 'forceurl': opts.geturl,
3715                 'forcetitle': opts.gettitle,
3716                 'forcethumbnail': opts.getthumbnail,
3717                 'forcedescription': opts.getdescription,
3718                 'forcefilename': opts.getfilename,
3719                 'forceformat': opts.getformat,
3720                 'simulate': opts.simulate,
3721                 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
3722                 'format': opts.format,
3723                 'format_limit': opts.format_limit,
3724                 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3725                         or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3726                         or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3727                         or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3728                         or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3729                         or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3730                         or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3731                         or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3732                         or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3733                         or u'%(id)s.%(ext)s'),
3734                 'ignoreerrors': opts.ignoreerrors,
3735                 'ratelimit': opts.ratelimit,
3736                 'nooverwrites': opts.nooverwrites,
3737                 'retries': opts.retries,
3738                 'continuedl': opts.continue_dl,
3739                 'noprogress': opts.noprogress,
3740                 'playliststart': opts.playliststart,
3741                 'playlistend': opts.playlistend,
3742                 'logtostderr': opts.outtmpl == '-',
3743                 'consoletitle': opts.consoletitle,
3744                 'nopart': opts.nopart,
3745                 'updatetime': opts.updatetime,
3746                 'writedescription': opts.writedescription,
3747                 'writeinfojson': opts.writeinfojson,
3748                 'matchtitle': opts.matchtitle,
3749                 'rejecttitle': opts.rejecttitle,
3750                 })
3751         for extractor in extractors:
3752                 fd.add_info_extractor(extractor)
3753
3754         # PostProcessors
3755         if opts.extractaudio:
3756                 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
3757
3758         # Update version
3759         if opts.update_self:
3760                 updateSelf(fd, sys.argv[0])
3761
3762         # Maybe do nothing
3763         if len(all_urls) < 1:
3764                 if not opts.update_self:
3765                         parser.error(u'you must provide at least one URL')
3766                 else:
3767                         sys.exit()
3768         retcode = fd.download(all_urls)
3769
3770         # Dump cookie jar if requested
3771         if opts.cookiefile is not None:
3772                 try:
3773                         jar.save()
3774                 except (IOError, OSError), err:
3775                         sys.exit(u'ERROR: unable to save cookie jar')
3776
3777         sys.exit(retcode)
3778
3779
if __name__ == '__main__':
	try:
		main()
	# Map the downloader's own exceptions to clean process exit codes
	# instead of tracebacks. DownloadError has already been reported to
	# the user by the downloader, so exit silently with status 1.
	except DownloadError:
		sys.exit(1)
	# Fixed output name (-o without templates) cannot hold several files.
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	# Ctrl-C: leading newline keeps the message off the progress line.
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')
3789
3790 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: