# New option --skip-download (Closes #162)
# [youtube-dl.git] / youtube-dl
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Contributors, in rough order of first contribution.
__author__  = (
        'Ricardo Garcia Gonzalez',
        'Danny Colligan',
        'Benjamin Johnson',
        'Vasyl\' Vavrychuk',
        'Witold Baryluk',
        'Paweł Paprota',
        'Gergely Imreh',
        'Rogério Brito',
        'Philipp Hagemeister',
        'Sören Schulze',
        )

__license__ = 'Public Domain'
# Date-based version string (YYYY.MM.DD).
__version__ = '2011.09.15'

# Canonical location of the newest version of this script
# (presumably fetched by a self-update feature — confirm against the
# rest of the file, which is not fully visible here).
UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'

22 import cookielib
23 import datetime
24 import gzip
25 import htmlentitydefs
26 import HTMLParser
27 import httplib
28 import locale
29 import math
30 import netrc
31 import os
32 import os.path
33 import re
34 import socket
35 import string
36 import subprocess
37 import sys
38 import time
39 import urllib
40 import urllib2
41 import warnings
42 import zlib
43
44 if os.name == 'nt':
45         import ctypes
46
47 try:
48         import email.utils
49 except ImportError: # Python 2.4
50         import email.Utils
51 try:
52         import cStringIO as StringIO
53 except ImportError:
54         import StringIO
55
56 # parse_qs was moved from the cgi module to the urlparse module recently.
57 try:
58         from urlparse import parse_qs
59 except ImportError:
60         from cgi import parse_qs
61
62 try:
63         import lxml.etree
64 except ImportError:
65         pass # Handled below
66
67 try:
68         import xml.etree.ElementTree
69 except ImportError: # Python<2.5: Not officially supported, but let it slip
70         warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
71
# Standard HTTP headers added to every request by YoutubeDLHandler.http_request.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-us,en;q=0.5',
}

# Characters considered safe in "simple" titles: ASCII letters and digits only.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
81
try:
        import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
        import re
        class json(object):
                """Minimal drop-in replacement for the stdlib json module.

                Only implements loads(); used on Python < 2.6 where the json
                module does not exist.
                """
                @staticmethod
                def loads(s):
                        """Parse a UTF-8 encoded JSON document and return the value."""
                        # Work on unicode internally; the input is UTF-8 bytes.
                        s = s.decode('UTF-8')
                        def raiseError(msg, i):
                                # Include position and remaining input for debugging.
                                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
                        def skipSpace(i, expectMore=True):
                                # Advance past JSON whitespace; optionally require
                                # that more input follows.
                                while i < len(s) and s[i] in ' \t\r\n':
                                        i += 1
                                if expectMore:
                                        if i >= len(s):
                                                raiseError('Premature end', i)
                                return i
                        def decodeEscape(match):
                                # Translate one backslash escape (from parseString's
                                # regexp) into the character it denotes.
                                esc = match.group(1)
                                _STATIC = {
                                        '"': '"',
                                        '\\': '\\',
                                        '/': '/',
                                        'b': unichr(0x8),
                                        'f': unichr(0xc),
                                        'n': '\n',
                                        'r': '\r',
                                        't': '\t',
                                }
                                if esc in _STATIC:
                                        return _STATIC[esc]
                                if esc[0] == 'u':
                                        if len(esc) == 1+4:
                                                # Plain \uXXXX escape.
                                                return unichr(int(esc[1:5], 16))
                                        if len(esc) == 5+6 and esc[5:7] == '\\u':
                                                # UTF-16 surrogate pair \uD8xx\uDCxx
                                                # combined into one code point.
                                                hi = int(esc[1:5], 16)
                                                low = int(esc[7:11], 16)
                                                return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                                raise ValueError('Unknown escape ' + str(esc))
                        def parseString(i):
                                i += 1
                                e = i
                                while True:
                                        e = s.index('"', e)
                                        # Count preceding backslashes: an odd number
                                        # means this quote is escaped, keep scanning.
                                        bslashes = 0
                                        while s[e-bslashes-1] == '\\':
                                                bslashes += 1
                                        if bslashes % 2 == 1:
                                                e += 1
                                                continue
                                        break
                                # Surrogate pairs first, then single \uXXXX, then
                                # one-character escapes.
                                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                                stri = rexp.sub(decodeEscape, s[i:e])
                                return (e+1,stri)
                        def parseObj(i):
                                i += 1
                                res = {}
                                i = skipSpace(i)
                                if s[i] == '}': # Empty dictionary
                                        return (i+1,res)
                                while True:
                                        if s[i] != '"':
                                                raiseError('Expected a string object key', i)
                                        i,key = parseString(i)
                                        i = skipSpace(i)
                                        if i >= len(s) or s[i] != ':':
                                                raiseError('Expected a colon', i)
                                        i,val = parse(i+1)
                                        res[key] = val
                                        i = skipSpace(i)
                                        if s[i] == '}':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected comma or closing curly brace', i)
                                        i = skipSpace(i+1)
                        def parseArray(i):
                                res = []
                                i = skipSpace(i+1)
                                if s[i] == ']': # Empty array
                                        return (i+1,res)
                                while True:
                                        i,val = parse(i)
                                        res.append(val)
                                        i = skipSpace(i) # Raise exception if premature end
                                        if s[i] == ']':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected a comma or closing bracket', i)
                                        i = skipSpace(i+1)
                        def parseDiscrete(i):
                                # Literals true / false / null.
                                for k,v in {'true': True, 'false': False, 'null': None}.items():
                                        if s.startswith(k, i):
                                                return (i+len(k), v)
                                raiseError('Not a boolean (or null)', i)
                        def parseNumber(i):
                                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                                if mobj is None:
                                        raiseError('Not a number', i)
                                nums = mobj.group(1)
                                # A fraction or exponent makes it a float; otherwise int.
                                if '.' in nums or 'e' in nums or 'E' in nums:
                                        return (i+len(nums), float(nums))
                                return (i+len(nums), int(nums))
                        # Dispatch on the first character; anything unknown is
                        # tried as a number.
                        CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
                        def parse(i):
                                i = skipSpace(i)
                                i,res = CHARMAP.get(s[i], parseNumber)(i)
                                i = skipSpace(i, False)
                                return (i,res)
                        i,res = parse(0)
                        if i < len(s):
                                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
                        return res

def preferredencoding():
        """Get preferred encoding.

        Returns the best encoding scheme for the system, based on
        locale.getpreferredencoding() and some further tweaks.
        """
        try:
                pref = locale.getpreferredencoding()
                # Verify the reported encoding is actually usable before
                # trusting it; some platforms report bogus values.
                u'TEST'.encode(pref)
        except:
                pref = 'UTF-8'
        return pref
210
211
212 def htmlentity_transform(matchobj):
213         """Transforms an HTML entity to a Unicode character.
214
215         This function receives a match object and is intended to be used with
216         the re.sub() function.
217         """
218         entity = matchobj.group(1)
219
220         # Known non-numeric HTML entity
221         if entity in htmlentitydefs.name2codepoint:
222                 return unichr(htmlentitydefs.name2codepoint[entity])
223
224         # Unicode character
225         mobj = re.match(ur'(?u)#(x?\d+)', entity)
226         if mobj is not None:
227                 numstr = mobj.group(1)
228                 if numstr.startswith(u'x'):
229                         base = 16
230                         numstr = u'0%s' % numstr
231                 else:
232                         base = 10
233                 return unichr(long(numstr, base))
234
235         # Unknown entity in name, return its literal representation
236         return (u'&%s;' % entity)
237
238
239 def sanitize_title(utitle):
240         """Sanitizes a video title so it could be used as part of a filename."""
241         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
242         return utitle.replace(unicode(os.sep), u'%')
243
244
245 def sanitize_open(filename, open_mode):
246         """Try to open the given filename, and slightly tweak it if this fails.
247
248         Attempts to open the given filename. If this fails, it tries to change
249         the filename slightly, step by step, until it's either able to open it
250         or it fails and raises a final exception, like the standard open()
251         function.
252
253         It returns the tuple (stream, definitive_file_name).
254         """
255         try:
256                 if filename == u'-':
257                         if sys.platform == 'win32':
258                                 import msvcrt
259                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
260                         return (sys.stdout, filename)
261                 stream = open(filename, open_mode)
262                 return (stream, filename)
263         except (IOError, OSError), err:
264                 # In case of error, try to remove win32 forbidden chars
265                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
266
267                 # An exception here should be caught in the caller
268                 stream = open(filename, open_mode)
269                 return (stream, filename)
270
271
def timeconvert(timestr):
        """Convert RFC 2822 defined time string into system timestamp"""
        parsed = email.utils.parsedate_tz(timestr)
        if parsed is None:
                # Unparseable date string: report "unknown" rather than raise.
                return None
        return email.utils.mktime_tz(parsed)
279
280
class DownloadError(Exception):
        """Download Error exception.

        Thrown by FileDownloader objects that are not configured to
        continue on errors; carries the appropriate error message.
        """
289
290
class SameFileError(Exception):
        """Same File exception.

        Thrown by FileDownloader objects when they detect that multiple
        files would have to be downloaded to the same file on disk.
        """
298
299
class PostProcessingError(Exception):
        """Post Processing exception.

        Raised by a PostProcessor's .run() method to indicate an error
        in the postprocessing task.
        """
307
308
class UnavailableVideoError(Exception):
        """Unavailable Format exception.

        Thrown when a video is requested in a format that is not
        available for that video.
        """
316
317
class ContentTooShortError(Exception):
        """Content Too Short exception.

        This exception may be raised by FileDownloader objects when a file they
        download is too small for what the server announced first, indicating
        the connection was probably interrupted.
        """
        # Both in bytes
        downloaded = None
        expected = None

        def __init__(self, downloaded, expected):
                # Initialize the base class so str(exc) is informative instead
                # of the empty string the old code produced.
                Exception.__init__(self, 'Downloaded %s bytes, expected %s bytes' % (downloaded, expected))
                self.downloaded = downloaded
                self.expected = expected
332
333
class YoutubeDLHandler(urllib2.HTTPHandler):
        """Handler for HTTP requests and responses.

        This class, when installed with an OpenerDirector, automatically adds
        the standard headers to every HTTP request and handles gzipped and
        deflated responses from web servers. If compression is to be avoided in
        a particular request, the original request in the program code only has
        to include the HTTP header "Youtubedl-No-Compression", which will be
        removed before making the real request.

        Part of this code was copied from:

        http://techknack.net/python-urllib2-handlers/

        Andrew Rowls, the author of that code, agreed to release it to the
        public domain.
        """

        @staticmethod
        def deflate(data):
                # Some servers send raw deflate data without the zlib header;
                # try the raw form first and fall back to a standard stream.
                try:
                        return zlib.decompress(data, -zlib.MAX_WBITS)
                except zlib.error:
                        return zlib.decompress(data)

        @staticmethod
        def addinfourl_wrapper(stream, headers, url, code):
                # Newer urllib2 (with addinfourl.getcode) accepts the response
                # code as a constructor argument; older versions need it set
                # after construction.
                if hasattr(urllib2.addinfourl, 'getcode'):
                        return urllib2.addinfourl(stream, headers, url, code)
                ret = urllib2.addinfourl(stream, headers, url)
                ret.code = code
                return ret

        def http_request(self, req):
                # Force the standard headers, replacing any existing values.
                for h in std_headers:
                        if h in req.headers:
                                del req.headers[h]
                        req.add_header(h, std_headers[h])
                # NOTE(review): the capitalized spellings checked below
                # presumably match how urllib2 stores header names — confirm.
                if 'Youtubedl-no-compression' in req.headers:
                        if 'Accept-encoding' in req.headers:
                                del req.headers['Accept-encoding']
                        del req.headers['Youtubedl-no-compression']
                return req

        def http_response(self, req, resp):
                old_resp = resp
                # gzip: wrap the raw body in a decompressing file object,
                # preserving the original headers, URL, code and msg.
                if resp.headers.get('Content-encoding', '') == 'gzip':
                        gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
                        resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
                        resp.msg = old_resp.msg
                # deflate: decompress eagerly and serve from memory.
                if resp.headers.get('Content-encoding', '') == 'deflate':
                        gz = StringIO.StringIO(self.deflate(resp.read()))
                        resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
                        resp.msg = old_resp.msg
                return resp

392
393 class FileDownloader(object):
394         """File Downloader class.
395
396         File downloader objects are the ones responsible of downloading the
397         actual video file and writing it to disk if the user has requested
398         it, among some other tasks. In most cases there should be one per
399         program. As, given a video URL, the downloader doesn't know how to
400         extract all the needed information, task that InfoExtractors do, it
401         has to pass the URL to one of them.
402
403         For this, file downloader objects have a method that allows
404         InfoExtractors to be registered in a given order. When it is passed
405         a URL, the file downloader handles it to the first InfoExtractor it
406         finds that reports being able to handle it. The InfoExtractor extracts
407         all the information about the video or videos the URL refers to, and
408         asks the FileDownloader to process the video information, possibly
409         downloading the video.
410
411         File downloaders accept a lot of parameters. In order not to saturate
412         the object constructor with arguments, it receives a dictionary of
413         options instead. These options are available through the params
414         attribute for the InfoExtractors to use. The FileDownloader also
415         registers itself as the downloader in charge for the InfoExtractors
416         that are added to it, so this is a "mutual registration".
417
418         Available options:
419
420         username:         Username for authentication purposes.
421         password:         Password for authentication purposes.
422         usenetrc:         Use netrc for authentication instead.
423         quiet:            Do not print messages to stdout.
424         forceurl:         Force printing final URL.
425         forcetitle:       Force printing title.
426         forcethumbnail:   Force printing thumbnail URL.
427         forcedescription: Force printing description.
428         forcefilename:    Force printing final filename.
429         simulate:         Do not download the video files.
430         format:           Video format code.
431         format_limit:     Highest quality format to try.
432         outtmpl:          Template for output names.
433         ignoreerrors:     Do not stop on download errors.
434         ratelimit:        Download speed limit, in bytes/sec.
435         nooverwrites:     Prevent overwriting files.
436         retries:          Number of times to retry for HTTP error 5xx
437         continuedl:       Try to continue downloads if possible.
438         noprogress:       Do not print the progress bar.
439         playliststart:    Playlist item to start at.
440         playlistend:      Playlist item to end at.
441         matchtitle:       Download only matching titles.
442         rejecttitle:      Reject downloads for matching titles.
443         logtostderr:      Log messages to stderr instead of stdout.
444         consoletitle:     Display progress in console window's titlebar.
445         nopart:           Do not use temporary .part files.
446         updatetime:       Use the Last-modified header to set output file timestamps.
447         writedescription: Write the video description to a .description file
448         writeinfojson:    Write the video description to a .info.json file
449         """
450
        # Class-level defaults; the real values are assigned per instance
        # in __init__().
        params = None              # Dictionary of options (see class docstring).
        _ies = []                  # Registered InfoExtractor objects.
        _pps = []                  # Registered PostProcessor objects.
        _download_retcode = None   # Return code reported to the caller (0 = OK).
        _num_downloads = None      # Ordinal counter, feeds %(autonumber)s.
        _screen_file = None        # Output stream (stdout, or stderr with logtostderr).
457
458         def __init__(self, params):
459                 """Create a FileDownloader object with the given options."""
460                 self._ies = []
461                 self._pps = []
462                 self._download_retcode = 0
463                 self._num_downloads = 0
464                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
465                 self.params = params
466
467         @staticmethod
468         def format_bytes(bytes):
469                 if bytes is None:
470                         return 'N/A'
471                 if type(bytes) is str:
472                         bytes = float(bytes)
473                 if bytes == 0.0:
474                         exponent = 0
475                 else:
476                         exponent = long(math.log(bytes, 1024.0))
477                 suffix = 'bkMGTPEZY'[exponent]
478                 converted = float(bytes) / float(1024 ** exponent)
479                 return '%.2f%s' % (converted, suffix)
480
481         @staticmethod
482         def calc_percent(byte_counter, data_len):
483                 if data_len is None:
484                         return '---.-%'
485                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
486
487         @staticmethod
488         def calc_eta(start, now, total, current):
489                 if total is None:
490                         return '--:--'
491                 dif = now - start
492                 if current == 0 or dif < 0.001: # One millisecond
493                         return '--:--'
494                 rate = float(current) / dif
495                 eta = long((float(total) - float(current)) / rate)
496                 (eta_mins, eta_secs) = divmod(eta, 60)
497                 if eta_mins > 99:
498                         return '--:--'
499                 return '%02d:%02d' % (eta_mins, eta_secs)
500
501         @staticmethod
502         def calc_speed(start, now, bytes):
503                 dif = now - start
504                 if bytes == 0 or dif < 0.001: # One millisecond
505                         return '%10s' % '---b/s'
506                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
507
508         @staticmethod
509         def best_block_size(elapsed_time, bytes):
510                 new_min = max(bytes / 2.0, 1.0)
511                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
512                 if elapsed_time < 0.001:
513                         return long(new_max)
514                 rate = bytes / elapsed_time
515                 if rate > new_max:
516                         return long(new_max)
517                 if rate < new_min:
518                         return long(new_min)
519                 return long(rate)
520
521         @staticmethod
522         def parse_bytes(bytestr):
523                 """Parse a string indicating a byte quantity into a long integer."""
524                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
525                 if matchobj is None:
526                         return None
527                 number = float(matchobj.group(1))
528                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
529                 return long(round(number * multiplier))
530
        def add_info_extractor(self, ie):
                """Add an InfoExtractor object to the end of the list."""
                self._ies.append(ie)
                # Mutual registration: the extractor gets a back-reference.
                ie.set_downloader(self)
535
        def add_post_processor(self, pp):
                """Add a PostProcessor object to the end of the chain."""
                self._pps.append(pp)
                # Mutual registration: the postprocessor gets a back-reference.
                pp.set_downloader(self)
540
        def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
                """Print message to stdout if not in quiet mode."""
                try:
                        if not self.params.get('quiet', False):
                                # terminator supplies the newline; the trailing comma on
                                # the print statement suppresses print's own newline.
                                terminator = [u'\n', u''][skip_eol]
                                print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
                        # Flush even in quiet mode.
                        self._screen_file.flush()
                except (UnicodeEncodeError), err:
                        if not ignore_encoding_errors:
                                raise
551
        def to_stderr(self, message):
                """Print message to stderr (always, regardless of quiet mode)."""
                print >>sys.stderr, message.encode(preferredencoding())
555
        def to_cons_title(self, message):
                """Set console/terminal window title to message."""
                if not self.params.get('consoletitle', False):
                        return
                if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
                        # c_wchar_p() might not be necessary if `message` is
                        # already of type unicode()
                        ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
                elif 'TERM' in os.environ:
                        # xterm-style OSC escape sequence: ESC ] 0 ; title BEL.
                        sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
566
567         def fixed_template(self):
568                 """Checks if the output template is fixed."""
569                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
570
571         def trouble(self, message=None):
572                 """Determine action to take when a download problem appears.
573
574                 Depending on if the downloader has been configured to ignore
575                 download errors or not, this method may throw an exception or
576                 not when errors are found, after printing the message.
577                 """
578                 if message is not None:
579                         self.to_stderr(message)
580                 if not self.params.get('ignoreerrors', False):
581                         raise DownloadError(message)
582                 self._download_retcode = 1
583
584         def slow_down(self, start_time, byte_counter):
585                 """Sleep if the download speed is over the rate limit."""
586                 rate_limit = self.params.get('ratelimit', None)
587                 if rate_limit is None or byte_counter == 0:
588                         return
589                 now = time.time()
590                 elapsed = now - start_time
591                 if elapsed <= 0.0:
592                         return
593                 speed = float(byte_counter) / elapsed
594                 if speed > rate_limit:
595                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
596
597         def temp_name(self, filename):
598                 """Returns a temporary filename for the given filename."""
599                 if self.params.get('nopart', False) or filename == u'-' or \
600                                 (os.path.exists(filename) and not os.path.isfile(filename)):
601                         return filename
602                 return filename + u'.part'
603
604         def undo_temp_name(self, filename):
605                 if filename.endswith(u'.part'):
606                         return filename[:-len(u'.part')]
607                 return filename
608
609         def try_rename(self, old_filename, new_filename):
610                 try:
611                         if old_filename == new_filename:
612                                 return
613                         os.rename(old_filename, new_filename)
614                 except (IOError, OSError), err:
615                         self.trouble(u'ERROR: unable to rename file')
616
617         def try_utime(self, filename, last_modified_hdr):
618                 """Try to set the last-modified time of the given file."""
619                 if last_modified_hdr is None:
620                         return
621                 if not os.path.isfile(filename):
622                         return
623                 timestr = last_modified_hdr
624                 if timestr is None:
625                         return
626                 filetime = timeconvert(timestr)
627                 if filetime is None:
628                         return
629                 try:
630                         os.utime(filename, (time.time(), filetime))
631                 except:
632                         pass
633
        def report_writedescription(self, descfn):
                """Report that the description file is being written."""
                self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
637
        def report_writeinfojson(self, infofn):
                """Report that the .info.json metadata file has been written."""
                self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
641
        def report_destination(self, filename):
                """Report destination filename."""
                self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
645
        def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
                """Report download progress."""
                if self.params.get('noprogress', False):
                        return
                # Leading \r redraws the same console line for a live progress bar.
                self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
                                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
                self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
                                (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
654
        def report_resuming_byte(self, resume_len):
                """Report attempt to resume at given byte."""
                self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
658
        def report_retry(self, count, retries):
                """Report retry in case of HTTP error 5xx."""
                self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
662
        def report_file_already_downloaded(self, file_name):
                """Report file has already been fully downloaded."""
                try:
                        self.to_screen(u'[download] %s has already been downloaded' % file_name)
                except (UnicodeEncodeError), err:
                        # Fall back to a filename-free message if the name
                        # cannot be encoded for the console.
                        self.to_screen(u'[download] The file has already been downloaded')
669
        def report_unable_to_resume(self):
                """Report it was impossible to resume download."""
                self.to_screen(u'[download] Unable to resume')
673
        def report_finish(self):
                """Report download finished."""
                if self.params.get('noprogress', False):
                        self.to_screen(u'[download] Download completed')
                else:
                        # The progress bar ended with \r; emit a bare newline
                        # so the final progress line is preserved.
                        self.to_screen(u'')
680
681         def increment_downloads(self):
682                 """Increment the ordinal that assigns a number to each file."""
683                 self._num_downloads += 1
684
685         def prepare_filename(self, info_dict):
686                 """Generate the output filename."""
687                 try:
688                         template_dict = dict(info_dict)
689                         template_dict['epoch'] = unicode(long(time.time()))
690                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
691                         filename = self.params['outtmpl'] % template_dict
692                         return filename
693                 except (ValueError, KeyError), err:
694                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
695                         return None
696
	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor.

		In order: forced --get-* printings, simulate-mode early exit,
		title match/reject filtering, overwrite protection, optional
		.description and .info.json side files, the download itself
		(unless skip_download is set), and the postprocessing chain.
		"""
		filename = self.prepare_filename(info_dict)
		
		# Forced printings: these run even in simulate mode and before any
		# filesystem access.
		if self.params.get('forcetitle', False):
			print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forceurl', False):
			print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
			print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcedescription', False) and 'description' in info_dict:
			print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcefilename', False) and filename is not None:
			print filename.encode(preferredencoding(), 'xmlcharrefreplace')

		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			return

		# prepare_filename already reported the template error; just bail out.
		if filename is None:
			return

		# Title filtering (--match-title / --reject-title), matched
		# case-insensitively against the console-encoded title.
		matchtitle=self.params.get('matchtitle',False)
		rejecttitle=self.params.get('rejecttitle',False)
		title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
		if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
			self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
			return
		if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
			self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
			return

		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists and will be skipped')
			return

		# Create the destination directory if the template produced one.
		try:
			dn = os.path.dirname(filename)
			if dn != '' and not os.path.exists(dn):
				os.makedirs(dn)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to create directory ' + unicode(err))
			return

		# Optional side file: <filename>.description (--write-description)
		if self.params.get('writedescription', False):
			try:
				descfn = filename + '.description'
				self.report_writedescription(descfn)
				descfile = open(descfn, 'wb')
				try:
					descfile.write(info_dict['description'].encode('utf-8'))
				finally:
					descfile.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write description file ' + descfn)
				return

		# Optional side file: <filename>.info.json (--write-info-json)
		if self.params.get('writeinfojson', False):
			infofn = filename + '.info.json'
			self.report_writeinfojson(infofn)
			# Probe that a usable json module was imported at startup
			# (Python < 2.6 may lack one).
			try:
				json.dump
			except (NameError,AttributeError):
				self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
				return
			try:
				infof = open(infofn, 'wb')
				try:
					json.dump(info_dict, infof)
				finally:
					infof.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
				return

		# The actual download, unless --skip-download was given.
		if not self.params.get('skip_download', False):
			try:
				success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
			except (OSError, IOError), err:
				raise UnavailableVideoError
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self.trouble(u'ERROR: unable to download video data: %s' % str(err))
				return
			except (ContentTooShortError, ), err:
				self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
				return

			# Postprocessing only runs after a successful download.
			if success:
				try:
					self.post_process(filename, info_dict)
				except (PostProcessingError), err:
					self.trouble(u'ERROR: postprocessing: %s' % str(err))
					return
791
792         def download(self, url_list):
793                 """Download a given list of URLs."""
794                 if len(url_list) > 1 and self.fixed_template():
795                         raise SameFileError(self.params['outtmpl'])
796
797                 for url in url_list:
798                         suitable_found = False
799                         for ie in self._ies:
800                                 # Go to next InfoExtractor if not suitable
801                                 if not ie.suitable(url):
802                                         continue
803
804                                 # Suitable InfoExtractor found
805                                 suitable_found = True
806
807                                 # Extract information from URL and process it
808                                 ie.extract(url)
809
810                                 # Suitable InfoExtractor had been found; go to next URL
811                                 break
812
813                         if not suitable_found:
814                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
815
816                 return self._download_retcode
817
818         def post_process(self, filename, ie_info):
819                 """Run the postprocessing chain on the given file."""
820                 info = dict(ie_info)
821                 info['filepath'] = filename
822                 for pp in self._pps:
823                         info = pp.run(info)
824                         if info is None:
825                                 break
826
	def _download_with_rtmpdump(self, filename, url, player_url):
		"""Download an rtmp:// stream by shelling out to rtmpdump.

		Writes to a temporary name first and renames on success.
		Returns True on success, False otherwise.
		"""
		self.report_destination(filename)
		tmpfilename = self.temp_name(filename)

		# Check for rtmpdump first: probe with -h, discarding all output.
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrupted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		# The [[], [...]][bool] construct appends the extra arguments only
		# when the condition holds (-W: SWF verification player URL).
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(tmpfilename)
			self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			# Resume (-e); -k 1 is added only after an exit code of 1.
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(tmpfilename)
			# No progress and exit code 1: give up resuming.
			if prevsize == cursize and retval == 1:
				break
			 # Some rtmp streams seem to abort after ~ 99.8%. Don't complain for those
			if prevsize == cursize and retval == 2 and cursize > 1024:
				self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
				retval = 0
				break
		if retval == 0:
			self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
			self.try_rename(tmpfilename, filename)
			return True
		else:
			self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
			return False
863
	def _do_download(self, filename, url, player_url):
		"""Download url to filename over HTTP (delegating rtmp:// URLs to
		rtmpdump).

		Handles resume (HTTP Range), bounded retries on 5xx errors, the
		416 already-complete heuristic, progress reporting and rate
		limiting.  Returns True on success (including "already
		downloaded"), False on unrecoverable errors; raises
		ContentTooShortError when the server sent fewer bytes than
		announced.
		"""
		# Check file already present
		if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
			self.report_file_already_downloaded(filename)
			return True

		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url, player_url)

		tmpfilename = self.temp_name(filename)
		stream = None
		open_mode = 'wb'

		# Do not include the Accept-Encoding header: byte counting below
		# relies on an uncompressed transfer.
		headers = {'Youtubedl-no-compression': 'True'}
		# basic_request stays Range-free for the 416 fallback probe below.
		basic_request = urllib2.Request(url, None, headers)
		request = urllib2.Request(url, None, headers)

		# Establish possible resume length
		if os.path.isfile(tmpfilename):
			resume_len = os.path.getsize(tmpfilename)
		else:
			resume_len = 0

		# Request parameters in case of being able to resume
		if self.params.get('continuedl', False) and resume_len != 0:
			self.report_resuming_byte(resume_len)
			request.add_header('Range', 'bytes=%d-' % resume_len)
			open_mode = 'ab'

		count = 0
		retries = self.params.get('retries', 0)
		while count <= retries:
			# Establish connection
			try:
				data = urllib2.urlopen(request)
				break
			except (urllib2.HTTPError, ), err:
				if (err.code < 500 or err.code >= 600) and err.code != 416:
					# Unexpected HTTP error
					raise
				elif err.code == 416:
					# Unable to resume (requested range not satisfiable)
					try:
						# Open the connection again without the range header
						data = urllib2.urlopen(basic_request)
						content_length = data.info()['Content-Length']
					except (urllib2.HTTPError, ), err:
						if err.code < 500 or err.code >= 600:
							raise
					else:
						# Examine the reported length
						if (content_length is not None and
								(resume_len - 100 < long(content_length) < resume_len + 100)):
							# The file had already been fully downloaded.
							# Explanation to the above condition: in issue #175 it was revealed that
							# YouTube sometimes adds or removes a few bytes from the end of the file,
							# changing the file size slightly and causing problems for some users. So
							# I decided to implement a suggested change and consider the file
							# completely downloaded if the file size differs less than 100 bytes from
							# the one in the hard drive.
							self.report_file_already_downloaded(filename)
							self.try_rename(tmpfilename, filename)
							return True
						else:
							# The length does not match, we start the download over
							self.report_unable_to_resume()
							open_mode = 'wb'
							break
			# Retry
			count += 1
			if count <= retries:
				self.report_retry(count, retries)

		if count > retries:
			self.trouble(u'ERROR: giving up after %s retries' % retries)
			return False

		# Expected total size: server-reported remainder plus what we
		# already have on disk.
		data_len = data.info().get('Content-length', None)
		if data_len is not None:
			data_len = long(data_len) + resume_len
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0 + resume_len
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			if len(data_block) == 0:
				break
			byte_counter += len(data_block)

			# Open file just in time: avoids creating an empty file when
			# the connection yields no data at all.
			if stream is None:
				try:
					(stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
					assert stream is not None
					filename = self.undo_temp_name(tmpfilename)
					self.report_destination(filename)
				except (OSError, IOError), err:
					self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
					return False
			try:
				stream.write(data_block)
			except (IOError, OSError), err:
				self.trouble(u'\nERROR: unable to write data: %s' % str(err))
				return False
			# Adapt the read size to the observed throughput.
			block_size = self.best_block_size(after - before, len(data_block))

			# Progress message
			percent_str = self.calc_percent(byte_counter, data_len)
			eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
			speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
			self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter - resume_len)

		if stream is None:
			self.trouble(u'\nERROR: Did not get any data blocks')
			return False
		stream.close()
		self.report_finish()
		if data_len is not None and byte_counter != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		self.try_rename(tmpfilename, filename)

		# Update file modification time
		if self.params.get('updatetime', True):
			self.try_utime(filename, data.info().get('last-modified', None))

		return True
999
1000
class InfoExtractor(object):
	"""Information Extractor class.

	An information extractor takes a URL and pulls out the metadata of
	the video (or videos) it refers to: the real video URL, the title,
	a simplified title, the uploader and so on.  The result is a
	dictionary handed to the FileDownloader, which may then download
	the video to the file system among other possible outcomes.  Each
	dictionary must include the following fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	The following fields are optional; they mainly exist so youtube-dl
	can serve as the backend for a video search function, such as the
	one in youtube2mp3, and are only consulted by the respective forced
	printing functions:

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Subclasses should override _real_initialize() and _real_extract()
	and define a _VALID_URL regexp; usually they should also be added
	to the list of extractors.
	"""

	# Class-level defaults; __init__ resets them per instance.
	_ready = False
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Return True if this IE can handle the given URL."""
		return re.match(self._VALID_URL, url) is not None

	def initialize(self):
		"""Perform one-time setup (authentication, etc); idempotent."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extract URL information and return it as a list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Attach the downloader this IE reports to."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
1069
1070
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Matches watch pages, youtu.be short links, embed/v/e URLs and bare
	# video ids; the second capture group is the video id itself.
	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
	# Visiting this URL pins the site language to English so the scraping
	# regexps used during extraction match.
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	# Machine name looked up in ~/.netrc for stored credentials.
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '45', '22', '43', '35', '34', '18', '6', '5', '17', '13']
	# Maps itag format codes to file extensions; codes absent from this
	# map presumably fall back to a default elsewhere -- TODO confirm.
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'45': 'webm',
	}
	IE_NAME = u'youtube'
1092
1093         def report_lang(self):
1094                 """Report attempt to set language."""
1095                 self._downloader.to_screen(u'[youtube] Setting language')
1096
1097         def report_login(self):
1098                 """Report attempt to log in."""
1099                 self._downloader.to_screen(u'[youtube] Logging in')
1100
1101         def report_age_confirmation(self):
1102                 """Report attempt to confirm age."""
1103                 self._downloader.to_screen(u'[youtube] Confirming age')
1104
1105         def report_video_webpage_download(self, video_id):
1106                 """Report attempt to download video webpage."""
1107                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1108
1109         def report_video_info_webpage_download(self, video_id):
1110                 """Report attempt to download video info webpage."""
1111                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1112
1113         def report_information_extraction(self, video_id):
1114                 """Report attempt to extract video information."""
1115                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1116
1117         def report_unavailable_format(self, video_id, format):
1118                 """Report extracted video URL."""
1119                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1120
1121         def report_rtmp_download(self):
1122                 """Indicate the download will use the RTMP protocol."""
1123                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1124
	def _real_initialize(self):
		"""Set the site language to English and optionally log in and
		confirm age.

		Credentials come from --username/--password or, with --netrc,
		from the 'youtube' machine entry in ~/.netrc.  Language and login
		failures are warnings; a failed age confirmation is reported as
		an error.
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language so the pages scraped later are in English.
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in by POSTing the signup-page login form.
		login_form = {
				'current_form': 'loginForm',
				'next':         '/',
				'action_login': 'Log In',
				'username':     username,
				'password':     password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the response still contains the login form, the
			# credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':             '/',
				'action_confirm':       'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return
1193
1194         def _real_extract(self, url):
1195                 # Extract video id from URL
1196                 mobj = re.match(self._VALID_URL, url)
1197                 if mobj is None:
1198                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1199                         return
1200                 video_id = mobj.group(2)
1201
1202                 # Get video webpage
1203                 self.report_video_webpage_download(video_id)
1204                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id)
1205                 try:
1206                         video_webpage = urllib2.urlopen(request).read()
1207                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1208                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1209                         return
1210
1211                 # Attempt to extract SWF player URL
1212                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1213                 if mobj is not None:
1214                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1215                 else:
1216                         player_url = None
1217
1218                 # Get video info
1219                 self.report_video_info_webpage_download(video_id)
1220                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1221                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1222                                         % (video_id, el_type))
1223                         request = urllib2.Request(video_info_url)
1224                         try:
1225                                 video_info_webpage = urllib2.urlopen(request).read()
1226                                 video_info = parse_qs(video_info_webpage)
1227                                 if 'token' in video_info:
1228                                         break
1229                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1230                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1231                                 return
1232                 if 'token' not in video_info:
1233                         if 'reason' in video_info:
1234                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1235                         else:
1236                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1237                         return
1238
1239                 # Start extracting information
1240                 self.report_information_extraction(video_id)
1241
1242                 # uploader
1243                 if 'author' not in video_info:
1244                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1245                         return
1246                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1247
1248                 # title
1249                 if 'title' not in video_info:
1250                         self._downloader.trouble(u'ERROR: unable to extract video title')
1251                         return
1252                 video_title = urllib.unquote_plus(video_info['title'][0])
1253                 video_title = video_title.decode('utf-8')
1254                 video_title = sanitize_title(video_title)
1255
1256                 # simplified title
1257                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1258                 simple_title = simple_title.strip(ur'_')
1259
1260                 # thumbnail image
1261                 if 'thumbnail_url' not in video_info:
1262                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1263                         video_thumbnail = ''
1264                 else:   # don't panic if we can't find it
1265                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1266
1267                 # upload date
1268                 upload_date = u'NA'
1269                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1270                 if mobj is not None:
1271                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1272                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1273                         for expression in format_expressions:
1274                                 try:
1275                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1276                                 except:
1277                                         pass
1278
1279                 # description
1280                 try:
1281                         lxml.etree
1282                 except NameError:
1283                         video_description = u'No description available.'
1284                         if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1285                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1286                                 if mobj is not None:
1287                                         video_description = mobj.group(1).decode('utf-8')
1288                 else:
1289                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1290                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1291                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1292                         # TODO use another parser
1293
1294                 # token
1295                 video_token = urllib.unquote_plus(video_info['token'][0])
1296
1297                 # Decide which formats to download
1298                 req_format = self._downloader.params.get('format', None)
1299
1300                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1301                         self.report_rtmp_download()
1302                         video_url_list = [(None, video_info['conn'][0])]
1303                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1304                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1305                         url_data = [parse_qs(uds) for uds in url_data_strs]
1306                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1307                         url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1308
1309                         format_limit = self._downloader.params.get('format_limit', None)
1310                         if format_limit is not None and format_limit in self._available_formats:
1311                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1312                         else:
1313                                 format_list = self._available_formats
1314                         existing_formats = [x for x in format_list if x in url_map]
1315                         if len(existing_formats) == 0:
1316                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1317                                 return
1318                         if req_format is None:
1319                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1320                         elif req_format == '-1':
1321                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1322                         else:
1323                                 # Specific format
1324                                 if req_format not in url_map:
1325                                         self._downloader.trouble(u'ERROR: requested format not available')
1326                                         return
1327                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
1328                 else:
1329                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1330                         return
1331
1332                 for format_param, video_real_url in video_url_list:
1333                         # At this point we have a new video
1334                         self._downloader.increment_downloads()
1335
1336                         # Extension
1337                         video_extension = self._video_extensions.get(format_param, 'flv')
1338
1339                         try:
1340                                 # Process video information
1341                                 self._downloader.process_info({
1342                                         'id':           video_id.decode('utf-8'),
1343                                         'url':          video_real_url.decode('utf-8'),
1344                                         'uploader':     video_uploader.decode('utf-8'),
1345                                         'upload_date':  upload_date,
1346                                         'title':        video_title,
1347                                         'stitle':       simple_title,
1348                                         'ext':          video_extension.decode('utf-8'),
1349                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1350                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1351                                         'description':  video_description,
1352                                         'player_url':   player_url,
1353                                 })
1354                         except UnavailableVideoError, err:
1355                                 self._downloader.trouble(u'\nERROR: unable to download video')
1356
1357
1358 class MetacafeIE(InfoExtractor):
1359         """Information Extractor for metacafe.com."""
1360
1361         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1362         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1363         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1364         _youtube_ie = None
1365         IE_NAME = u'metacafe'
1366
1367         def __init__(self, youtube_ie, downloader=None):
1368                 InfoExtractor.__init__(self, downloader)
1369                 self._youtube_ie = youtube_ie
1370
1371         def report_disclaimer(self):
1372                 """Report disclaimer retrieval."""
1373                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1374
1375         def report_age_confirmation(self):
1376                 """Report attempt to confirm age."""
1377                 self._downloader.to_screen(u'[metacafe] Confirming age')
1378
1379         def report_download_webpage(self, video_id):
1380                 """Report webpage download."""
1381                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1382
1383         def report_extraction(self, video_id):
1384                 """Report information extraction."""
1385                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1386
1387         def _real_initialize(self):
1388                 # Retrieve disclaimer
1389                 request = urllib2.Request(self._DISCLAIMER)
1390                 try:
1391                         self.report_disclaimer()
1392                         disclaimer = urllib2.urlopen(request).read()
1393                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1394                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1395                         return
1396
1397                 # Confirm age
1398                 disclaimer_form = {
1399                         'filters': '0',
1400                         'submit': "Continue - I'm over 18",
1401                         }
1402                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1403                 try:
1404                         self.report_age_confirmation()
1405                         disclaimer = urllib2.urlopen(request).read()
1406                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1407                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1408                         return
1409
1410         def _real_extract(self, url):
1411                 # Extract id and simplified title from URL
1412                 mobj = re.match(self._VALID_URL, url)
1413                 if mobj is None:
1414                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1415                         return
1416
1417                 video_id = mobj.group(1)
1418
1419                 # Check if video comes from YouTube
1420                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1421                 if mobj2 is not None:
1422                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1423                         return
1424
1425                 # At this point we have a new video
1426                 self._downloader.increment_downloads()
1427
1428                 simple_title = mobj.group(2).decode('utf-8')
1429
1430                 # Retrieve video webpage to extract further information
1431                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1432                 try:
1433                         self.report_download_webpage(video_id)
1434                         webpage = urllib2.urlopen(request).read()
1435                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1436                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1437                         return
1438
1439                 # Extract URL, uploader and title from webpage
1440                 self.report_extraction(video_id)
1441                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1442                 if mobj is not None:
1443                         mediaURL = urllib.unquote(mobj.group(1))
1444                         video_extension = mediaURL[-3:]
1445
1446                         # Extract gdaKey if available
1447                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1448                         if mobj is None:
1449                                 video_url = mediaURL
1450                         else:
1451                                 gdaKey = mobj.group(1)
1452                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1453                 else:
1454                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1455                         if mobj is None:
1456                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1457                                 return
1458                         vardict = parse_qs(mobj.group(1))
1459                         if 'mediaData' not in vardict:
1460                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1461                                 return
1462                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1463                         if mobj is None:
1464                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1465                                 return
1466                         mediaURL = mobj.group(1).replace('\\/', '/')
1467                         video_extension = mediaURL[-3:]
1468                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1469
1470                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1471                 if mobj is None:
1472                         self._downloader.trouble(u'ERROR: unable to extract title')
1473                         return
1474                 video_title = mobj.group(1).decode('utf-8')
1475                 video_title = sanitize_title(video_title)
1476
1477                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1478                 if mobj is None:
1479                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1480                         return
1481                 video_uploader = mobj.group(1)
1482
1483                 try:
1484                         # Process video information
1485                         self._downloader.process_info({
1486                                 'id':           video_id.decode('utf-8'),
1487                                 'url':          video_url.decode('utf-8'),
1488                                 'uploader':     video_uploader.decode('utf-8'),
1489                                 'upload_date':  u'NA',
1490                                 'title':        video_title,
1491                                 'stitle':       simple_title,
1492                                 'ext':          video_extension.decode('utf-8'),
1493                                 'format':       u'NA',
1494                                 'player_url':   None,
1495                         })
1496                 except UnavailableVideoError:
1497                         self._downloader.trouble(u'\nERROR: unable to download video')
1498
1499
1500 class DailymotionIE(InfoExtractor):
1501         """Information Extractor for Dailymotion"""
1502
1503         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1504         IE_NAME = u'dailymotion'
1505
1506         def __init__(self, downloader=None):
1507                 InfoExtractor.__init__(self, downloader)
1508
1509         def report_download_webpage(self, video_id):
1510                 """Report webpage download."""
1511                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1512
1513         def report_extraction(self, video_id):
1514                 """Report information extraction."""
1515                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1516
1517         def _real_initialize(self):
1518                 return
1519
1520         def _real_extract(self, url):
1521                 # Extract id and simplified title from URL
1522                 mobj = re.match(self._VALID_URL, url)
1523                 if mobj is None:
1524                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1525                         return
1526
1527                 # At this point we have a new video
1528                 self._downloader.increment_downloads()
1529                 video_id = mobj.group(1)
1530
1531                 simple_title = mobj.group(2).decode('utf-8')
1532                 video_extension = 'flv'
1533
1534                 # Retrieve video webpage to extract further information
1535                 request = urllib2.Request(url)
1536                 request.add_header('Cookie', 'family_filter=off')
1537                 try:
1538                         self.report_download_webpage(video_id)
1539                         webpage = urllib2.urlopen(request).read()
1540                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1541                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1542                         return
1543
1544                 # Extract URL, uploader and title from webpage
1545                 self.report_extraction(video_id)
1546                 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1547                 if mobj is None:
1548                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1549                         return
1550                 sequence = urllib.unquote(mobj.group(1))
1551                 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1552                 if mobj is None:
1553                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1554                         return
1555                 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1556
1557                 # if needed add http://www.dailymotion.com/ if relative URL
1558
1559                 video_url = mediaURL
1560
1561                 mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
1562                 if mobj is None:
1563                         self._downloader.trouble(u'ERROR: unable to extract title')
1564                         return
1565                 video_title = mobj.group(1).decode('utf-8')
1566                 video_title = sanitize_title(video_title)
1567
1568                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1569                 if mobj is None:
1570                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1571                         return
1572                 video_uploader = mobj.group(1)
1573
1574                 try:
1575                         # Process video information
1576                         self._downloader.process_info({
1577                                 'id':           video_id.decode('utf-8'),
1578                                 'url':          video_url.decode('utf-8'),
1579                                 'uploader':     video_uploader.decode('utf-8'),
1580                                 'upload_date':  u'NA',
1581                                 'title':        video_title,
1582                                 'stitle':       simple_title,
1583                                 'ext':          video_extension.decode('utf-8'),
1584                                 'format':       u'NA',
1585                                 'player_url':   None,
1586                         })
1587                 except UnavailableVideoError:
1588                         self._downloader.trouble(u'\nERROR: unable to download video')
1589
1590
1591 class GoogleIE(InfoExtractor):
1592         """Information extractor for video.google.com."""
1593
1594         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1595         IE_NAME = u'video.google'
1596
1597         def __init__(self, downloader=None):
1598                 InfoExtractor.__init__(self, downloader)
1599
1600         def report_download_webpage(self, video_id):
1601                 """Report webpage download."""
1602                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1603
1604         def report_extraction(self, video_id):
1605                 """Report information extraction."""
1606                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1607
1608         def _real_initialize(self):
1609                 return
1610
1611         def _real_extract(self, url):
1612                 # Extract id from URL
1613                 mobj = re.match(self._VALID_URL, url)
1614                 if mobj is None:
1615                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1616                         return
1617
1618                 # At this point we have a new video
1619                 self._downloader.increment_downloads()
1620                 video_id = mobj.group(1)
1621
1622                 video_extension = 'mp4'
1623
1624                 # Retrieve video webpage to extract further information
1625                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1626                 try:
1627                         self.report_download_webpage(video_id)
1628                         webpage = urllib2.urlopen(request).read()
1629                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1630                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1631                         return
1632
1633                 # Extract URL, uploader, and title from webpage
1634                 self.report_extraction(video_id)
1635                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1636                 if mobj is None:
1637                         video_extension = 'flv'
1638                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1639                 if mobj is None:
1640                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1641                         return
1642                 mediaURL = urllib.unquote(mobj.group(1))
1643                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1644                 mediaURL = mediaURL.replace('\\x26', '\x26')
1645
1646                 video_url = mediaURL
1647
1648                 mobj = re.search(r'<title>(.*)</title>', webpage)
1649                 if mobj is None:
1650                         self._downloader.trouble(u'ERROR: unable to extract title')
1651                         return
1652                 video_title = mobj.group(1).decode('utf-8')
1653                 video_title = sanitize_title(video_title)
1654                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1655
1656                 # Extract video description
1657                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1658                 if mobj is None:
1659                         self._downloader.trouble(u'ERROR: unable to extract video description')
1660                         return
1661                 video_description = mobj.group(1).decode('utf-8')
1662                 if not video_description:
1663                         video_description = 'No description available.'
1664
1665                 # Extract video thumbnail
1666                 if self._downloader.params.get('forcethumbnail', False):
1667                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1668                         try:
1669                                 webpage = urllib2.urlopen(request).read()
1670                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1671                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1672                                 return
1673                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1674                         if mobj is None:
1675                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1676                                 return
1677                         video_thumbnail = mobj.group(1)
1678                 else:   # we need something to pass to process_info
1679                         video_thumbnail = ''
1680
1681                 try:
1682                         # Process video information
1683                         self._downloader.process_info({
1684                                 'id':           video_id.decode('utf-8'),
1685                                 'url':          video_url.decode('utf-8'),
1686                                 'uploader':     u'NA',
1687                                 'upload_date':  u'NA',
1688                                 'title':        video_title,
1689                                 'stitle':       simple_title,
1690                                 'ext':          video_extension.decode('utf-8'),
1691                                 'format':       u'NA',
1692                                 'player_url':   None,
1693                         })
1694                 except UnavailableVideoError:
1695                         self._downloader.trouble(u'\nERROR: unable to download video')
1696
1697
1698 class PhotobucketIE(InfoExtractor):
1699         """Information extractor for photobucket.com."""
1700
1701         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1702         IE_NAME = u'photobucket'
1703
1704         def __init__(self, downloader=None):
1705                 InfoExtractor.__init__(self, downloader)
1706
1707         def report_download_webpage(self, video_id):
1708                 """Report webpage download."""
1709                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1710
1711         def report_extraction(self, video_id):
1712                 """Report information extraction."""
1713                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1714
1715         def _real_initialize(self):
1716                 return
1717
1718         def _real_extract(self, url):
1719                 # Extract id from URL
1720                 mobj = re.match(self._VALID_URL, url)
1721                 if mobj is None:
1722                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1723                         return
1724
1725                 # At this point we have a new video
1726                 self._downloader.increment_downloads()
1727                 video_id = mobj.group(1)
1728
1729                 video_extension = 'flv'
1730
1731                 # Retrieve video webpage to extract further information
1732                 request = urllib2.Request(url)
1733                 try:
1734                         self.report_download_webpage(video_id)
1735                         webpage = urllib2.urlopen(request).read()
1736                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1737                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1738                         return
1739
1740                 # Extract URL, uploader, and title from webpage
1741                 self.report_extraction(video_id)
1742                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1743                 if mobj is None:
1744                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1745                         return
1746                 mediaURL = urllib.unquote(mobj.group(1))
1747
1748                 video_url = mediaURL
1749
1750                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1751                 if mobj is None:
1752                         self._downloader.trouble(u'ERROR: unable to extract title')
1753                         return
1754                 video_title = mobj.group(1).decode('utf-8')
1755                 video_title = sanitize_title(video_title)
1756                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1757
1758                 video_uploader = mobj.group(2).decode('utf-8')
1759
1760                 try:
1761                         # Process video information
1762                         self._downloader.process_info({
1763                                 'id':           video_id.decode('utf-8'),
1764                                 'url':          video_url.decode('utf-8'),
1765                                 'uploader':     video_uploader,
1766                                 'upload_date':  u'NA',
1767                                 'title':        video_title,
1768                                 'stitle':       simple_title,
1769                                 'ext':          video_extension.decode('utf-8'),
1770                                 'format':       u'NA',
1771                                 'player_url':   None,
1772                         })
1773                 except UnavailableVideoError:
1774                         self._downloader.trouble(u'\nERROR: unable to download video')
1775
1776
1777 class YahooIE(InfoExtractor):
1778         """Information extractor for video.yahoo.com."""
1779
1780         # _VALID_URL matches all Yahoo! Video URLs
1781         # _VPAGE_URL matches only the extractable '/watch/' URLs
1782         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1783         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1784         IE_NAME = u'video.yahoo'
1785
1786         def __init__(self, downloader=None):
1787                 InfoExtractor.__init__(self, downloader)
1788
1789         def report_download_webpage(self, video_id):
1790                 """Report webpage download."""
1791                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1792
1793         def report_extraction(self, video_id):
1794                 """Report information extraction."""
1795                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1796
1797         def _real_initialize(self):
1798                 return
1799
1800         def _real_extract(self, url, new_video=True):
1801                 # Extract ID from URL
1802                 mobj = re.match(self._VALID_URL, url)
1803                 if mobj is None:
1804                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1805                         return
1806
1807                 # At this point we have a new video
1808                 self._downloader.increment_downloads()
1809                 video_id = mobj.group(2)
1810                 video_extension = 'flv'
1811
1812                 # Rewrite valid but non-extractable URLs as
1813                 # extractable English language /watch/ URLs
1814                 if re.match(self._VPAGE_URL, url) is None:
1815                         request = urllib2.Request(url)
1816                         try:
1817                                 webpage = urllib2.urlopen(request).read()
1818                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1819                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1820                                 return
1821
1822                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1823                         if mobj is None:
1824                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1825                                 return
1826                         yahoo_id = mobj.group(1)
1827
1828                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1829                         if mobj is None:
1830                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1831                                 return
1832                         yahoo_vid = mobj.group(1)
1833
1834                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1835                         return self._real_extract(url, new_video=False)
1836
1837                 # Retrieve video webpage to extract further information
1838                 request = urllib2.Request(url)
1839                 try:
1840                         self.report_download_webpage(video_id)
1841                         webpage = urllib2.urlopen(request).read()
1842                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1843                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1844                         return
1845
1846                 # Extract uploader and title from webpage
1847                 self.report_extraction(video_id)
1848                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1849                 if mobj is None:
1850                         self._downloader.trouble(u'ERROR: unable to extract video title')
1851                         return
1852                 video_title = mobj.group(1).decode('utf-8')
1853                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1854
1855                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1856                 if mobj is None:
1857                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1858                         return
1859                 video_uploader = mobj.group(1).decode('utf-8')
1860
1861                 # Extract video thumbnail
1862                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1863                 if mobj is None:
1864                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1865                         return
1866                 video_thumbnail = mobj.group(1).decode('utf-8')
1867
1868                 # Extract video description
1869                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1870                 if mobj is None:
1871                         self._downloader.trouble(u'ERROR: unable to extract video description')
1872                         return
1873                 video_description = mobj.group(1).decode('utf-8')
1874                 if not video_description:
1875                         video_description = 'No description available.'
1876
1877                 # Extract video height and width
1878                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1879                 if mobj is None:
1880                         self._downloader.trouble(u'ERROR: unable to extract video height')
1881                         return
1882                 yv_video_height = mobj.group(1)
1883
1884                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1885                 if mobj is None:
1886                         self._downloader.trouble(u'ERROR: unable to extract video width')
1887                         return
1888                 yv_video_width = mobj.group(1)
1889
1890                 # Retrieve video playlist to extract media URL
1891                 # I'm not completely sure what all these options are, but we
1892                 # seem to need most of them, otherwise the server sends a 401.
1893                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1894                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1895                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1896                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1897                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1898                 try:
1899                         self.report_download_webpage(video_id)
1900                         webpage = urllib2.urlopen(request).read()
1901                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1902                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1903                         return
1904
1905                 # Extract media URL from playlist XML
1906                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1907                 if mobj is None:
1908                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1909                         return
1910                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1911                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1912
1913                 try:
1914                         # Process video information
1915                         self._downloader.process_info({
1916                                 'id':           video_id.decode('utf-8'),
1917                                 'url':          video_url,
1918                                 'uploader':     video_uploader,
1919                                 'upload_date':  u'NA',
1920                                 'title':        video_title,
1921                                 'stitle':       simple_title,
1922                                 'ext':          video_extension.decode('utf-8'),
1923                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1924                                 'description':  video_description,
1925                                 'thumbnail':    video_thumbnail,
1926                                 'player_url':   None,
1927                         })
1928                 except UnavailableVideoError:
1929                         self._downloader.trouble(u'\nERROR: unable to download video')
1930
1931
1932 class VimeoIE(InfoExtractor):
1933         """Information extractor for vimeo.com."""
1934
1935         # _VALID_URL matches Vimeo URLs
1936         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1937         IE_NAME = u'vimeo'
1938
1939         def __init__(self, downloader=None):
1940                 InfoExtractor.__init__(self, downloader)
1941
1942         def report_download_webpage(self, video_id):
1943                 """Report webpage download."""
1944                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1945
1946         def report_extraction(self, video_id):
1947                 """Report information extraction."""
1948                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1949
1950         def _real_initialize(self):
1951                 return
1952
1953         def _real_extract(self, url, new_video=True):
1954                 # Extract ID from URL
1955                 mobj = re.match(self._VALID_URL, url)
1956                 if mobj is None:
1957                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1958                         return
1959
1960                 # At this point we have a new video
1961                 self._downloader.increment_downloads()
1962                 video_id = mobj.group(1)
1963
1964                 # Retrieve video webpage to extract further information
1965                 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1966                 try:
1967                         self.report_download_webpage(video_id)
1968                         webpage = urllib2.urlopen(request).read()
1969                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1970                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1971                         return
1972
1973                 # Now we begin extracting as much information as we can from what we
1974                 # retrieved. First we extract the information common to all extractors,
1975                 # and latter we extract those that are Vimeo specific.
1976                 self.report_extraction(video_id)
1977
1978                 # Extract title
1979                 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
1980                 if mobj is None:
1981                         self._downloader.trouble(u'ERROR: unable to extract video title')
1982                         return
1983                 video_title = mobj.group(1).decode('utf-8')
1984                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1985
1986                 # Extract uploader
1987                 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
1988                 if mobj is None:
1989                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1990                         return
1991                 video_uploader = mobj.group(1).decode('utf-8')
1992
1993                 # Extract video thumbnail
1994                 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
1995                 if mobj is None:
1996                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1997                         return
1998                 video_thumbnail = mobj.group(1).decode('utf-8')
1999
2000                 # # Extract video description
2001                 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2002                 # if mobj is None:
2003                 #       self._downloader.trouble(u'ERROR: unable to extract video description')
2004                 #       return
2005                 # video_description = mobj.group(1).decode('utf-8')
2006                 # if not video_description: video_description = 'No description available.'
2007                 video_description = 'Foo.'
2008
2009                 # Vimeo specific: extract request signature
2010                 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2011                 if mobj is None:
2012                         self._downloader.trouble(u'ERROR: unable to extract request signature')
2013                         return
2014                 sig = mobj.group(1).decode('utf-8')
2015
2016                 # Vimeo specific: Extract request signature expiration
2017                 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2018                 if mobj is None:
2019                         self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2020                         return
2021                 sig_exp = mobj.group(1).decode('utf-8')
2022
2023                 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2024
2025                 try:
2026                         # Process video information
2027                         self._downloader.process_info({
2028                                 'id':           video_id.decode('utf-8'),
2029                                 'url':          video_url,
2030                                 'uploader':     video_uploader,
2031                                 'upload_date':  u'NA',
2032                                 'title':        video_title,
2033                                 'stitle':       simple_title,
2034                                 'ext':          u'mp4',
2035                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2036                                 'description':  video_description,
2037                                 'thumbnail':    video_thumbnail,
2038                                 'description':  video_description,
2039                                 'player_url':   None,
2040                         })
2041                 except UnavailableVideoError:
2042                         self._downloader.trouble(u'ERROR: unable to download video')
2043
2044
2045 class GenericIE(InfoExtractor):
2046         """Generic last-resort information extractor."""
2047
2048         _VALID_URL = r'.*'
2049         IE_NAME = u'generic'
2050
2051         def __init__(self, downloader=None):
2052                 InfoExtractor.__init__(self, downloader)
2053
2054         def report_download_webpage(self, video_id):
2055                 """Report webpage download."""
2056                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2057                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2058
2059         def report_extraction(self, video_id):
2060                 """Report information extraction."""
2061                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2062
2063         def _real_initialize(self):
2064                 return
2065
2066         def _real_extract(self, url):
2067                 # At this point we have a new video
2068                 self._downloader.increment_downloads()
2069
2070                 video_id = url.split('/')[-1]
2071                 request = urllib2.Request(url)
2072                 try:
2073                         self.report_download_webpage(video_id)
2074                         webpage = urllib2.urlopen(request).read()
2075                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2076                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2077                         return
2078                 except ValueError, err:
2079                         # since this is the last-resort InfoExtractor, if
2080                         # this error is thrown, it'll be thrown here
2081                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2082                         return
2083
2084                 self.report_extraction(video_id)
2085                 # Start with something easy: JW Player in SWFObject
2086                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2087                 if mobj is None:
2088                         # Broaden the search a little bit
2089                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2090                 if mobj is None:
2091                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2092                         return
2093
2094                 # It's possible that one of the regexes
2095                 # matched, but returned an empty group:
2096                 if mobj.group(1) is None:
2097                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2098                         return
2099
2100                 video_url = urllib.unquote(mobj.group(1))
2101                 video_id = os.path.basename(video_url)
2102
2103                 # here's a fun little line of code for you:
2104                 video_extension = os.path.splitext(video_id)[1][1:]
2105                 video_id = os.path.splitext(video_id)[0]
2106
2107                 # it's tempting to parse this further, but you would
2108                 # have to take into account all the variations like
2109                 #   Video Title - Site Name
2110                 #   Site Name | Video Title
2111                 #   Video Title - Tagline | Site Name
2112                 # and so on and so forth; it's just not practical
2113                 mobj = re.search(r'<title>(.*)</title>', webpage)
2114                 if mobj is None:
2115                         self._downloader.trouble(u'ERROR: unable to extract title')
2116                         return
2117                 video_title = mobj.group(1).decode('utf-8')
2118                 video_title = sanitize_title(video_title)
2119                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2120
2121                 # video uploader is domain name
2122                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2123                 if mobj is None:
2124                         self._downloader.trouble(u'ERROR: unable to extract title')
2125                         return
2126                 video_uploader = mobj.group(1).decode('utf-8')
2127
2128                 try:
2129                         # Process video information
2130                         self._downloader.process_info({
2131                                 'id':           video_id.decode('utf-8'),
2132                                 'url':          video_url.decode('utf-8'),
2133                                 'uploader':     video_uploader,
2134                                 'upload_date':  u'NA',
2135                                 'title':        video_title,
2136                                 'stitle':       simple_title,
2137                                 'ext':          video_extension.decode('utf-8'),
2138                                 'format':       u'NA',
2139                                 'player_url':   None,
2140                         })
2141                 except UnavailableVideoError, err:
2142                         self._downloader.trouble(u'\nERROR: unable to download video')
2143
2144
2145 class YoutubeSearchIE(InfoExtractor):
2146         """Information Extractor for YouTube search queries."""
2147         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2148         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2149         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2150         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2151         _youtube_ie = None
2152         _max_youtube_results = 1000
2153         IE_NAME = u'youtube:search'
2154
2155         def __init__(self, youtube_ie, downloader=None):
2156                 InfoExtractor.__init__(self, downloader)
2157                 self._youtube_ie = youtube_ie
2158
2159         def report_download_page(self, query, pagenum):
2160                 """Report attempt to download playlist page with given number."""
2161                 query = query.decode(preferredencoding())
2162                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2163
2164         def _real_initialize(self):
2165                 self._youtube_ie.initialize()
2166
2167         def _real_extract(self, query):
2168                 mobj = re.match(self._VALID_URL, query)
2169                 if mobj is None:
2170                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2171                         return
2172
2173                 prefix, query = query.split(':')
2174                 prefix = prefix[8:]
2175                 query = query.encode('utf-8')
2176                 if prefix == '':
2177                         self._download_n_results(query, 1)
2178                         return
2179                 elif prefix == 'all':
2180                         self._download_n_results(query, self._max_youtube_results)
2181                         return
2182                 else:
2183                         try:
2184                                 n = long(prefix)
2185                                 if n <= 0:
2186                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2187                                         return
2188                                 elif n > self._max_youtube_results:
2189                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2190                                         n = self._max_youtube_results
2191                                 self._download_n_results(query, n)
2192                                 return
2193                         except ValueError: # parsing prefix as integer fails
2194                                 self._download_n_results(query, 1)
2195                                 return
2196
2197         def _download_n_results(self, query, n):
2198                 """Downloads a specified number of results for a query"""
2199
2200                 video_ids = []
2201                 already_seen = set()
2202                 pagenum = 1
2203
2204                 while True:
2205                         self.report_download_page(query, pagenum)
2206                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2207                         request = urllib2.Request(result_url)
2208                         try:
2209                                 page = urllib2.urlopen(request).read()
2210                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2211                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2212                                 return
2213
2214                         # Extract video identifiers
2215                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2216                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2217                                 if video_id not in already_seen:
2218                                         video_ids.append(video_id)
2219                                         already_seen.add(video_id)
2220                                         if len(video_ids) == n:
2221                                                 # Specified n videos reached
2222                                                 for id in video_ids:
2223                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2224                                                 return
2225
2226                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2227                                 for id in video_ids:
2228                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2229                                 return
2230
2231                         pagenum = pagenum + 1
2232
2233
2234 class GoogleSearchIE(InfoExtractor):
2235         """Information Extractor for Google Video search queries."""
2236         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2237         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2238         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2239         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2240         _google_ie = None
2241         _max_google_results = 1000
2242         IE_NAME = u'video.google:search'
2243
2244         def __init__(self, google_ie, downloader=None):
2245                 InfoExtractor.__init__(self, downloader)
2246                 self._google_ie = google_ie
2247
2248         def report_download_page(self, query, pagenum):
2249                 """Report attempt to download playlist page with given number."""
2250                 query = query.decode(preferredencoding())
2251                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2252
2253         def _real_initialize(self):
2254                 self._google_ie.initialize()
2255
2256         def _real_extract(self, query):
2257                 mobj = re.match(self._VALID_URL, query)
2258                 if mobj is None:
2259                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2260                         return
2261
2262                 prefix, query = query.split(':')
2263                 prefix = prefix[8:]
2264                 query = query.encode('utf-8')
2265                 if prefix == '':
2266                         self._download_n_results(query, 1)
2267                         return
2268                 elif prefix == 'all':
2269                         self._download_n_results(query, self._max_google_results)
2270                         return
2271                 else:
2272                         try:
2273                                 n = long(prefix)
2274                                 if n <= 0:
2275                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2276                                         return
2277                                 elif n > self._max_google_results:
2278                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2279                                         n = self._max_google_results
2280                                 self._download_n_results(query, n)
2281                                 return
2282                         except ValueError: # parsing prefix as integer fails
2283                                 self._download_n_results(query, 1)
2284                                 return
2285
2286         def _download_n_results(self, query, n):
2287                 """Downloads a specified number of results for a query"""
2288
2289                 video_ids = []
2290                 already_seen = set()
2291                 pagenum = 1
2292
2293                 while True:
2294                         self.report_download_page(query, pagenum)
2295                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2296                         request = urllib2.Request(result_url)
2297                         try:
2298                                 page = urllib2.urlopen(request).read()
2299                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2300                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2301                                 return
2302
2303                         # Extract video identifiers
2304                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2305                                 video_id = mobj.group(1)
2306                                 if video_id not in already_seen:
2307                                         video_ids.append(video_id)
2308                                         already_seen.add(video_id)
2309                                         if len(video_ids) == n:
2310                                                 # Specified n videos reached
2311                                                 for id in video_ids:
2312                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2313                                                 return
2314
2315                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2316                                 for id in video_ids:
2317                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2318                                 return
2319
2320                         pagenum = pagenum + 1
2321
2322
2323 class YahooSearchIE(InfoExtractor):
2324         """Information Extractor for Yahoo! Video search queries."""
2325         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2326         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2327         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2328         _MORE_PAGES_INDICATOR = r'\s*Next'
2329         _yahoo_ie = None
2330         _max_yahoo_results = 1000
2331         IE_NAME = u'video.yahoo:search'
2332
2333         def __init__(self, yahoo_ie, downloader=None):
2334                 InfoExtractor.__init__(self, downloader)
2335                 self._yahoo_ie = yahoo_ie
2336
2337         def report_download_page(self, query, pagenum):
2338                 """Report attempt to download playlist page with given number."""
2339                 query = query.decode(preferredencoding())
2340                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2341
2342         def _real_initialize(self):
2343                 self._yahoo_ie.initialize()
2344
2345         def _real_extract(self, query):
2346                 mobj = re.match(self._VALID_URL, query)
2347                 if mobj is None:
2348                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2349                         return
2350
2351                 prefix, query = query.split(':')
2352                 prefix = prefix[8:]
2353                 query = query.encode('utf-8')
2354                 if prefix == '':
2355                         self._download_n_results(query, 1)
2356                         return
2357                 elif prefix == 'all':
2358                         self._download_n_results(query, self._max_yahoo_results)
2359                         return
2360                 else:
2361                         try:
2362                                 n = long(prefix)
2363                                 if n <= 0:
2364                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2365                                         return
2366                                 elif n > self._max_yahoo_results:
2367                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2368                                         n = self._max_yahoo_results
2369                                 self._download_n_results(query, n)
2370                                 return
2371                         except ValueError: # parsing prefix as integer fails
2372                                 self._download_n_results(query, 1)
2373                                 return
2374
	def _download_n_results(self, query, n):
		"""Download up to n video results for a Yahoo Video search query.

		Walks the search result pages one by one, collecting unique video
		ids, and hands each collected id to the Yahoo IE for extraction.
		Stops as soon as n unique ids have been gathered or when no
		further result pages exist (in which case fewer than n videos
		may be extracted).
		"""

		video_ids = []
		already_seen = set()	# guards against duplicate ids repeated across pages
		pagenum = 1

		while True:
			self.report_download_page(query, pagenum)
			result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
			request = urllib2.Request(result_url)
			try:
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
				return

			# Extract video identifiers
			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				video_id = mobj.group(1)
				if video_id not in already_seen:
					video_ids.append(video_id)
					already_seen.add(video_id)
					if len(video_ids) == n:
						# Specified n videos reached: extract and stop paging
						for id in video_ids:
							self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
						return

			# No "more pages" marker: this was the last result page, so
			# extract whatever was collected (possibly fewer than n ids).
			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
				for id in video_ids:
					self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
				return

			pagenum = pagenum + 1
2410
2411
2412 class YoutubePlaylistIE(InfoExtractor):
2413         """Information Extractor for YouTube playlists."""
2414
2415         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2416         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2417         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2418         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2419         _youtube_ie = None
2420         IE_NAME = u'youtube:playlist'
2421
2422         def __init__(self, youtube_ie, downloader=None):
2423                 InfoExtractor.__init__(self, downloader)
2424                 self._youtube_ie = youtube_ie
2425
2426         def report_download_page(self, playlist_id, pagenum):
2427                 """Report attempt to download playlist page with given number."""
2428                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2429
2430         def _real_initialize(self):
2431                 self._youtube_ie.initialize()
2432
2433         def _real_extract(self, url):
2434                 # Extract playlist id
2435                 mobj = re.match(self._VALID_URL, url)
2436                 if mobj is None:
2437                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2438                         return
2439
2440                 # Single video case
2441                 if mobj.group(3) is not None:
2442                         self._youtube_ie.extract(mobj.group(3))
2443                         return
2444
2445                 # Download playlist pages
2446                 # prefix is 'p' as default for playlists but there are other types that need extra care
2447                 playlist_prefix = mobj.group(1)
2448                 if playlist_prefix == 'a':
2449                         playlist_access = 'artist'
2450                 else:
2451                         playlist_prefix = 'p'
2452                         playlist_access = 'view_play_list'
2453                 playlist_id = mobj.group(2)
2454                 video_ids = []
2455                 pagenum = 1
2456
2457                 while True:
2458                         self.report_download_page(playlist_id, pagenum)
2459                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2460                         try:
2461                                 page = urllib2.urlopen(request).read()
2462                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2463                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2464                                 return
2465
2466                         # Extract video identifiers
2467                         ids_in_page = []
2468                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2469                                 if mobj.group(1) not in ids_in_page:
2470                                         ids_in_page.append(mobj.group(1))
2471                         video_ids.extend(ids_in_page)
2472
2473                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2474                                 break
2475                         pagenum = pagenum + 1
2476
2477                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2478                 playlistend = self._downloader.params.get('playlistend', -1)
2479                 video_ids = video_ids[playliststart:playlistend]
2480
2481                 for id in video_ids:
2482                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2483                 return
2484
2485
class YoutubeUserIE(InfoExtractor):
	"""Information Extractor for YouTube users.

	Accepts youtube.com/user/NAME URLs and the ytuser:NAME shorthand.
	Collects every upload of the user through the GData API and delegates
	each video to the regular YouTube IE.
	"""

	_VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
	_TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
	# Maximum number of results the GData API returns per request
	_GDATA_PAGE_SIZE = 50
	_GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
	_VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
	_youtube_ie = None
	IE_NAME = u'youtube:user'

	def __init__(self, youtube_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	def report_download_page(self, username, start_index):
		"""Report attempt to download user page."""
		self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
				(username, start_index, start_index + self._GDATA_PAGE_SIZE))

	def _real_initialize(self):
		self._youtube_ie.initialize()

	def _real_extract(self, url):
		# Extract username
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
			return

		username = mobj.group(1)

		# Download video ids using YouTube Data API. Result size per
		# query is limited (currently to 50 videos) so we need to query
		# page by page until there are no video ids - it means we got
		# all of them.

		video_ids = []
		pagenum = 0

		while True:
			# GData uses 1-based start indexes
			start_index = pagenum * self._GDATA_PAGE_SIZE + 1
			self.report_download_page(username, start_index)

			request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

			try:
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
				return

			# Extract video identifiers (deduplicated within the page)
			ids_in_page = []

			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				if mobj.group(1) not in ids_in_page:
					ids_in_page.append(mobj.group(1))

			video_ids.extend(ids_in_page)

			# A little optimization - if current page is not
			# "full", ie. does not contain PAGE_SIZE video ids then
			# we can assume that this page is the last one - there
			# are no more ids on further pages - no need to query
			# again.

			if len(ids_in_page) < self._GDATA_PAGE_SIZE:
				break

			pagenum += 1

		all_ids_count = len(video_ids)
		# Apply --playlist-start/--playlist-end; playlistend == -1 is the
		# sentinel for "until the end" and must not be used as a slice bound.
		playliststart = self._downloader.params.get('playliststart', 1) - 1
		playlistend = self._downloader.params.get('playlistend', -1)

		if playlistend == -1:
			video_ids = video_ids[playliststart:]
		else:
			video_ids = video_ids[playliststart:playlistend]

		self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
				(username, all_ids_count, len(video_ids)))

		for video_id in video_ids:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2572
2573
2574 class DepositFilesIE(InfoExtractor):
2575         """Information extractor for depositfiles.com"""
2576
2577         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2578         IE_NAME = u'DepositFiles'
2579
2580         def __init__(self, downloader=None):
2581                 InfoExtractor.__init__(self, downloader)
2582
2583         def report_download_webpage(self, file_id):
2584                 """Report webpage download."""
2585                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2586
2587         def report_extraction(self, file_id):
2588                 """Report information extraction."""
2589                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2590
2591         def _real_initialize(self):
2592                 return
2593
2594         def _real_extract(self, url):
2595                 # At this point we have a new file
2596                 self._downloader.increment_downloads()
2597
2598                 file_id = url.split('/')[-1]
2599                 # Rebuild url in english locale
2600                 url = 'http://depositfiles.com/en/files/' + file_id
2601
2602                 # Retrieve file webpage with 'Free download' button pressed
2603                 free_download_indication = { 'gateway_result' : '1' }
2604                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2605                 try:
2606                         self.report_download_webpage(file_id)
2607                         webpage = urllib2.urlopen(request).read()
2608                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2609                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2610                         return
2611
2612                 # Search for the real file URL
2613                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2614                 if (mobj is None) or (mobj.group(1) is None):
2615                         # Try to figure out reason of the error.
2616                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2617                         if (mobj is not None) and (mobj.group(1) is not None):
2618                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2619                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2620                         else:
2621                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2622                         return
2623
2624                 file_url = mobj.group(1)
2625                 file_extension = os.path.splitext(file_url)[1][1:]
2626
2627                 # Search for file title
2628                 mobj = re.search(r'<b title="(.*?)">', webpage)
2629                 if mobj is None:
2630                         self._downloader.trouble(u'ERROR: unable to extract title')
2631                         return
2632                 file_title = mobj.group(1).decode('utf-8')
2633
2634                 try:
2635                         # Process file information
2636                         self._downloader.process_info({
2637                                 'id':           file_id.decode('utf-8'),
2638                                 'url':          file_url.decode('utf-8'),
2639                                 'uploader':     u'NA',
2640                                 'upload_date':  u'NA',
2641                                 'title':        file_title,
2642                                 'stitle':       file_title,
2643                                 'ext':          file_extension.decode('utf-8'),
2644                                 'format':       u'NA',
2645                                 'player_url':   None,
2646                         })
2647                 except UnavailableVideoError, err:
2648                         self._downloader.trouble(u'ERROR: unable to download file')
2649
2650
2651 class FacebookIE(InfoExtractor):
2652         """Information Extractor for Facebook"""
2653
2654         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2655         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2656         _NETRC_MACHINE = 'facebook'
2657         _available_formats = ['highqual', 'lowqual']
2658         _video_extensions = {
2659                 'highqual': 'mp4',
2660                 'lowqual': 'mp4',
2661         }
2662         IE_NAME = u'facebook'
2663
2664         def __init__(self, downloader=None):
2665                 InfoExtractor.__init__(self, downloader)
2666
2667         def _reporter(self, message):
2668                 """Add header and report message."""
2669                 self._downloader.to_screen(u'[facebook] %s' % message)
2670
2671         def report_login(self):
2672                 """Report attempt to log in."""
2673                 self._reporter(u'Logging in')
2674
2675         def report_video_webpage_download(self, video_id):
2676                 """Report attempt to download video webpage."""
2677                 self._reporter(u'%s: Downloading video webpage' % video_id)
2678
2679         def report_information_extraction(self, video_id):
2680                 """Report attempt to extract video information."""
2681                 self._reporter(u'%s: Extracting video information' % video_id)
2682
2683         def _parse_page(self, video_webpage):
2684                 """Extract video information from page"""
2685                 # General data
2686                 data = {'title': r'class="video_title datawrap">(.*?)</',
2687                         'description': r'<div class="datawrap">(.*?)</div>',
2688                         'owner': r'\("video_owner_name", "(.*?)"\)',
2689                         'upload_date': r'data-date="(.*?)"',
2690                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2691                         }
2692                 video_info = {}
2693                 for piece in data.keys():
2694                         mobj = re.search(data[piece], video_webpage)
2695                         if mobj is not None:
2696                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2697
2698                 # Video urls
2699                 video_urls = {}
2700                 for fmt in self._available_formats:
2701                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2702                         if mobj is not None:
2703                                 # URL is in a Javascript segment inside an escaped Unicode format within
2704                                 # the generally utf-8 page
2705                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2706                 video_info['video_urls'] = video_urls
2707
2708                 return video_info
2709
2710         def _real_initialize(self):
2711                 if self._downloader is None:
2712                         return
2713
2714                 useremail = None
2715                 password = None
2716                 downloader_params = self._downloader.params
2717
2718                 # Attempt to use provided username and password or .netrc data
2719                 if downloader_params.get('username', None) is not None:
2720                         useremail = downloader_params['username']
2721                         password = downloader_params['password']
2722                 elif downloader_params.get('usenetrc', False):
2723                         try:
2724                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2725                                 if info is not None:
2726                                         useremail = info[0]
2727                                         password = info[2]
2728                                 else:
2729                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2730                         except (IOError, netrc.NetrcParseError), err:
2731                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2732                                 return
2733
2734                 if useremail is None:
2735                         return
2736
2737                 # Log in
2738                 login_form = {
2739                         'email': useremail,
2740                         'pass': password,
2741                         'login': 'Log+In'
2742                         }
2743                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2744                 try:
2745                         self.report_login()
2746                         login_results = urllib2.urlopen(request).read()
2747                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2748                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2749                                 return
2750                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2751                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2752                         return
2753
2754         def _real_extract(self, url):
2755                 mobj = re.match(self._VALID_URL, url)
2756                 if mobj is None:
2757                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2758                         return
2759                 video_id = mobj.group('ID')
2760
2761                 # Get video webpage
2762                 self.report_video_webpage_download(video_id)
2763                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2764                 try:
2765                         page = urllib2.urlopen(request)
2766                         video_webpage = page.read()
2767                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2768                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2769                         return
2770
2771                 # Start extracting information
2772                 self.report_information_extraction(video_id)
2773
2774                 # Extract information
2775                 video_info = self._parse_page(video_webpage)
2776
2777                 # uploader
2778                 if 'owner' not in video_info:
2779                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2780                         return
2781                 video_uploader = video_info['owner']
2782
2783                 # title
2784                 if 'title' not in video_info:
2785                         self._downloader.trouble(u'ERROR: unable to extract video title')
2786                         return
2787                 video_title = video_info['title']
2788                 video_title = video_title.decode('utf-8')
2789                 video_title = sanitize_title(video_title)
2790
2791                 # simplified title
2792                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2793                 simple_title = simple_title.strip(ur'_')
2794
2795                 # thumbnail image
2796                 if 'thumbnail' not in video_info:
2797                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2798                         video_thumbnail = ''
2799                 else:
2800                         video_thumbnail = video_info['thumbnail']
2801
2802                 # upload date
2803                 upload_date = u'NA'
2804                 if 'upload_date' in video_info:
2805                         upload_time = video_info['upload_date']
2806                         timetuple = email.utils.parsedate_tz(upload_time)
2807                         if timetuple is not None:
2808                                 try:
2809                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2810                                 except:
2811                                         pass
2812
2813                 # description
2814                 video_description = video_info.get('description', 'No description available.')
2815
2816                 url_map = video_info['video_urls']
2817                 if len(url_map.keys()) > 0:
2818                         # Decide which formats to download
2819                         req_format = self._downloader.params.get('format', None)
2820                         format_limit = self._downloader.params.get('format_limit', None)
2821
2822                         if format_limit is not None and format_limit in self._available_formats:
2823                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2824                         else:
2825                                 format_list = self._available_formats
2826                         existing_formats = [x for x in format_list if x in url_map]
2827                         if len(existing_formats) == 0:
2828                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2829                                 return
2830                         if req_format is None:
2831                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2832                         elif req_format == '-1':
2833                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2834                         else:
2835                                 # Specific format
2836                                 if req_format not in url_map:
2837                                         self._downloader.trouble(u'ERROR: requested format not available')
2838                                         return
2839                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2840
2841                 for format_param, video_real_url in video_url_list:
2842
2843                         # At this point we have a new video
2844                         self._downloader.increment_downloads()
2845
2846                         # Extension
2847                         video_extension = self._video_extensions.get(format_param, 'mp4')
2848
2849                         try:
2850                                 # Process video information
2851                                 self._downloader.process_info({
2852                                         'id':           video_id.decode('utf-8'),
2853                                         'url':          video_real_url.decode('utf-8'),
2854                                         'uploader':     video_uploader.decode('utf-8'),
2855                                         'upload_date':  upload_date,
2856                                         'title':        video_title,
2857                                         'stitle':       simple_title,
2858                                         'ext':          video_extension.decode('utf-8'),
2859                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2860                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2861                                         'description':  video_description.decode('utf-8'),
2862                                         'player_url':   None,
2863                                 })
2864                         except UnavailableVideoError, err:
2865                                 self._downloader.trouble(u'\nERROR: unable to download video')
2866
class BlipTVIE(InfoExtractor):
	"""Information extractor for blip.tv

	Uses blip.tv's JSON API: the original page URL with
	'skin=json&version=2&no_wrap=1' appended returns the video metadata.
	"""

	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
	# Extracts the filename extension from the media URL
	_URL_EXT = r'^.*\.([a-z0-9]+)$'
	IE_NAME = u'blip.tv'

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)

	def _simplify_title(self, title):
		"""Return a filesystem-friendly version of title (word chars and underscores)."""
		res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
		res = res.strip(ur'_')
		return res

	def _real_extract(self, url):
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Append the JSON-API parameters with the correct separator
		if '?' in url:
			cchar = '&'
		else:
			cchar = '?'
		json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
		request = urllib2.Request(json_url)
		self.report_extraction(mobj.group(1))
		try:
			json_code = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
			return
		try:
			# NOTE(review): `json` is not among the imports visible at the top
			# of this file; presumably it is bound elsewhere in the file
			# (e.g. a simplejson fallback for old Pythons) - TODO confirm.
			json_data = json.loads(json_code)
			# The payload is either wrapped in a 'Post' object or given directly
			if 'Post' in json_data:
				data = json_data['Post']
			else:
				data = json_data

			# e.g. '08-25-11 12:30PM' -> '20110825'
			upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
			video_url = data['media']['url']
			umobj = re.match(self._URL_EXT, video_url)
			if umobj is None:
				raise ValueError('Can not determine filename extension')
			ext = umobj.group(1)

			self._downloader.increment_downloads()

			info = {
				'id': data['item_id'],
				'url': video_url,
				'uploader': data['display_name'],
				'upload_date': upload_date,
				'title': data['title'],
				'stitle': self._simplify_title(data['title']),
				'ext': ext,
				'format': data['media']['mimeType'],
				'thumbnail': data['thumbnailUrl'],
				'description': data['description'],
				'player_url': data['embedUrl']
			}
		except (ValueError,KeyError), err:
			# Covers both malformed JSON/dates and missing metadata fields
			self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
			return

		try:
			self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download video')
2938
2939
2940 class MyVideoIE(InfoExtractor):
2941         """Information Extractor for myvideo.de."""
2942
2943         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2944         IE_NAME = u'myvideo'
2945
2946         def __init__(self, downloader=None):
2947                 InfoExtractor.__init__(self, downloader)
2948         
2949         def report_download_webpage(self, video_id):
2950                 """Report webpage download."""
2951                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2952
2953         def report_extraction(self, video_id):
2954                 """Report information extraction."""
2955                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2956
2957         def _real_initialize(self):
2958                 return
2959
2960         def _real_extract(self,url):
2961                 mobj = re.match(self._VALID_URL, url)
2962                 if mobj is None:
2963                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
2964                         return
2965
2966                 video_id = mobj.group(1)
2967                 simple_title = mobj.group(2).decode('utf-8')
2968                 # should actually not be necessary
2969                 simple_title = sanitize_title(simple_title)
2970                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title)
2971
2972                 # Get video webpage
2973                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2974                 try:
2975                         self.report_download_webpage(video_id)
2976                         webpage = urllib2.urlopen(request).read()
2977                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2978                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2979                         return
2980
2981                 self.report_extraction(video_id)
2982                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2983                                  webpage)
2984                 if mobj is None:
2985                         self._downloader.trouble(u'ERROR: unable to extract media URL')
2986                         return
2987                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2988
2989                 mobj = re.search('<title>([^<]+)</title>', webpage)
2990                 if mobj is None:
2991                         self._downloader.trouble(u'ERROR: unable to extract title')
2992                         return
2993
2994                 video_title = mobj.group(1)
2995                 video_title = sanitize_title(video_title)
2996
2997                 try:
2998                         print(video_url)
2999                         self._downloader.process_info({
3000                                 'id':           video_id,
3001                                 'url':          video_url,
3002                                 'uploader':     u'NA',
3003                                 'upload_date':  u'NA',
3004                                 'title':        video_title,
3005                                 'stitle':       simple_title,
3006                                 'ext':          u'flv',
3007                                 'format':       u'NA',
3008                                 'player_url':   None,
3009                         })
3010                 except UnavailableVideoError:
3011                         self._downloader.trouble(u'\nERROR: Unable to download video')
3012
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Accepts either a shortcut form (":tds", ":colbertreport", ...) or a
	# full-episodes URL on thedailyshow.com / colbertnation.com.
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	def report_extraction(self, episode_id):
		"""Report that information extraction has started."""
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

	def report_config_download(self, episode_id):
		"""Report that the per-item media configuration download has started."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		"""Report that the show index (MRSS feed) download has started."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		"""Report that the player URL is being resolved."""
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _simplify_title(self, title):
		# Collapse runs of characters outside simple_title_chars into '_'
		# and trim leading/trailing underscores.
		res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
		res = res.strip(ur'_')
		return res

	def _real_extract(self, url):
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Shortcut forms (":tds" etc.) are rewritten to the show's
		# full-episodes index page and re-matched.
		if mobj.group('shortname'):
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = 'http://www.thedailyshow.com/full-episodes/'
			else:
				url = 'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# No specific episode requested: download the newest one, which is
		# discovered through the HTTP redirect handled further below.
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			return
		if dlNewest:
			# The index page redirects to the newest episode; re-parse the
			# final URL to recover that episode's slug.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		# The Flash <param> carries both the raw player URL and the mtvn
		# URI identifying the episode's media items.
		mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
			return

		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			# Follow redirects so the canonical player URL is recorded.
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
			return

		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
			return

		# One <item> per act/segment of the episode; each is downloaded
		# separately.
		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
						urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
				return

			# Collect all available (bitrate, url) renditions for this item.
			cdoc = xml.etree.ElementTree.fromstring(configXml)
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			# For now, just pick the highest bitrate
			format,video_url = turls[-1]

			self._downloader.increment_downloads()

			effTitle = showId + '-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'stitle': self._simplify_title(effTitle),
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': playerUrl
			}

			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
				continue
3152
3153
3154 class EscapistIE(InfoExtractor):
3155         """Information extractor for The Escapist """
3156
3157         _VALID_URL = r'^(https?://)?(www\.)escapistmagazine.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?].*$'
3158         IE_NAME = u'escapist'
3159
3160         def report_extraction(self, showName):
3161                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3162
3163         def report_config_download(self, showName):
3164                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3165
3166         def _simplify_title(self, title):
3167                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3168                 res = res.strip(ur'_')
3169                 return res
3170
3171         def _real_extract(self, url):
3172                 htmlParser = HTMLParser.HTMLParser()
3173
3174                 mobj = re.match(self._VALID_URL, url)
3175                 if mobj is None:
3176                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3177                         return
3178                 showName = mobj.group('showname')
3179                 videoId = mobj.group('episode')
3180
3181                 self.report_extraction(showName)
3182                 try:
3183                         webPage = urllib2.urlopen(url).read()
3184                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3185                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3186                         return
3187
3188                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3189                 description = htmlParser.unescape(descMatch.group(1))
3190                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3191                 imgUrl = htmlParser.unescape(imgMatch.group(1))
3192                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3193                 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3194                 configUrlMatch = re.search('config=(.*)$', playerUrl)
3195                 configUrl = urllib2.unquote(configUrlMatch.group(1))
3196
3197                 self.report_config_download(showName)
3198                 try:
3199                         configJSON = urllib2.urlopen(configUrl).read()
3200                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3201                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3202                         return
3203
3204                 # Technically, it's JavaScript, not JSON
3205                 configJSON = configJSON.replace("'", '"')
3206
3207                 try:
3208                         config = json.loads(configJSON)
3209                 except (ValueError,), err:
3210                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3211                         return
3212
3213                 playlist = config['playlist']
3214                 videoUrl = playlist[1]['url']
3215
3216                 self._downloader.increment_downloads()
3217                 info = {
3218                         'id': videoId,
3219                         'url': videoUrl,
3220                         'uploader': showName,
3221                         'upload_date': None,
3222                         'title': showName,
3223                         'stitle': self._simplify_title(showName),
3224                         'ext': 'flv',
3225                         'format': 'flv',
3226                         'thumbnail': imgUrl,
3227                         'description': description,
3228                         'player_url': playerUrl,
3229                 }
3230
3231                 try:
3232                         self._downloader.process_info(info)
3233                 except UnavailableVideoError, err:
3234                         self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3235
3236
3237
class PostProcessor(object):
	"""Base class for post-processing steps.

	Instances are registered with a downloader through its
	add_post_processor() method. After every successful download the
	downloader walks its chain of PostProcessors, feeding the value
	returned by each run() call into the next one.

	The chain stops as soon as a run() returns None or there are no
	processors left.

	Like InfoExtractor objects, PostProcessors take part in a "mutual
	registration" scheme with their downloader.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach this post processor to the given downloader."""
		self._downloader = downloader

	def run(self, information):
		"""Execute this post-processing step.

		The "information" argument is a dictionary of the kind built by
		InfoExtractors, extended with a "filepath" key naming the
		downloaded file.

		Returning None stops the post-processing chain. Returning an
		information dictionary (possibly the received one with some
		fields changed) passes it on to the next processor in the chain.

		This method may also raise PostProcessingError, which the
		calling downloader takes into account.
		"""
		return information # the base class is a no-op
3283
3284
class FFmpegExtractAudioPP(PostProcessor):
	"""Post processor that extracts the audio track of a downloaded video
	into a standalone audio file using ffmpeg/ffprobe."""

	def __init__(self, downloader=None, preferredcodec=None):
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		# 'best', 'aac' or 'mp3' (see the --audio-format option).
		self._preferredcodec = preferredcodec

	@staticmethod
	def get_audio_codec(path):
		"""Return the audio codec name of the file at path via ffprobe,
		or None if it cannot be determined (e.g. ffprobe missing)."""
		try:
			cmd = ['ffprobe', '-show_streams', '--', path]
			# LEAK FIX: the devnull handle was previously never closed.
			devnull = file(os.path.devnull, 'w')
			try:
				handle = subprocess.Popen(cmd, stderr=devnull, stdout=subprocess.PIPE)
				output = handle.communicate()[0]
				if handle.wait() != 0:
					return None
			finally:
				devnull.close()
		except (IOError, OSError):
			return None
		# ffprobe lists codec_name before codec_type within each stream
		# block, so remember the last codec_name seen.
		audio_codec = None
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Transcode path to out_path with the given audio codec and
		extra ffmpeg options; return True on success."""
		try:
			cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
			# LEAK FIX: the devnull handle was previously never closed.
			devnull = file(os.path.devnull, 'w')
			try:
				ret = subprocess.call(cmd, stdout=devnull, stderr=subprocess.STDOUT)
			finally:
				devnull.close()
			return (ret == 0)
		except (IOError, OSError):
			return False

	def run(self, information):
		"""Convert the downloaded file to audio, delete the original, and
		update 'filepath'; return None (stopping the chain) on failure."""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
			if filecodec == 'aac' or filecodec == 'mp3':
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					more_opts = ['-f', 'adts']
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = ['-ab', '128k']
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = ['-ab', '128k']
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']

		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		status = self.run_ffmpeg(path, new_path, acodec, more_opts)

		if not status:
			self._downloader.to_stderr(u'WARNING: error running ffmpeg')
			return None

		try:
			os.remove(path)
		except (IOError, OSError):
			self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
			return None

		information['filepath'] = new_path
		return information
3366
3367
3368 def updateSelf(downloader, filename):
3369         ''' Update the program file with the latest version from the repository '''
3370         # Note: downloader only used for options
3371         if not os.access(filename, os.W_OK):
3372                 sys.exit('ERROR: no write permissions on %s' % filename)
3373
3374         downloader.to_screen('Updating to latest version...')
3375
3376         try:
3377                 try:
3378                         urlh = urllib.urlopen(UPDATE_URL)
3379                         newcontent = urlh.read()
3380                 finally:
3381                         urlh.close()
3382         except (IOError, OSError), err:
3383                 sys.exit('ERROR: unable to download latest version')
3384
3385         try:
3386                 outf = open(filename, 'wb')
3387                 try:
3388                         outf.write(newcontent)
3389                 finally:
3390                         outf.close()
3391         except (IOError, OSError), err:
3392                 sys.exit('ERROR: unable to overwrite current version')
3393
3394         downloader.to_screen('Updated youtube-dl. Restart to use the new version.')
3395
def parseOpts():
	"""Build the optparse command-line parser, parse sys.argv and return
	a (parser, opts, args) tuple."""
	# Deferred imports
	import getpass
	import optparse

	def _format_option_string(option):
		''' ('-o', '--option') -> -o, --format METAVAR'''

		opts = []

		if option._short_opts: opts.append(option._short_opts[0])
		if option._long_opts: opts.append(option._long_opts[0])
		if len(opts) > 1: opts.insert(1, ', ')

		if option.takes_value(): opts.append(' %s' % option.metavar)

		return "".join(opts)

	def _find_term_columns():
		# Terminal width: try $COLUMNS first, then 'stty size'; returns
		# None if it cannot be determined.
		columns = os.environ.get('COLUMNS', None)
		if columns:
			return int(columns)

		try:
			sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			out,err = sp.communicate()
			return int(out.split()[1])
		except:
			pass
		return None

	max_width = 80
	max_help_position = 80

	# No need to wrap help messages if we're on a wide console
	columns = _find_term_columns()
	if columns: max_width = columns

	fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
	fmt.format_option_strings = _format_option_string

	kw = {
		'version'   : __version__,
		'formatter' : fmt,
		'usage' : '%prog [options] url [url...]',
		'conflict_handler' : 'resolve',
	}

	parser = optparse.OptionParser(**kw)

	# option groups
	general        = optparse.OptionGroup(parser, 'General Options')
	selection      = optparse.OptionGroup(parser, 'Video Selection')
	authentication = optparse.OptionGroup(parser, 'Authentication Options')
	video_format   = optparse.OptionGroup(parser, 'Video Format Options')
	postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
	filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
	verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

	general.add_option('-h', '--help',
			action='help', help='print this help text and exit')
	general.add_option('-v', '--version',
			action='version', help='print program version and exit')
	general.add_option('-U', '--update',
			action='store_true', dest='update_self', help='update this program to latest version')
	general.add_option('-i', '--ignore-errors',
			action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
	general.add_option('-r', '--rate-limit',
			dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
	general.add_option('-R', '--retries',
			dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
	general.add_option('--dump-user-agent',
			action='store_true', dest='dump_user_agent',
			help='display the current browser identification', default=False)
	general.add_option('--list-extractors',
			action='store_true', dest='list_extractors',
			help='List all supported extractors and the URLs they would handle', default=False)

	selection.add_option('--playlist-start',
			dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
	selection.add_option('--playlist-end',
			dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
	selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
	selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')

	authentication.add_option('-u', '--username',
			dest='username', metavar='USERNAME', help='account username')
	authentication.add_option('-p', '--password',
			dest='password', metavar='PASSWORD', help='account password')
	authentication.add_option('-n', '--netrc',
			action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)


	video_format.add_option('-f', '--format',
			action='store', dest='format', metavar='FORMAT', help='video format code')
	video_format.add_option('--all-formats',
			action='store_const', dest='format', help='download all available video formats', const='-1')
	video_format.add_option('--max-quality',
			action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')


	verbosity.add_option('-q', '--quiet',
			action='store_true', dest='quiet', help='activates quiet mode', default=False)
	verbosity.add_option('-s', '--simulate',
			action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
	verbosity.add_option('--skip-download',
			action='store_true', dest='skip_download', help='do not download the video', default=False)
	verbosity.add_option('-g', '--get-url',
			action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
	verbosity.add_option('-e', '--get-title',
			action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
	verbosity.add_option('--get-thumbnail',
			action='store_true', dest='getthumbnail',
			help='simulate, quiet but print thumbnail URL', default=False)
	verbosity.add_option('--get-description',
			action='store_true', dest='getdescription',
			help='simulate, quiet but print video description', default=False)
	verbosity.add_option('--get-filename',
			action='store_true', dest='getfilename',
			help='simulate, quiet but print output filename', default=False)
	verbosity.add_option('--no-progress',
			action='store_true', dest='noprogress', help='do not print progress bar', default=False)
	verbosity.add_option('--console-title',
			action='store_true', dest='consoletitle',
			help='display progress in console titlebar', default=False)


	filesystem.add_option('-t', '--title',
			action='store_true', dest='usetitle', help='use title in file name', default=False)
	filesystem.add_option('-l', '--literal',
			action='store_true', dest='useliteral', help='use literal title in file name', default=False)
	filesystem.add_option('-A', '--auto-number',
			action='store_true', dest='autonumber',
			help='number downloaded files starting from 00000', default=False)
	filesystem.add_option('-o', '--output',
			dest='outtmpl', metavar='TEMPLATE', help='output filename template')
	filesystem.add_option('-a', '--batch-file',
			dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
	filesystem.add_option('-w', '--no-overwrites',
			action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
	filesystem.add_option('-c', '--continue',
			action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
	filesystem.add_option('--cookies',
			dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
	filesystem.add_option('--no-part',
			action='store_true', dest='nopart', help='do not use .part files', default=False)
	filesystem.add_option('--no-mtime',
			action='store_false', dest='updatetime',
			help='do not use the Last-modified header to set the file modification time', default=True)
	filesystem.add_option('--write-description',
			action='store_true', dest='writedescription',
			help='write video description to a .description file', default=False)
	filesystem.add_option('--write-info-json',
			action='store_true', dest='writeinfojson',
			help='write video metadata to a .info.json file', default=False)


	postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
			help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
	postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
			help='"best", "aac" or "mp3"; best by default')

	# Group registration order determines the order in --help output.
	parser.add_option_group(general)
	parser.add_option_group(selection)
	parser.add_option_group(filesystem)
	parser.add_option_group(verbosity)
	parser.add_option_group(video_format)
	parser.add_option_group(authentication)
	parser.add_option_group(postproc)

	opts, args = parser.parse_args()

	return parser, opts, args
3570
def gen_extractors():
	"""Instantiate every supported information extractor.

	Order is significant: the first extractor that matches a URL handles
	it, so the catch-all GenericIE has to stay last.
	"""
	youtube = YoutubeIE()
	google = GoogleIE()
	yahoo = YahooIE()
	extractors = [
		youtube,
		MetacafeIE(youtube),
		DailymotionIE(),
		YoutubePlaylistIE(youtube),
		YoutubeUserIE(youtube),
		YoutubeSearchIE(youtube),
		google,
		GoogleSearchIE(google),
		PhotobucketIE(),
		yahoo,
		YahooSearchIE(yahoo),
		DepositFilesIE(),
		FacebookIE(),
		BlipTVIE(),
		VimeoIE(),
		MyVideoIE(),
		ComedyCentralIE(),
		EscapistIE(),
		GenericIE(),
	]
	return extractors
3600
3601 def main():
3602         parser, opts, args = parseOpts()
3603
3604         # Open appropriate CookieJar
3605         if opts.cookiefile is None:
3606                 jar = cookielib.CookieJar()
3607         else:
3608                 try:
3609                         jar = cookielib.MozillaCookieJar(opts.cookiefile)
3610                         if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3611                                 jar.load()
3612                 except (IOError, OSError), err:
3613                         sys.exit(u'ERROR: unable to open cookie file')
3614
3615         # Dump user agent
3616         if opts.dump_user_agent:
3617                 print std_headers['User-Agent']
3618                 sys.exit(0)
3619
3620         # Batch file verification
3621         batchurls = []
3622         if opts.batchfile is not None:
3623                 try:
3624                         if opts.batchfile == '-':
3625                                 batchfd = sys.stdin
3626                         else:
3627                                 batchfd = open(opts.batchfile, 'r')
3628                         batchurls = batchfd.readlines()
3629                         batchurls = [x.strip() for x in batchurls]
3630                         batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3631                 except IOError:
3632                         sys.exit(u'ERROR: batch file could not be read')
3633         all_urls = batchurls + args
3634
3635         # General configuration
3636         cookie_processor = urllib2.HTTPCookieProcessor(jar)
3637         opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
3638         urllib2.install_opener(opener)
3639         socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3640
3641         extractors = gen_extractors()
3642
3643         if opts.list_extractors:
3644                 for ie in extractors:
3645                         print(ie.IE_NAME)
3646                         matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
3647                         all_urls = filter(lambda url: url not in matchedUrls, all_urls)
3648                         for mu in matchedUrls:
3649                                 print(u'  ' + mu)
3650                 sys.exit(0)
3651
3652         # Conflicting, missing and erroneous options
3653         if opts.usenetrc and (opts.username is not None or opts.password is not None):
3654                 parser.error(u'using .netrc conflicts with giving username/password')
3655         if opts.password is not None and opts.username is None:
3656                 parser.error(u'account username missing')
3657         if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3658                 parser.error(u'using output template conflicts with using title, literal title or auto number')
3659         if opts.usetitle and opts.useliteral:
3660                 parser.error(u'using title conflicts with using literal title')
3661         if opts.username is not None and opts.password is None:
3662                 opts.password = getpass.getpass(u'Type account password and press return:')
3663         if opts.ratelimit is not None:
3664                 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3665                 if numeric_limit is None:
3666                         parser.error(u'invalid rate limit specified')
3667                 opts.ratelimit = numeric_limit
3668         if opts.retries is not None:
3669                 try:
3670                         opts.retries = long(opts.retries)
3671                 except (TypeError, ValueError), err:
3672                         parser.error(u'invalid retry count specified')
3673         try:
3674                 opts.playliststart = int(opts.playliststart)
3675                 if opts.playliststart <= 0:
3676                         raise ValueError(u'Playlist start must be positive')
3677         except (TypeError, ValueError), err:
3678                 parser.error(u'invalid playlist start number specified')
3679         try:
3680                 opts.playlistend = int(opts.playlistend)
3681                 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3682                         raise ValueError(u'Playlist end must be greater than playlist start')
3683         except (TypeError, ValueError), err:
3684                 parser.error(u'invalid playlist end number specified')
3685         if opts.extractaudio:
3686                 if opts.audioformat not in ['best', 'aac', 'mp3']:
3687                         parser.error(u'invalid audio format specified')
3688
3689         # File downloader
3690         fd = FileDownloader({
3691                 'usenetrc': opts.usenetrc,
3692                 'username': opts.username,
3693                 'password': opts.password,
3694                 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3695                 'forceurl': opts.geturl,
3696                 'forcetitle': opts.gettitle,
3697                 'forcethumbnail': opts.getthumbnail,
3698                 'forcedescription': opts.getdescription,
3699                 'forcefilename': opts.getfilename,
3700                 'simulate': opts.simulate,
3701                 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3702                 'format': opts.format,
3703                 'format_limit': opts.format_limit,
3704                 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3705                         or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3706                         or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3707                         or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3708                         or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3709                         or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3710                         or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3711                         or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3712                         or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3713                         or u'%(id)s.%(ext)s'),
3714                 'ignoreerrors': opts.ignoreerrors,
3715                 'ratelimit': opts.ratelimit,
3716                 'nooverwrites': opts.nooverwrites,
3717                 'retries': opts.retries,
3718                 'continuedl': opts.continue_dl,
3719                 'noprogress': opts.noprogress,
3720                 'playliststart': opts.playliststart,
3721                 'playlistend': opts.playlistend,
3722                 'logtostderr': opts.outtmpl == '-',
3723                 'consoletitle': opts.consoletitle,
3724                 'nopart': opts.nopart,
3725                 'updatetime': opts.updatetime,
3726                 'writedescription': opts.writedescription,
3727                 'writeinfojson': opts.writeinfojson,
3728                 'matchtitle': opts.matchtitle,
3729                 'rejecttitle': opts.rejecttitle,
3730                 })
3731         for extractor in extractors:
3732                 fd.add_info_extractor(extractor)
3733
3734         # PostProcessors
3735         if opts.extractaudio:
3736                 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
3737
3738         # Update version
3739         if opts.update_self:
3740                 updateSelf(fd, sys.argv[0])
3741
3742         # Maybe do nothing
3743         if len(all_urls) < 1:
3744                 if not opts.update_self:
3745                         parser.error(u'you must provide at least one URL')
3746                 else:
3747                         sys.exit()
3748         retcode = fd.download(all_urls)
3749
3750         # Dump cookie jar if requested
3751         if opts.cookiefile is not None:
3752                 try:
3753                         jar.save()
3754                 except (IOError, OSError), err:
3755                         sys.exit(u'ERROR: unable to save cookie jar')
3756
3757         sys.exit(retcode)
3758
3759
# Script entry point: run main() and translate the library's exceptions
# into exit codes / messages for the shell.
if __name__ == '__main__':
	try:
		main()
	except DownloadError:
		# The downloader already printed the error details; just
		# signal failure with a nonzero status.
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')
3769
3770 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: