34e86e027979f84bc983aa8a71dbc5e71e994b52
[youtube-dl.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 __author__  = (
5         'Ricardo Garcia Gonzalez',
6         'Danny Colligan',
7         'Benjamin Johnson',
8         'Vasyl\' Vavrychuk',
9         'Witold Baryluk',
10         'Paweł Paprota',
11         'Gergely Imreh',
12         'Rogério Brito',
13         'Philipp Hagemeister',
14         'Sören Schulze',
15         )
16
17 __license__ = 'Public Domain'
18 __version__ = '2011.09.18'
19
20 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
21
22 import cookielib
23 import datetime
24 import gzip
25 import htmlentitydefs
26 import HTMLParser
27 import httplib
28 import locale
29 import math
30 import netrc
31 import os
32 import os.path
33 import re
34 import socket
35 import string
36 import subprocess
37 import sys
38 import time
39 import urllib
40 import urllib2
41 import warnings
42 import zlib
43
44 if os.name == 'nt':
45         import ctypes
46
47 try:
48         import email.utils
49 except ImportError: # Python 2.4
50         import email.Utils
51 try:
52         import cStringIO as StringIO
53 except ImportError:
54         import StringIO
55
56 # parse_qs was moved from the cgi module to the urlparse module recently.
57 try:
58         from urlparse import parse_qs
59 except ImportError:
60         from cgi import parse_qs
61
62 try:
63         import lxml.etree
64 except ImportError:
65         pass # Handled below
66
67 try:
68         import xml.etree.ElementTree
69 except ImportError: # Python<2.5: Not officially supported, but let it slip
70         warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
71
# Default HTTP headers added to every request (see YoutubeDLHandler.http_request).
std_headers = {
	'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	'Accept-Encoding': 'gzip, deflate',
	'Accept-Language': 'en-us,en;q=0.5',
}

# Unicode string of characters considered safe in "simple" titles
# (ASCII letters and digits only).
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
81
try:
	import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
	import re
	class json(object):
		"""Minimal drop-in replacement for the stdlib json module.

		Only implements loads(); the recursive-descent parser below
		supports objects, arrays, strings (including surrogate-pair
		\\uXXXX escapes), numbers, true/false/null.
		"""
		@staticmethod
		def loads(s):
			# Input is expected to be a UTF-8 byte string; work on unicode.
			s = s.decode('UTF-8')
			def raiseError(msg, i):
				# Report the position and the unconsumed remainder for debugging.
				raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
			def skipSpace(i, expectMore=True):
				# Advance past JSON whitespace; optionally fail at end of input.
				while i < len(s) and s[i] in ' \t\r\n':
					i += 1
				if expectMore:
					if i >= len(s):
						raiseError('Premature end', i)
				return i
			def decodeEscape(match):
				# Translate one backslash escape (group 1, without the backslash).
				esc = match.group(1)
				_STATIC = {
					'"': '"',
					'\\': '\\',
					'/': '/',
					'b': unichr(0x8),
					'f': unichr(0xc),
					'n': '\n',
					'r': '\r',
					't': '\t',
				}
				if esc in _STATIC:
					return _STATIC[esc]
				if esc[0] == 'u':
					if len(esc) == 1+4:
						# Plain \uXXXX escape.
						return unichr(int(esc[1:5], 16))
					if len(esc) == 5+6 and esc[5:7] == '\\u':
						# Surrogate pair \uD8xx\uDCxx: combine into one code point.
						hi = int(esc[1:5], 16)
						low = int(esc[7:11], 16)
						return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
				raise ValueError('Unknown escape ' + str(esc))
			def parseString(i):
				i += 1
				e = i
				# Find the closing quote, skipping quotes preceded by an
				# odd number of backslashes (i.e. escaped quotes).
				while True:
					e = s.index('"', e)
					bslashes = 0
					while s[e-bslashes-1] == '\\':
						bslashes += 1
					if bslashes % 2 == 1:
						e += 1
						continue
					break
				# Surrogate pairs first, then \uXXXX, then single-char escapes.
				rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
				stri = rexp.sub(decodeEscape, s[i:e])
				return (e+1,stri)
			def parseObj(i):
				i += 1
				res = {}
				i = skipSpace(i)
				if s[i] == '}': # Empty dictionary
					return (i+1,res)
				while True:
					if s[i] != '"':
						raiseError('Expected a string object key', i)
					i,key = parseString(i)
					i = skipSpace(i)
					if i >= len(s) or s[i] != ':':
						raiseError('Expected a colon', i)
					i,val = parse(i+1)
					res[key] = val
					i = skipSpace(i)
					if s[i] == '}':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected comma or closing curly brace', i)
					i = skipSpace(i+1)
			def parseArray(i):
				res = []
				i = skipSpace(i+1)
				if s[i] == ']': # Empty array
					return (i+1,res)
				while True:
					i,val = parse(i)
					res.append(val)
					i = skipSpace(i) # Raise exception if premature end
					if s[i] == ']':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected a comma or closing bracket', i)
					i = skipSpace(i+1)
			def parseDiscrete(i):
				# The three keyword literals.
				for k,v in {'true': True, 'false': False, 'null': None}.items():
					if s.startswith(k, i):
						return (i+len(k), v)
				raiseError('Not a boolean (or null)', i)
			def parseNumber(i):
				mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
				if mobj is None:
					raiseError('Not a number', i)
				nums = mobj.group(1)
				# Fractions and exponents become floats, everything else ints.
				if '.' in nums or 'e' in nums or 'E' in nums:
					return (i+len(nums), float(nums))
				return (i+len(nums), int(nums))
			# Dispatch on the first character of the value; default is a number.
			CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
			def parse(i):
				i = skipSpace(i)
				i,res = CHARMAP.get(s[i], parseNumber)(i)
				i = skipSpace(i, False)
				return (i,res)
			i,res = parse(0)
			if i < len(s):
				raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
			return res
194
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	Falls back to UTF-8 when the locale's encoding is unusable.
	"""
	# The previous implementation wrapped this in a one-shot generator
	# (yield_preferredencoding().next()) for no benefit; a plain
	# function is equivalent and clearer.
	try:
		pref = locale.getpreferredencoding()
		# Make sure the reported codec actually works before trusting it.
		u'TEST'.encode(pref)
	except Exception:
		# Narrowed from a bare except: we still swallow locale/codec
		# errors, but no longer KeyboardInterrupt/SystemExit.
		pref = 'UTF-8'
	return pref
210
211
212 def htmlentity_transform(matchobj):
213         """Transforms an HTML entity to a Unicode character.
214
215         This function receives a match object and is intended to be used with
216         the re.sub() function.
217         """
218         entity = matchobj.group(1)
219
220         # Known non-numeric HTML entity
221         if entity in htmlentitydefs.name2codepoint:
222                 return unichr(htmlentitydefs.name2codepoint[entity])
223
224         # Unicode character
225         mobj = re.match(ur'(?u)#(x?\d+)', entity)
226         if mobj is not None:
227                 numstr = mobj.group(1)
228                 if numstr.startswith(u'x'):
229                         base = 16
230                         numstr = u'0%s' % numstr
231                 else:
232                         base = 10
233                 return unichr(long(numstr, base))
234
235         # Unknown entity in name, return its literal representation
236         return (u'&%s;' % entity)
237
238
239 def sanitize_title(utitle):
240         """Sanitizes a video title so it could be used as part of a filename."""
241         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
242         return utitle.replace(unicode(os.sep), u'%')
243
244
245 def sanitize_open(filename, open_mode):
246         """Try to open the given filename, and slightly tweak it if this fails.
247
248         Attempts to open the given filename. If this fails, it tries to change
249         the filename slightly, step by step, until it's either able to open it
250         or it fails and raises a final exception, like the standard open()
251         function.
252
253         It returns the tuple (stream, definitive_file_name).
254         """
255         try:
256                 if filename == u'-':
257                         if sys.platform == 'win32':
258                                 import msvcrt
259                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
260                         return (sys.stdout, filename)
261                 stream = open(filename, open_mode)
262                 return (stream, filename)
263         except (IOError, OSError), err:
264                 # In case of error, try to remove win32 forbidden chars
265                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
266
267                 # An exception here should be caught in the caller
268                 stream = open(filename, open_mode)
269                 return (stream, filename)
270
271
def timeconvert(timestr):
	"""Convert RFC 2822 defined time string into system timestamp.

	Returns None when the string cannot be parsed.
	"""
	parsed = email.utils.parsedate_tz(timestr)
	if parsed is None:
		return None
	return email.utils.mktime_tz(parsed)
279
280
class DownloadError(Exception):
	"""Download Error exception.

	This exception may be thrown by FileDownloader objects if they are not
	configured to continue on errors (see the 'ignoreerrors' option).
	They will contain the appropriate error message.
	"""
	pass
289
290
class SameFileError(Exception):
	"""Same File exception.

	This exception will be thrown by FileDownloader objects if they detect
	multiple files would have to be downloaded to the same file on disk
	(e.g. a fixed output template used with several videos).
	"""
	pass
298
299
class PostProcessingError(Exception):
	"""Post Processing exception.

	This exception may be raised by PostProcessor's .run() method to
	indicate an error in the postprocessing task.
	"""
	pass
307
308
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	This exception will be thrown when a video is requested
	in a format that is not available for that video.
	"""
	pass
316
317
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	Raised by FileDownloader objects when a downloaded file is smaller
	than the size the server announced, which indicates the transfer
	was probably interrupted.
	"""
	# Both counters are in bytes; kept as class attributes for
	# compatibility with code that inspects them before __init__ runs.
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		# Record how much arrived versus how much was promised.
		self.downloaded = downloaded
		self.expected = expected
332
333
class YoutubeDLHandler(urllib2.HTTPHandler):
	"""Handler for HTTP requests and responses.

	This class, when installed with an OpenerDirector, automatically adds
	the standard headers to every HTTP request and handles gzipped and
	deflated responses from web servers. If compression is to be avoided in
	a particular request, the original request in the program code only has
	to include the HTTP header "Youtubedl-No-Compression", which will be
	removed before making the real request.

	Part of this code was copied from:

	http://techknack.net/python-urllib2-handlers/

	Andrew Rowls, the author of that code, agreed to release it to the
	public domain.
	"""

	@staticmethod
	def deflate(data):
		# Try a raw deflate stream first; fall back to a zlib-wrapped
		# stream if that fails.
		try:
			return zlib.decompress(data, -zlib.MAX_WBITS)
		except zlib.error:
			return zlib.decompress(data)

	@staticmethod
	def addinfourl_wrapper(stream, headers, url, code):
		# Older urllib2.addinfourl has no 'code' constructor argument;
		# set the attribute by hand in that case.
		if hasattr(urllib2.addinfourl, 'getcode'):
			return urllib2.addinfourl(stream, headers, url, code)
		ret = urllib2.addinfourl(stream, headers, url)
		ret.code = code
		return ret

	def http_request(self, req):
		# Force std_headers onto the request, replacing any caller-set
		# values for the same header names.
		for h in std_headers:
			if h in req.headers:
				del req.headers[h]
			req.add_header(h, std_headers[h])
		# Marker header: strip Accept-encoding so the server sends an
		# uncompressed body, then remove the marker itself.
		# (urllib2 capitalizes header names, hence the exact casing.)
		if 'Youtubedl-no-compression' in req.headers:
			if 'Accept-encoding' in req.headers:
				del req.headers['Accept-encoding']
			del req.headers['Youtubedl-no-compression']
		return req

	def http_response(self, req, resp):
		old_resp = resp
		# gzip: wrap the body in a GzipFile so reads decompress lazily.
		if resp.headers.get('Content-encoding', '') == 'gzip':
			gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		# deflate: decompress eagerly into a StringIO buffer.
		if resp.headers.get('Content-encoding', '') == 'deflate':
			gz = StringIO.StringIO(self.deflate(resp.read()))
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		return resp
391
392
393 class FileDownloader(object):
394         """File Downloader class.
395
396         File downloader objects are the ones responsible of downloading the
397         actual video file and writing it to disk if the user has requested
398         it, among some other tasks. In most cases there should be one per
399         program. As, given a video URL, the downloader doesn't know how to
400         extract all the needed information, task that InfoExtractors do, it
401         has to pass the URL to one of them.
402
403         For this, file downloader objects have a method that allows
404         InfoExtractors to be registered in a given order. When it is passed
405         a URL, the file downloader handles it to the first InfoExtractor it
406         finds that reports being able to handle it. The InfoExtractor extracts
407         all the information about the video or videos the URL refers to, and
408         asks the FileDownloader to process the video information, possibly
409         downloading the video.
410
411         File downloaders accept a lot of parameters. In order not to saturate
412         the object constructor with arguments, it receives a dictionary of
413         options instead. These options are available through the params
414         attribute for the InfoExtractors to use. The FileDownloader also
415         registers itself as the downloader in charge for the InfoExtractors
416         that are added to it, so this is a "mutual registration".
417
418         Available options:
419
420         username:         Username for authentication purposes.
421         password:         Password for authentication purposes.
422         usenetrc:         Use netrc for authentication instead.
423         quiet:            Do not print messages to stdout.
424         forceurl:         Force printing final URL.
425         forcetitle:       Force printing title.
426         forcethumbnail:   Force printing thumbnail URL.
427         forcedescription: Force printing description.
428         forcefilename:    Force printing final filename.
429         simulate:         Do not download the video files.
430         format:           Video format code.
431         format_limit:     Highest quality format to try.
432         outtmpl:          Template for output names.
433         ignoreerrors:     Do not stop on download errors.
434         ratelimit:        Download speed limit, in bytes/sec.
435         nooverwrites:     Prevent overwriting files.
436         retries:          Number of times to retry for HTTP error 5xx
437         continuedl:       Try to continue downloads if possible.
438         noprogress:       Do not print the progress bar.
439         playliststart:    Playlist item to start at.
440         playlistend:      Playlist item to end at.
441         matchtitle:       Download only matching titles.
442         rejecttitle:      Reject downloads for matching titles.
443         logtostderr:      Log messages to stderr instead of stdout.
444         consoletitle:     Display progress in console window's titlebar.
445         nopart:           Do not use temporary .part files.
446         updatetime:       Use the Last-modified header to set output file timestamps.
447         writedescription: Write the video description to a .description file
448         writeinfojson:    Write the video description to a .info.json file
449         """
450
	# Class-level defaults; every instance overwrites these in __init__.
	params = None # Option dictionary (see class docstring)
	_ies = [] # List of registered InfoExtractors
	_pps = [] # List of registered PostProcessors
	_download_retcode = None # Process exit code to report
	_num_downloads = None # Ordinal of the current download (for %(autonumber)s)
	_screen_file = None # Stream used by to_screen (stdout or stderr)
457
458         def __init__(self, params):
459                 """Create a FileDownloader object with the given options."""
460                 self._ies = []
461                 self._pps = []
462                 self._download_retcode = 0
463                 self._num_downloads = 0
464                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
465                 self.params = params
466
467         @staticmethod
468         def format_bytes(bytes):
469                 if bytes is None:
470                         return 'N/A'
471                 if type(bytes) is str:
472                         bytes = float(bytes)
473                 if bytes == 0.0:
474                         exponent = 0
475                 else:
476                         exponent = long(math.log(bytes, 1024.0))
477                 suffix = 'bkMGTPEZY'[exponent]
478                 converted = float(bytes) / float(1024 ** exponent)
479                 return '%.2f%s' % (converted, suffix)
480
481         @staticmethod
482         def calc_percent(byte_counter, data_len):
483                 if data_len is None:
484                         return '---.-%'
485                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
486
487         @staticmethod
488         def calc_eta(start, now, total, current):
489                 if total is None:
490                         return '--:--'
491                 dif = now - start
492                 if current == 0 or dif < 0.001: # One millisecond
493                         return '--:--'
494                 rate = float(current) / dif
495                 eta = long((float(total) - float(current)) / rate)
496                 (eta_mins, eta_secs) = divmod(eta, 60)
497                 if eta_mins > 99:
498                         return '--:--'
499                 return '%02d:%02d' % (eta_mins, eta_secs)
500
501         @staticmethod
502         def calc_speed(start, now, bytes):
503                 dif = now - start
504                 if bytes == 0 or dif < 0.001: # One millisecond
505                         return '%10s' % '---b/s'
506                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
507
508         @staticmethod
509         def best_block_size(elapsed_time, bytes):
510                 new_min = max(bytes / 2.0, 1.0)
511                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
512                 if elapsed_time < 0.001:
513                         return long(new_max)
514                 rate = bytes / elapsed_time
515                 if rate > new_max:
516                         return long(new_max)
517                 if rate < new_min:
518                         return long(new_min)
519                 return long(rate)
520
521         @staticmethod
522         def parse_bytes(bytestr):
523                 """Parse a string indicating a byte quantity into a long integer."""
524                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
525                 if matchobj is None:
526                         return None
527                 number = float(matchobj.group(1))
528                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
529                 return long(round(number * multiplier))
530
	def add_info_extractor(self, ie):
		"""Add an InfoExtractor object to the end of the list."""
		self._ies.append(ie)
		# Mutual registration: the IE needs a reference back to us.
		ie.set_downloader(self)
535
	def add_post_processor(self, pp):
		"""Add a PostProcessor object to the end of the chain."""
		self._pps.append(pp)
		# Mutual registration: the PP needs a reference back to us.
		pp.set_downloader(self)
540
	def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout (or stderr, see __init__) if not in quiet mode."""
		try:
			if not self.params.get('quiet', False):
				# skip_eol suppresses the newline so progress lines
				# can be rewritten in place with \r.
				terminator = [u'\n', u''][skip_eol]
				print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
			# Note: flushed even in quiet mode, so earlier output is
			# not left sitting in the buffer.
			self._screen_file.flush()
		except (UnicodeEncodeError), err:
			if not ignore_encoding_errors:
				raise
551
	def to_stderr(self, message):
		"""Print message to stderr, encoded with the preferred encoding."""
		print >>sys.stderr, message.encode(preferredencoding())
555
	def to_cons_title(self, message):
		"""Set console/terminal window title to message (if consoletitle is set)."""
		if not self.params.get('consoletitle', False):
			return
		if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
			# c_wchar_p() might not be necessary if `message` is
			# already of type unicode()
			ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
		elif 'TERM' in os.environ:
			# xterm-style escape sequence: ESC ] 0 ; <title> BEL
			sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
566
	def fixed_template(self):
		"""Checks if the output template is fixed.

		"Fixed" means it contains no %(field)s placeholders, so every
		download would use the same filename.
		"""
		return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
570
	def trouble(self, message=None):
		"""Determine action to take when a download problem appears.

		Depending on if the downloader has been configured to ignore
		download errors or not, this method may throw an exception or
		not when errors are found, after printing the message.
		"""
		if message is not None:
			self.to_stderr(message)
		if not self.params.get('ignoreerrors', False):
			raise DownloadError(message)
		# Only reached when errors are ignored: remember the failure
		# for the final process exit code.
		self._download_retcode = 1
583
584         def slow_down(self, start_time, byte_counter):
585                 """Sleep if the download speed is over the rate limit."""
586                 rate_limit = self.params.get('ratelimit', None)
587                 if rate_limit is None or byte_counter == 0:
588                         return
589                 now = time.time()
590                 elapsed = now - start_time
591                 if elapsed <= 0.0:
592                         return
593                 speed = float(byte_counter) / elapsed
594                 if speed > rate_limit:
595                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
596
597         def temp_name(self, filename):
598                 """Returns a temporary filename for the given filename."""
599                 if self.params.get('nopart', False) or filename == u'-' or \
600                                 (os.path.exists(filename) and not os.path.isfile(filename)):
601                         return filename
602                 return filename + u'.part'
603
604         def undo_temp_name(self, filename):
605                 if filename.endswith(u'.part'):
606                         return filename[:-len(u'.part')]
607                 return filename
608
609         def try_rename(self, old_filename, new_filename):
610                 try:
611                         if old_filename == new_filename:
612                                 return
613                         os.rename(old_filename, new_filename)
614                 except (IOError, OSError), err:
615                         self.trouble(u'ERROR: unable to rename file')
616
617         def try_utime(self, filename, last_modified_hdr):
618                 """Try to set the last-modified time of the given file."""
619                 if last_modified_hdr is None:
620                         return
621                 if not os.path.isfile(filename):
622                         return
623                 timestr = last_modified_hdr
624                 if timestr is None:
625                         return
626                 filetime = timeconvert(timestr)
627                 if filetime is None:
628                         return filetime
629                 try:
630                         os.utime(filename, (time.time(), filetime))
631                 except:
632                         pass
633                 return filetime
634
	def report_writedescription(self, descfn):
		""" Report that the description file is being written """
		# Encoding errors are ignored: the filename may not be printable.
		self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
638
	def report_writeinfojson(self, infofn):
		""" Report that the metadata .info.json file is being written """
		self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
642
	def report_destination(self, filename):
		"""Report destination filename."""
		self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
646
	def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
		"""Report download progress."""
		if self.params.get('noprogress', False):
			return
		# \r rewrites the current line in place; skip_eol keeps the
		# cursor on that line for the next update.
		self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
				(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
		self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
				(percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
655
	def report_resuming_byte(self, resume_len):
		"""Report attempt to resume at given byte."""
		self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
659
	def report_retry(self, count, retries):
		"""Report retry in case of HTTP error 5xx"""
		self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
663
	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		try:
			self.to_screen(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			# Fall back to a message without the (unencodable) filename.
			self.to_screen(u'[download] The file has already been downloaded')
670
	def report_unable_to_resume(self):
		"""Report it was impossible to resume download."""
		self.to_screen(u'[download] Unable to resume')
674
675         def report_finish(self):
676                 """Report download finished."""
677                 if self.params.get('noprogress', False):
678                         self.to_screen(u'[download] Download completed')
679                 else:
680                         self.to_screen(u'')
681
682         def increment_downloads(self):
683                 """Increment the ordinal that assigns a number to each file."""
684                 self._num_downloads += 1
685
686         def prepare_filename(self, info_dict):
687                 """Generate the output filename."""
688                 try:
689                         template_dict = dict(info_dict)
690                         template_dict['epoch'] = unicode(long(time.time()))
691                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
692                         filename = self.params['outtmpl'] % template_dict
693                         return filename
694                 except (ValueError, KeyError), err:
695                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
696                         return None
697
	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor.

		Runs the full pipeline for one video: forced metadata printing,
		simulate mode, title match/reject filtering, overwrite protection,
		optional description/JSON sidecar files, the download itself and
		the postprocessing chain.
		"""
		filename = self.prepare_filename(info_dict)

		# Forced printings
		if self.params.get('forcetitle', False):
			print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forceurl', False):
			print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
			print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcedescription', False) and 'description' in info_dict:
			print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcefilename', False) and filename is not None:
			print filename.encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forceformat', False):
			print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')

		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			return

		# prepare_filename() already reported the error in this case
		if filename is None:
			return

		# Skip videos whose title fails the match/reject filters
		# (case-insensitive regular expressions).
		matchtitle=self.params.get('matchtitle',False)
		rejecttitle=self.params.get('rejecttitle',False)
		title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
		if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
			self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
			return
		if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
			self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
			return

		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists and will be skipped')
			return

		# Create the containing directory if needed
		try:
			dn = os.path.dirname(filename)
			if dn != '' and not os.path.exists(dn):
				os.makedirs(dn)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to create directory ' + unicode(err))
			return

		# Optionally write the video description next to the video file
		if self.params.get('writedescription', False):
			try:
				descfn = filename + '.description'
				self.report_writedescription(descfn)
				descfile = open(descfn, 'wb')
				try:
					descfile.write(info_dict['description'].encode('utf-8'))
				finally:
					descfile.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write description file ' + descfn)
				return

		# Optionally dump the whole info dictionary as JSON
		if self.params.get('writeinfojson', False):
			infofn = filename + '.info.json'
			self.report_writeinfojson(infofn)
			# Probe that a usable json module is bound; it is absent on
			# old interpreters unless an alternative was installed.
			try:
				json.dump
			except (NameError,AttributeError):
				self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
				return
			try:
				infof = open(infofn, 'wb')
				try:
					json.dump(info_dict, infof)
				finally:
					infof.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
				return

		if not self.params.get('skip_download', False):
			try:
				success = self._do_download(filename, info_dict)
			except (OSError, IOError), err:
				raise UnavailableVideoError
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self.trouble(u'ERROR: unable to download video data: %s' % str(err))
				return
			except (ContentTooShortError, ), err:
				self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
				return

			# Run the postprocessing chain only on a successful download
			if success:
				try:
					self.post_process(filename, info_dict)
				except (PostProcessingError), err:
					self.trouble(u'ERROR: postprocessing: %s' % str(err))
					return
794
795         def download(self, url_list):
796                 """Download a given list of URLs."""
797                 if len(url_list) > 1 and self.fixed_template():
798                         raise SameFileError(self.params['outtmpl'])
799
800                 for url in url_list:
801                         suitable_found = False
802                         for ie in self._ies:
803                                 # Go to next InfoExtractor if not suitable
804                                 if not ie.suitable(url):
805                                         continue
806
807                                 # Suitable InfoExtractor found
808                                 suitable_found = True
809
810                                 # Extract information from URL and process it
811                                 ie.extract(url)
812
813                                 # Suitable InfoExtractor had been found; go to next URL
814                                 break
815
816                         if not suitable_found:
817                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
818
819                 return self._download_retcode
820
821         def post_process(self, filename, ie_info):
822                 """Run the postprocessing chain on the given file."""
823                 info = dict(ie_info)
824                 info['filepath'] = filename
825                 for pp in self._pps:
826                         info = pp.run(info)
827                         if info is None:
828                                 break
829
830         def _download_with_rtmpdump(self, filename, url, player_url):
831                 self.report_destination(filename)
832                 tmpfilename = self.temp_name(filename)
833
834                 # Check for rtmpdump first
835                 try:
836                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
837                 except (OSError, IOError):
838                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
839                         return False
840
841                 # Download using rtmpdump. rtmpdump returns exit code 2 when
842                 # the connection was interrumpted and resuming appears to be
843                 # possible. This is part of rtmpdump's normal usage, AFAIK.
844                 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
845                 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
846                 while retval == 2 or retval == 1:
847                         prevsize = os.path.getsize(tmpfilename)
848                         self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
849                         time.sleep(5.0) # This seems to be needed
850                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
851                         cursize = os.path.getsize(tmpfilename)
852                         if prevsize == cursize and retval == 1:
853                                 break
854                          # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
855                         if prevsize == cursize and retval == 2 and cursize > 1024:
856                                 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
857                                 retval = 0
858                                 break
859                 if retval == 0:
860                         self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
861                         self.try_rename(tmpfilename, filename)
862                         return True
863                 else:
864                         self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
865                         return False
866
	def _do_download(self, filename, info_dict):
		"""Download info_dict['url'] to filename.

		Dispatches RTMP URLs to rtmpdump; otherwise downloads over HTTP
		with support for resuming (Range header), retries on HTTP 5xx,
		adaptive block sizing, progress reporting and rate limiting.
		Returns True on success, False on failure; may raise
		ContentTooShortError when the server sent fewer bytes than
		announced.
		"""
		url = info_dict['url']
		player_url = info_dict.get('player_url', None)

		# Check file already present
		if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
			self.report_file_already_downloaded(filename)
			return True

		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url, player_url)

		tmpfilename = self.temp_name(filename)
		stream = None

		# Do not include the Accept-Encoding header
		headers = {'Youtubedl-no-compression': 'True'}
		# basic_request is kept without the Range header for the
		# resume-failure (416) probe below.
		basic_request = urllib2.Request(url, None, headers)
		request = urllib2.Request(url, None, headers)

		# Establish possible resume length
		if os.path.isfile(tmpfilename):
			resume_len = os.path.getsize(tmpfilename)
		else:
			resume_len = 0

		open_mode = 'wb'
		if resume_len != 0:
			if self.params.get('continuedl', False):
				self.report_resuming_byte(resume_len)
				request.add_header('Range','bytes=%d-' % resume_len)
				open_mode = 'ab'
			else:
				resume_len = 0

		count = 0
		retries = self.params.get('retries', 0)
		while count <= retries:
			# Establish connection
			try:
				data = urllib2.urlopen(request)
				break
			except (urllib2.HTTPError, ), err:
				if (err.code < 500 or err.code >= 600) and err.code != 416:
					# Unexpected HTTP error
					raise
				elif err.code == 416:
					# Unable to resume (requested range not satisfiable)
					try:
						# Open the connection again without the range header
						data = urllib2.urlopen(basic_request)
						content_length = data.info()['Content-Length']
					except (urllib2.HTTPError, ), err:
						if err.code < 500 or err.code >= 600:
							raise
					else:
						# Examine the reported length
						if (content_length is not None and
								(resume_len - 100 < long(content_length) < resume_len + 100)):
							# The file had already been fully downloaded.
							# Explanation to the above condition: in issue #175 it was revealed that
							# YouTube sometimes adds or removes a few bytes from the end of the file,
							# changing the file size slightly and causing problems for some users. So
							# I decided to implement a suggested change and consider the file
							# completely downloaded if the file size differs less than 100 bytes from
							# the one in the hard drive.
							self.report_file_already_downloaded(filename)
							self.try_rename(tmpfilename, filename)
							return True
						else:
							# The length does not match, we start the download over
							self.report_unable_to_resume()
							open_mode = 'wb'
							break
			# Retry
			count += 1
			if count <= retries:
				self.report_retry(count, retries)

		if count > retries:
			self.trouble(u'ERROR: giving up after %s retries' % retries)
			return False

		data_len = data.info().get('Content-length', None)
		if data_len is not None:
			# Account for the bytes already on disk when resuming.
			data_len = long(data_len) + resume_len
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0 + resume_len
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			if len(data_block) == 0:
				break
			byte_counter += len(data_block)

			# Open file just in time
			if stream is None:
				try:
					(stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
					assert stream is not None
					filename = self.undo_temp_name(tmpfilename)
					self.report_destination(filename)
				except (OSError, IOError), err:
					self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
					return False
			try:
				stream.write(data_block)
			except (IOError, OSError), err:
				self.trouble(u'\nERROR: unable to write data: %s' % str(err))
				return False
			# Adapt the block size to the measured transfer speed.
			block_size = self.best_block_size(after - before, len(data_block))

			# Progress message
			percent_str = self.calc_percent(byte_counter, data_len)
			eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
			speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
			self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter - resume_len)

		if stream is None:
			self.trouble(u'\nERROR: Did not get any data blocks')
			return False
		stream.close()
		self.report_finish()
		if data_len is not None and byte_counter != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		self.try_rename(tmpfilename, filename)

		# Update file modification time
		if self.params.get('updatetime', True):
			info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))

		return True
1007
1008
class InfoExtractor(object):
	"""Base class for all information extractors.

	An information extractor takes a URL, inspects the video (or videos)
	it refers to and produces one dictionary per video, which is then
	handed to the FileDownloader.  The FileDownloader may download the
	video to the file system, print metadata, run postprocessors, among
	other possible outcomes.  Every dictionary must carry the following
	keys:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	The following keys are optional and mainly exist so youtube-dl can
	serve as the backend of a video search function, such as the one in
	youtube2mp3; they are only read by the forced printing helpers:

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Subclasses should override _real_initialize() and _real_extract()
	and define a _VALID_URL regexp, and will usually also be added to
	the list of extractors.
	"""

	# Whether _real_initialize() has already run for this instance.
	_ready = False
	# FileDownloader this extractor reports to (may be None).
	_downloader = None

	def __init__(self, downloader=None):
		"""Create the extractor, optionally attaching a downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Return True when this extractor can handle the given URL."""
		match = re.match(self._VALID_URL, url)
		return match is not None

	def initialize(self):
		"""Run one-time setup (authentication, etc.) exactly once."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extract URL information and return it in a list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Attach the FileDownloader this extractor should use."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
1077
1078
1079 class YoutubeIE(InfoExtractor):
1080         """Information extractor for youtube.com."""
1081
	# Accepts watch/embed/v/e URLs on youtube.com, youtube-nocookie.com
	# and youtu.be; the video id is captured in group 2.
	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
	# Forces the English/US interface so page scraping sees stable text.
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	# Machine name looked up in ~/.netrc for stored credentials.
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '45', '22', '43', '35', '34', '18', '6', '5', '17', '13']
	# Maps format codes to container extensions; codes absent here
	# presumably fall back to a default elsewhere -- TODO confirm.
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'45': 'webm',
	}
	IE_NAME = u'youtube'
1100
1101         def report_lang(self):
1102                 """Report attempt to set language."""
1103                 self._downloader.to_screen(u'[youtube] Setting language')
1104
1105         def report_login(self):
1106                 """Report attempt to log in."""
1107                 self._downloader.to_screen(u'[youtube] Logging in')
1108
1109         def report_age_confirmation(self):
1110                 """Report attempt to confirm age."""
1111                 self._downloader.to_screen(u'[youtube] Confirming age')
1112
1113         def report_video_webpage_download(self, video_id):
1114                 """Report attempt to download video webpage."""
1115                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1116
1117         def report_video_info_webpage_download(self, video_id):
1118                 """Report attempt to download video info webpage."""
1119                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1120
1121         def report_information_extraction(self, video_id):
1122                 """Report attempt to extract video information."""
1123                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1124
1125         def report_unavailable_format(self, video_id, format):
1126                 """Report extracted video URL."""
1127                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1128
1129         def report_rtmp_download(self):
1130                 """Indicate the download will use the RTMP protocol."""
1131                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1132
	def _real_initialize(self):
		"""Set the YouTube interface language and, when credentials are
		available (command line options or .netrc), log in and confirm
		age.  All failures are reported through the downloader; login
		problems are warnings, age confirmation failure is an error.
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language; on failure warn and skip the rest of initialization
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':         '/',
				'action_login': 'Log In',
				'username':     username,
				'password':     password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the login form is still present in the response, the
			# credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':             '/',
				'action_confirm':       'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return
1201
1202         def _real_extract(self, url):
1203                 # Extract video id from URL
1204                 mobj = re.match(self._VALID_URL, url)
1205                 if mobj is None:
1206                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1207                         return
1208                 video_id = mobj.group(2)
1209
1210                 # Get video webpage
1211                 self.report_video_webpage_download(video_id)
1212                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id)
1213                 try:
1214                         video_webpage = urllib2.urlopen(request).read()
1215                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1216                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1217                         return
1218
1219                 # Attempt to extract SWF player URL
1220                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1221                 if mobj is not None:
1222                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1223                 else:
1224                         player_url = None
1225
1226                 # Get video info
1227                 self.report_video_info_webpage_download(video_id)
1228                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1229                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1230                                         % (video_id, el_type))
1231                         request = urllib2.Request(video_info_url)
1232                         try:
1233                                 video_info_webpage = urllib2.urlopen(request).read()
1234                                 video_info = parse_qs(video_info_webpage)
1235                                 if 'token' in video_info:
1236                                         break
1237                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1238                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1239                                 return
1240                 if 'token' not in video_info:
1241                         if 'reason' in video_info:
1242                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1243                         else:
1244                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1245                         return
1246
1247                 # Start extracting information
1248                 self.report_information_extraction(video_id)
1249
1250                 # uploader
1251                 if 'author' not in video_info:
1252                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1253                         return
1254                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1255
1256                 # title
1257                 if 'title' not in video_info:
1258                         self._downloader.trouble(u'ERROR: unable to extract video title')
1259                         return
1260                 video_title = urllib.unquote_plus(video_info['title'][0])
1261                 video_title = video_title.decode('utf-8')
1262                 video_title = sanitize_title(video_title)
1263
1264                 # simplified title
1265                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1266                 simple_title = simple_title.strip(ur'_')
1267
1268                 # thumbnail image
1269                 if 'thumbnail_url' not in video_info:
1270                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1271                         video_thumbnail = ''
1272                 else:   # don't panic if we can't find it
1273                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1274
1275                 # upload date
1276                 upload_date = u'NA'
1277                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1278                 if mobj is not None:
1279                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1280                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1281                         for expression in format_expressions:
1282                                 try:
1283                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1284                                 except:
1285                                         pass
1286
1287                 # description
1288                 try:
1289                         lxml.etree
1290                 except NameError:
1291                         video_description = u'No description available.'
1292                         if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1293                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1294                                 if mobj is not None:
1295                                         video_description = mobj.group(1).decode('utf-8')
1296                 else:
1297                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1298                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1299                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1300                         # TODO use another parser
1301
1302                 # token
1303                 video_token = urllib.unquote_plus(video_info['token'][0])
1304
1305                 # Decide which formats to download
1306                 req_format = self._downloader.params.get('format', None)
1307
1308                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1309                         self.report_rtmp_download()
1310                         video_url_list = [(None, video_info['conn'][0])]
1311                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1312                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1313                         url_data = [parse_qs(uds) for uds in url_data_strs]
1314                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1315                         url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1316
1317                         format_limit = self._downloader.params.get('format_limit', None)
1318                         if format_limit is not None and format_limit in self._available_formats:
1319                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1320                         else:
1321                                 format_list = self._available_formats
1322                         existing_formats = [x for x in format_list if x in url_map]
1323                         if len(existing_formats) == 0:
1324                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1325                                 return
1326                         if req_format is None or req_format == 'best':
1327                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1328                         elif req_format == 'worst':
1329                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1330                         elif req_format in ('-1', 'all'):
1331                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1332                         else:
1333                                 # Specific formats. We pick the first in a slash-delimeted sequence.
1334                                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1335                                 req_formats = req_format.split('/')
1336                                 video_url_list = None
1337                                 for rf in req_formats:
1338                                         if rf in url_map:
1339                                                 video_url_list = [(rf, url_map[rf])]
1340                                                 break
1341                                 if video_url_list is None:
1342                                         self._downloader.trouble(u'ERROR: requested format not available')
1343                                         return
1344                 else:
1345                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1346                         return
1347
1348                 for format_param, video_real_url in video_url_list:
1349                         # At this point we have a new video
1350                         self._downloader.increment_downloads()
1351
1352                         # Extension
1353                         video_extension = self._video_extensions.get(format_param, 'flv')
1354
1355                         try:
1356                                 # Process video information
1357                                 self._downloader.process_info({
1358                                         'id':           video_id.decode('utf-8'),
1359                                         'url':          video_real_url.decode('utf-8'),
1360                                         'uploader':     video_uploader.decode('utf-8'),
1361                                         'upload_date':  upload_date,
1362                                         'title':        video_title,
1363                                         'stitle':       simple_title,
1364                                         'ext':          video_extension.decode('utf-8'),
1365                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1366                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1367                                         'description':  video_description,
1368                                         'player_url':   player_url,
1369                                 })
1370                         except UnavailableVideoError, err:
1371                                 self._downloader.trouble(u'\nERROR: unable to download video')
1372
1373
1374 class MetacafeIE(InfoExtractor):
1375         """Information Extractor for metacafe.com."""
1376
1377         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1378         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1379         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1380         _youtube_ie = None
1381         IE_NAME = u'metacafe'
1382
1383         def __init__(self, youtube_ie, downloader=None):
1384                 InfoExtractor.__init__(self, downloader)
1385                 self._youtube_ie = youtube_ie
1386
1387         def report_disclaimer(self):
1388                 """Report disclaimer retrieval."""
1389                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1390
1391         def report_age_confirmation(self):
1392                 """Report attempt to confirm age."""
1393                 self._downloader.to_screen(u'[metacafe] Confirming age')
1394
1395         def report_download_webpage(self, video_id):
1396                 """Report webpage download."""
1397                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1398
1399         def report_extraction(self, video_id):
1400                 """Report information extraction."""
1401                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1402
1403         def _real_initialize(self):
1404                 # Retrieve disclaimer
1405                 request = urllib2.Request(self._DISCLAIMER)
1406                 try:
1407                         self.report_disclaimer()
1408                         disclaimer = urllib2.urlopen(request).read()
1409                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1410                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1411                         return
1412
1413                 # Confirm age
1414                 disclaimer_form = {
1415                         'filters': '0',
1416                         'submit': "Continue - I'm over 18",
1417                         }
1418                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1419                 try:
1420                         self.report_age_confirmation()
1421                         disclaimer = urllib2.urlopen(request).read()
1422                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1423                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1424                         return
1425
1426         def _real_extract(self, url):
1427                 # Extract id and simplified title from URL
1428                 mobj = re.match(self._VALID_URL, url)
1429                 if mobj is None:
1430                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1431                         return
1432
1433                 video_id = mobj.group(1)
1434
1435                 # Check if video comes from YouTube
1436                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1437                 if mobj2 is not None:
1438                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1439                         return
1440
1441                 # At this point we have a new video
1442                 self._downloader.increment_downloads()
1443
1444                 simple_title = mobj.group(2).decode('utf-8')
1445
1446                 # Retrieve video webpage to extract further information
1447                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1448                 try:
1449                         self.report_download_webpage(video_id)
1450                         webpage = urllib2.urlopen(request).read()
1451                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1452                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1453                         return
1454
1455                 # Extract URL, uploader and title from webpage
1456                 self.report_extraction(video_id)
1457                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1458                 if mobj is not None:
1459                         mediaURL = urllib.unquote(mobj.group(1))
1460                         video_extension = mediaURL[-3:]
1461
1462                         # Extract gdaKey if available
1463                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1464                         if mobj is None:
1465                                 video_url = mediaURL
1466                         else:
1467                                 gdaKey = mobj.group(1)
1468                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1469                 else:
1470                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1471                         if mobj is None:
1472                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1473                                 return
1474                         vardict = parse_qs(mobj.group(1))
1475                         if 'mediaData' not in vardict:
1476                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1477                                 return
1478                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1479                         if mobj is None:
1480                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1481                                 return
1482                         mediaURL = mobj.group(1).replace('\\/', '/')
1483                         video_extension = mediaURL[-3:]
1484                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1485
1486                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1487                 if mobj is None:
1488                         self._downloader.trouble(u'ERROR: unable to extract title')
1489                         return
1490                 video_title = mobj.group(1).decode('utf-8')
1491                 video_title = sanitize_title(video_title)
1492
1493                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1494                 if mobj is None:
1495                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1496                         return
1497                 video_uploader = mobj.group(1)
1498
1499                 try:
1500                         # Process video information
1501                         self._downloader.process_info({
1502                                 'id':           video_id.decode('utf-8'),
1503                                 'url':          video_url.decode('utf-8'),
1504                                 'uploader':     video_uploader.decode('utf-8'),
1505                                 'upload_date':  u'NA',
1506                                 'title':        video_title,
1507                                 'stitle':       simple_title,
1508                                 'ext':          video_extension.decode('utf-8'),
1509                                 'format':       u'NA',
1510                                 'player_url':   None,
1511                         })
1512                 except UnavailableVideoError:
1513                         self._downloader.trouble(u'\nERROR: unable to download video')
1514
1515
1516 class DailymotionIE(InfoExtractor):
1517         """Information Extractor for Dailymotion"""
1518
1519         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1520         IE_NAME = u'dailymotion'
1521
1522         def __init__(self, downloader=None):
1523                 InfoExtractor.__init__(self, downloader)
1524
1525         def report_download_webpage(self, video_id):
1526                 """Report webpage download."""
1527                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1528
1529         def report_extraction(self, video_id):
1530                 """Report information extraction."""
1531                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1532
1533         def _real_initialize(self):
1534                 return
1535
1536         def _real_extract(self, url):
1537                 # Extract id and simplified title from URL
1538                 mobj = re.match(self._VALID_URL, url)
1539                 if mobj is None:
1540                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1541                         return
1542
1543                 # At this point we have a new video
1544                 self._downloader.increment_downloads()
1545                 video_id = mobj.group(1)
1546
1547                 simple_title = mobj.group(2).decode('utf-8')
1548                 video_extension = 'flv'
1549
1550                 # Retrieve video webpage to extract further information
1551                 request = urllib2.Request(url)
1552                 request.add_header('Cookie', 'family_filter=off')
1553                 try:
1554                         self.report_download_webpage(video_id)
1555                         webpage = urllib2.urlopen(request).read()
1556                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1557                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1558                         return
1559
1560                 # Extract URL, uploader and title from webpage
1561                 self.report_extraction(video_id)
1562                 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1563                 if mobj is None:
1564                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1565                         return
1566                 sequence = urllib.unquote(mobj.group(1))
1567                 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1568                 if mobj is None:
1569                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1570                         return
1571                 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1572
1573                 # if needed add http://www.dailymotion.com/ if relative URL
1574
1575                 video_url = mediaURL
1576
1577                 mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
1578                 if mobj is None:
1579                         self._downloader.trouble(u'ERROR: unable to extract title')
1580                         return
1581                 video_title = mobj.group(1).decode('utf-8')
1582                 video_title = sanitize_title(video_title)
1583
1584                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1585                 if mobj is None:
1586                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1587                         return
1588                 video_uploader = mobj.group(1)
1589
1590                 try:
1591                         # Process video information
1592                         self._downloader.process_info({
1593                                 'id':           video_id.decode('utf-8'),
1594                                 'url':          video_url.decode('utf-8'),
1595                                 'uploader':     video_uploader.decode('utf-8'),
1596                                 'upload_date':  u'NA',
1597                                 'title':        video_title,
1598                                 'stitle':       simple_title,
1599                                 'ext':          video_extension.decode('utf-8'),
1600                                 'format':       u'NA',
1601                                 'player_url':   None,
1602                         })
1603                 except UnavailableVideoError:
1604                         self._downloader.trouble(u'\nERROR: unable to download video')
1605
1606
class GoogleIE(InfoExtractor):
	"""Information extractor for video.google.com."""

	_VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
	IE_NAME = u'video.google'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

	def _real_initialize(self):
		# No session setup needed for Google Video.
		return

	def _real_extract(self, url):
		"""Extract docid, media URL, title and description from a videoplay page."""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		video_extension = 'mp4'

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader, and title from webpage.
		# Try the mp4 download_url first; if absent, fall back to the
		# \x-escaped flv videoUrl and switch the extension accordingly.
		self.report_extraction(video_id)
		mobj = re.search(r"download_url:'([^']+)'", webpage)
		if mobj is None:
			video_extension = 'flv'
			mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1))
		# Undo the page's \x3d / \x26 escaping ('=' and '&').
		mediaURL = mediaURL.replace('\\x3d', '\x3d')
		mediaURL = mediaURL.replace('\\x26', '\x26')

		video_url = mediaURL

		mobj = re.search(r'<title>(.*)</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

		# Extract video description
		mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video description')
			return
		video_description = mobj.group(1).decode('utf-8')
		if not video_description:
			video_description = 'No description available.'

		# Extract video thumbnail. Requires a second request, so it is
		# only done when explicitly asked for via --force-thumbnail.
		if self._downloader.params.get('forcethumbnail', False):
			# NOTE(review): abs(int(video_id)) assumes docid is numeric
			# here — non-numeric docids would raise ValueError; confirm.
			request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
			try:
				webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
				return
			mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
				return
			video_thumbnail = mobj.group(1)
		else:	# we need something to pass to process_info
			video_thumbnail = ''

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	u'NA',
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1712
1713
1714 class PhotobucketIE(InfoExtractor):
1715         """Information extractor for photobucket.com."""
1716
1717         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1718         IE_NAME = u'photobucket'
1719
1720         def __init__(self, downloader=None):
1721                 InfoExtractor.__init__(self, downloader)
1722
1723         def report_download_webpage(self, video_id):
1724                 """Report webpage download."""
1725                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1726
1727         def report_extraction(self, video_id):
1728                 """Report information extraction."""
1729                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1730
1731         def _real_initialize(self):
1732                 return
1733
1734         def _real_extract(self, url):
1735                 # Extract id from URL
1736                 mobj = re.match(self._VALID_URL, url)
1737                 if mobj is None:
1738                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1739                         return
1740
1741                 # At this point we have a new video
1742                 self._downloader.increment_downloads()
1743                 video_id = mobj.group(1)
1744
1745                 video_extension = 'flv'
1746
1747                 # Retrieve video webpage to extract further information
1748                 request = urllib2.Request(url)
1749                 try:
1750                         self.report_download_webpage(video_id)
1751                         webpage = urllib2.urlopen(request).read()
1752                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1753                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1754                         return
1755
1756                 # Extract URL, uploader, and title from webpage
1757                 self.report_extraction(video_id)
1758                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1759                 if mobj is None:
1760                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1761                         return
1762                 mediaURL = urllib.unquote(mobj.group(1))
1763
1764                 video_url = mediaURL
1765
1766                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1767                 if mobj is None:
1768                         self._downloader.trouble(u'ERROR: unable to extract title')
1769                         return
1770                 video_title = mobj.group(1).decode('utf-8')
1771                 video_title = sanitize_title(video_title)
1772                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1773
1774                 video_uploader = mobj.group(2).decode('utf-8')
1775
1776                 try:
1777                         # Process video information
1778                         self._downloader.process_info({
1779                                 'id':           video_id.decode('utf-8'),
1780                                 'url':          video_url.decode('utf-8'),
1781                                 'uploader':     video_uploader,
1782                                 'upload_date':  u'NA',
1783                                 'title':        video_title,
1784                                 'stitle':       simple_title,
1785                                 'ext':          video_extension.decode('utf-8'),
1786                                 'format':       u'NA',
1787                                 'player_url':   None,
1788                         })
1789                 except UnavailableVideoError:
1790                         self._downloader.trouble(u'\nERROR: unable to download video')
1791
1792
1793 class YahooIE(InfoExtractor):
1794         """Information extractor for video.yahoo.com."""
1795
1796         # _VALID_URL matches all Yahoo! Video URLs
1797         # _VPAGE_URL matches only the extractable '/watch/' URLs
1798         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1799         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1800         IE_NAME = u'video.yahoo'
1801
1802         def __init__(self, downloader=None):
1803                 InfoExtractor.__init__(self, downloader)
1804
1805         def report_download_webpage(self, video_id):
1806                 """Report webpage download."""
1807                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1808
1809         def report_extraction(self, video_id):
1810                 """Report information extraction."""
1811                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1812
1813         def _real_initialize(self):
1814                 return
1815
1816         def _real_extract(self, url, new_video=True):
1817                 # Extract ID from URL
1818                 mobj = re.match(self._VALID_URL, url)
1819                 if mobj is None:
1820                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1821                         return
1822
1823                 # At this point we have a new video
1824                 self._downloader.increment_downloads()
1825                 video_id = mobj.group(2)
1826                 video_extension = 'flv'
1827
1828                 # Rewrite valid but non-extractable URLs as
1829                 # extractable English language /watch/ URLs
1830                 if re.match(self._VPAGE_URL, url) is None:
1831                         request = urllib2.Request(url)
1832                         try:
1833                                 webpage = urllib2.urlopen(request).read()
1834                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1835                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1836                                 return
1837
1838                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1839                         if mobj is None:
1840                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1841                                 return
1842                         yahoo_id = mobj.group(1)
1843
1844                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1845                         if mobj is None:
1846                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1847                                 return
1848                         yahoo_vid = mobj.group(1)
1849
1850                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1851                         return self._real_extract(url, new_video=False)
1852
1853                 # Retrieve video webpage to extract further information
1854                 request = urllib2.Request(url)
1855                 try:
1856                         self.report_download_webpage(video_id)
1857                         webpage = urllib2.urlopen(request).read()
1858                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1859                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1860                         return
1861
1862                 # Extract uploader and title from webpage
1863                 self.report_extraction(video_id)
1864                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1865                 if mobj is None:
1866                         self._downloader.trouble(u'ERROR: unable to extract video title')
1867                         return
1868                 video_title = mobj.group(1).decode('utf-8')
1869                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1870
1871                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1872                 if mobj is None:
1873                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1874                         return
1875                 video_uploader = mobj.group(1).decode('utf-8')
1876
1877                 # Extract video thumbnail
1878                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1879                 if mobj is None:
1880                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1881                         return
1882                 video_thumbnail = mobj.group(1).decode('utf-8')
1883
1884                 # Extract video description
1885                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1886                 if mobj is None:
1887                         self._downloader.trouble(u'ERROR: unable to extract video description')
1888                         return
1889                 video_description = mobj.group(1).decode('utf-8')
1890                 if not video_description:
1891                         video_description = 'No description available.'
1892
1893                 # Extract video height and width
1894                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1895                 if mobj is None:
1896                         self._downloader.trouble(u'ERROR: unable to extract video height')
1897                         return
1898                 yv_video_height = mobj.group(1)
1899
1900                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1901                 if mobj is None:
1902                         self._downloader.trouble(u'ERROR: unable to extract video width')
1903                         return
1904                 yv_video_width = mobj.group(1)
1905
1906                 # Retrieve video playlist to extract media URL
1907                 # I'm not completely sure what all these options are, but we
1908                 # seem to need most of them, otherwise the server sends a 401.
1909                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1910                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1911                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1912                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1913                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1914                 try:
1915                         self.report_download_webpage(video_id)
1916                         webpage = urllib2.urlopen(request).read()
1917                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1918                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1919                         return
1920
1921                 # Extract media URL from playlist XML
1922                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1923                 if mobj is None:
1924                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1925                         return
1926                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1927                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1928
1929                 try:
1930                         # Process video information
1931                         self._downloader.process_info({
1932                                 'id':           video_id.decode('utf-8'),
1933                                 'url':          video_url,
1934                                 'uploader':     video_uploader,
1935                                 'upload_date':  u'NA',
1936                                 'title':        video_title,
1937                                 'stitle':       simple_title,
1938                                 'ext':          video_extension.decode('utf-8'),
1939                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1940                                 'description':  video_description,
1941                                 'thumbnail':    video_thumbnail,
1942                                 'player_url':   None,
1943                         })
1944                 except UnavailableVideoError:
1945                         self._downloader.trouble(u'\nERROR: unable to download video')
1946
1947
1948 class VimeoIE(InfoExtractor):
1949         """Information extractor for vimeo.com."""
1950
1951         # _VALID_URL matches Vimeo URLs
1952         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1953         IE_NAME = u'vimeo'
1954
1955         def __init__(self, downloader=None):
1956                 InfoExtractor.__init__(self, downloader)
1957
1958         def report_download_webpage(self, video_id):
1959                 """Report webpage download."""
1960                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1961
1962         def report_extraction(self, video_id):
1963                 """Report information extraction."""
1964                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1965
1966         def _real_initialize(self):
1967                 return
1968
1969         def _real_extract(self, url, new_video=True):
1970                 # Extract ID from URL
1971                 mobj = re.match(self._VALID_URL, url)
1972                 if mobj is None:
1973                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1974                         return
1975
1976                 # At this point we have a new video
1977                 self._downloader.increment_downloads()
1978                 video_id = mobj.group(1)
1979
1980                 # Retrieve video webpage to extract further information
1981                 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1982                 try:
1983                         self.report_download_webpage(video_id)
1984                         webpage = urllib2.urlopen(request).read()
1985                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1986                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1987                         return
1988
1989                 # Now we begin extracting as much information as we can from what we
1990                 # retrieved. First we extract the information common to all extractors,
1991                 # and latter we extract those that are Vimeo specific.
1992                 self.report_extraction(video_id)
1993
1994                 # Extract title
1995                 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
1996                 if mobj is None:
1997                         self._downloader.trouble(u'ERROR: unable to extract video title')
1998                         return
1999                 video_title = mobj.group(1).decode('utf-8')
2000                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2001
2002                 # Extract uploader
2003                 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2004                 if mobj is None:
2005                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
2006                         return
2007                 video_uploader = mobj.group(1).decode('utf-8')
2008
2009                 # Extract video thumbnail
2010                 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2011                 if mobj is None:
2012                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2013                         return
2014                 video_thumbnail = mobj.group(1).decode('utf-8')
2015
2016                 # # Extract video description
2017                 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2018                 # if mobj is None:
2019                 #       self._downloader.trouble(u'ERROR: unable to extract video description')
2020                 #       return
2021                 # video_description = mobj.group(1).decode('utf-8')
2022                 # if not video_description: video_description = 'No description available.'
2023                 video_description = 'Foo.'
2024
2025                 # Vimeo specific: extract request signature
2026                 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2027                 if mobj is None:
2028                         self._downloader.trouble(u'ERROR: unable to extract request signature')
2029                         return
2030                 sig = mobj.group(1).decode('utf-8')
2031
2032                 # Vimeo specific: Extract request signature expiration
2033                 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2034                 if mobj is None:
2035                         self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2036                         return
2037                 sig_exp = mobj.group(1).decode('utf-8')
2038
2039                 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2040
2041                 try:
2042                         # Process video information
2043                         self._downloader.process_info({
2044                                 'id':           video_id.decode('utf-8'),
2045                                 'url':          video_url,
2046                                 'uploader':     video_uploader,
2047                                 'upload_date':  u'NA',
2048                                 'title':        video_title,
2049                                 'stitle':       simple_title,
2050                                 'ext':          u'mp4',
2051                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2052                                 'description':  video_description,
2053                                 'thumbnail':    video_thumbnail,
2054                                 'description':  video_description,
2055                                 'player_url':   None,
2056                         })
2057                 except UnavailableVideoError:
2058                         self._downloader.trouble(u'ERROR: unable to download video')
2059
2060
2061 class GenericIE(InfoExtractor):
2062         """Generic last-resort information extractor."""
2063
2064         _VALID_URL = r'.*'
2065         IE_NAME = u'generic'
2066
2067         def __init__(self, downloader=None):
2068                 InfoExtractor.__init__(self, downloader)
2069
2070         def report_download_webpage(self, video_id):
2071                 """Report webpage download."""
2072                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2073                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2074
2075         def report_extraction(self, video_id):
2076                 """Report information extraction."""
2077                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2078
2079         def _real_initialize(self):
2080                 return
2081
2082         def _real_extract(self, url):
2083                 # At this point we have a new video
2084                 self._downloader.increment_downloads()
2085
2086                 video_id = url.split('/')[-1]
2087                 request = urllib2.Request(url)
2088                 try:
2089                         self.report_download_webpage(video_id)
2090                         webpage = urllib2.urlopen(request).read()
2091                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2092                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2093                         return
2094                 except ValueError, err:
2095                         # since this is the last-resort InfoExtractor, if
2096                         # this error is thrown, it'll be thrown here
2097                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2098                         return
2099
2100                 self.report_extraction(video_id)
2101                 # Start with something easy: JW Player in SWFObject
2102                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2103                 if mobj is None:
2104                         # Broaden the search a little bit
2105                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2106                 if mobj is None:
2107                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2108                         return
2109
2110                 # It's possible that one of the regexes
2111                 # matched, but returned an empty group:
2112                 if mobj.group(1) is None:
2113                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2114                         return
2115
2116                 video_url = urllib.unquote(mobj.group(1))
2117                 video_id = os.path.basename(video_url)
2118
2119                 # here's a fun little line of code for you:
2120                 video_extension = os.path.splitext(video_id)[1][1:]
2121                 video_id = os.path.splitext(video_id)[0]
2122
2123                 # it's tempting to parse this further, but you would
2124                 # have to take into account all the variations like
2125                 #   Video Title - Site Name
2126                 #   Site Name | Video Title
2127                 #   Video Title - Tagline | Site Name
2128                 # and so on and so forth; it's just not practical
2129                 mobj = re.search(r'<title>(.*)</title>', webpage)
2130                 if mobj is None:
2131                         self._downloader.trouble(u'ERROR: unable to extract title')
2132                         return
2133                 video_title = mobj.group(1).decode('utf-8')
2134                 video_title = sanitize_title(video_title)
2135                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2136
2137                 # video uploader is domain name
2138                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2139                 if mobj is None:
2140                         self._downloader.trouble(u'ERROR: unable to extract title')
2141                         return
2142                 video_uploader = mobj.group(1).decode('utf-8')
2143
2144                 try:
2145                         # Process video information
2146                         self._downloader.process_info({
2147                                 'id':           video_id.decode('utf-8'),
2148                                 'url':          video_url.decode('utf-8'),
2149                                 'uploader':     video_uploader,
2150                                 'upload_date':  u'NA',
2151                                 'title':        video_title,
2152                                 'stitle':       simple_title,
2153                                 'ext':          video_extension.decode('utf-8'),
2154                                 'format':       u'NA',
2155                                 'player_url':   None,
2156                         })
2157                 except UnavailableVideoError, err:
2158                         self._downloader.trouble(u'\nERROR: unable to download video')
2159
2160
2161 class YoutubeSearchIE(InfoExtractor):
2162         """Information Extractor for YouTube search queries."""
2163         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2164         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2165         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2166         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2167         _youtube_ie = None
2168         _max_youtube_results = 1000
2169         IE_NAME = u'youtube:search'
2170
2171         def __init__(self, youtube_ie, downloader=None):
2172                 InfoExtractor.__init__(self, downloader)
2173                 self._youtube_ie = youtube_ie
2174
2175         def report_download_page(self, query, pagenum):
2176                 """Report attempt to download playlist page with given number."""
2177                 query = query.decode(preferredencoding())
2178                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2179
2180         def _real_initialize(self):
2181                 self._youtube_ie.initialize()
2182
2183         def _real_extract(self, query):
2184                 mobj = re.match(self._VALID_URL, query)
2185                 if mobj is None:
2186                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2187                         return
2188
2189                 prefix, query = query.split(':')
2190                 prefix = prefix[8:]
2191                 query = query.encode('utf-8')
2192                 if prefix == '':
2193                         self._download_n_results(query, 1)
2194                         return
2195                 elif prefix == 'all':
2196                         self._download_n_results(query, self._max_youtube_results)
2197                         return
2198                 else:
2199                         try:
2200                                 n = long(prefix)
2201                                 if n <= 0:
2202                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2203                                         return
2204                                 elif n > self._max_youtube_results:
2205                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2206                                         n = self._max_youtube_results
2207                                 self._download_n_results(query, n)
2208                                 return
2209                         except ValueError: # parsing prefix as integer fails
2210                                 self._download_n_results(query, 1)
2211                                 return
2212
2213         def _download_n_results(self, query, n):
2214                 """Downloads a specified number of results for a query"""
2215
2216                 video_ids = []
2217                 already_seen = set()
2218                 pagenum = 1
2219
2220                 while True:
2221                         self.report_download_page(query, pagenum)
2222                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2223                         request = urllib2.Request(result_url)
2224                         try:
2225                                 page = urllib2.urlopen(request).read()
2226                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2227                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2228                                 return
2229
2230                         # Extract video identifiers
2231                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2232                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2233                                 if video_id not in already_seen:
2234                                         video_ids.append(video_id)
2235                                         already_seen.add(video_id)
2236                                         if len(video_ids) == n:
2237                                                 # Specified n videos reached
2238                                                 for id in video_ids:
2239                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2240                                                 return
2241
2242                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2243                                 for id in video_ids:
2244                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2245                                 return
2246
2247                         pagenum = pagenum + 1
2248
2249
2250 class GoogleSearchIE(InfoExtractor):
2251         """Information Extractor for Google Video search queries."""
2252         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2253         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2254         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2255         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2256         _google_ie = None
2257         _max_google_results = 1000
2258         IE_NAME = u'video.google:search'
2259
2260         def __init__(self, google_ie, downloader=None):
2261                 InfoExtractor.__init__(self, downloader)
2262                 self._google_ie = google_ie
2263
2264         def report_download_page(self, query, pagenum):
2265                 """Report attempt to download playlist page with given number."""
2266                 query = query.decode(preferredencoding())
2267                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2268
2269         def _real_initialize(self):
2270                 self._google_ie.initialize()
2271
2272         def _real_extract(self, query):
2273                 mobj = re.match(self._VALID_URL, query)
2274                 if mobj is None:
2275                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2276                         return
2277
2278                 prefix, query = query.split(':')
2279                 prefix = prefix[8:]
2280                 query = query.encode('utf-8')
2281                 if prefix == '':
2282                         self._download_n_results(query, 1)
2283                         return
2284                 elif prefix == 'all':
2285                         self._download_n_results(query, self._max_google_results)
2286                         return
2287                 else:
2288                         try:
2289                                 n = long(prefix)
2290                                 if n <= 0:
2291                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2292                                         return
2293                                 elif n > self._max_google_results:
2294                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2295                                         n = self._max_google_results
2296                                 self._download_n_results(query, n)
2297                                 return
2298                         except ValueError: # parsing prefix as integer fails
2299                                 self._download_n_results(query, 1)
2300                                 return
2301
2302         def _download_n_results(self, query, n):
2303                 """Downloads a specified number of results for a query"""
2304
2305                 video_ids = []
2306                 already_seen = set()
2307                 pagenum = 1
2308
2309                 while True:
2310                         self.report_download_page(query, pagenum)
2311                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2312                         request = urllib2.Request(result_url)
2313                         try:
2314                                 page = urllib2.urlopen(request).read()
2315                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2316                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2317                                 return
2318
2319                         # Extract video identifiers
2320                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2321                                 video_id = mobj.group(1)
2322                                 if video_id not in already_seen:
2323                                         video_ids.append(video_id)
2324                                         already_seen.add(video_id)
2325                                         if len(video_ids) == n:
2326                                                 # Specified n videos reached
2327                                                 for id in video_ids:
2328                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2329                                                 return
2330
2331                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2332                                 for id in video_ids:
2333                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2334                                 return
2335
2336                         pagenum = pagenum + 1
2337
2338
2339 class YahooSearchIE(InfoExtractor):
2340         """Information Extractor for Yahoo! Video search queries."""
2341         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2342         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2343         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2344         _MORE_PAGES_INDICATOR = r'\s*Next'
2345         _yahoo_ie = None
2346         _max_yahoo_results = 1000
2347         IE_NAME = u'video.yahoo:search'
2348
2349         def __init__(self, yahoo_ie, downloader=None):
2350                 InfoExtractor.__init__(self, downloader)
2351                 self._yahoo_ie = yahoo_ie
2352
2353         def report_download_page(self, query, pagenum):
2354                 """Report attempt to download playlist page with given number."""
2355                 query = query.decode(preferredencoding())
2356                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2357
	def _real_initialize(self):
		# Delegate initialization to the wrapped YahooIE instance,
		# which performs the actual extraction of found videos.
		self._yahoo_ie.initialize()
2360
2361         def _real_extract(self, query):
2362                 mobj = re.match(self._VALID_URL, query)
2363                 if mobj is None:
2364                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2365                         return
2366
2367                 prefix, query = query.split(':')
2368                 prefix = prefix[8:]
2369                 query = query.encode('utf-8')
2370                 if prefix == '':
2371                         self._download_n_results(query, 1)
2372                         return
2373                 elif prefix == 'all':
2374                         self._download_n_results(query, self._max_yahoo_results)
2375                         return
2376                 else:
2377                         try:
2378                                 n = long(prefix)
2379                                 if n <= 0:
2380                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2381                                         return
2382                                 elif n > self._max_yahoo_results:
2383                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2384                                         n = self._max_yahoo_results
2385                                 self._download_n_results(query, n)
2386                                 return
2387                         except ValueError: # parsing prefix as integer fails
2388                                 self._download_n_results(query, 1)
2389                                 return
2390
2391         def _download_n_results(self, query, n):
2392                 """Downloads a specified number of results for a query"""
2393
2394                 video_ids = []
2395                 already_seen = set()
2396                 pagenum = 1
2397
2398                 while True:
2399                         self.report_download_page(query, pagenum)
2400                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2401                         request = urllib2.Request(result_url)
2402                         try:
2403                                 page = urllib2.urlopen(request).read()
2404                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2405                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2406                                 return
2407
2408                         # Extract video identifiers
2409                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2410                                 video_id = mobj.group(1)
2411                                 if video_id not in already_seen:
2412                                         video_ids.append(video_id)
2413                                         already_seen.add(video_id)
2414                                         if len(video_ids) == n:
2415                                                 # Specified n videos reached
2416                                                 for id in video_ids:
2417                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2418                                                 return
2419
2420                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2421                                 for id in video_ids:
2422                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2423                                 return
2424
2425                         pagenum = pagenum + 1
2426
2427
2428 class YoutubePlaylistIE(InfoExtractor):
2429         """Information Extractor for YouTube playlists."""
2430
2431         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2432         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2433         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2434         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2435         _youtube_ie = None
2436         IE_NAME = u'youtube:playlist'
2437
2438         def __init__(self, youtube_ie, downloader=None):
2439                 InfoExtractor.__init__(self, downloader)
2440                 self._youtube_ie = youtube_ie
2441
2442         def report_download_page(self, playlist_id, pagenum):
2443                 """Report attempt to download playlist page with given number."""
2444                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2445
2446         def _real_initialize(self):
2447                 self._youtube_ie.initialize()
2448
2449         def _real_extract(self, url):
2450                 # Extract playlist id
2451                 mobj = re.match(self._VALID_URL, url)
2452                 if mobj is None:
2453                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2454                         return
2455
2456                 # Single video case
2457                 if mobj.group(3) is not None:
2458                         self._youtube_ie.extract(mobj.group(3))
2459                         return
2460
2461                 # Download playlist pages
2462                 # prefix is 'p' as default for playlists but there are other types that need extra care
2463                 playlist_prefix = mobj.group(1)
2464                 if playlist_prefix == 'a':
2465                         playlist_access = 'artist'
2466                 else:
2467                         playlist_prefix = 'p'
2468                         playlist_access = 'view_play_list'
2469                 playlist_id = mobj.group(2)
2470                 video_ids = []
2471                 pagenum = 1
2472
2473                 while True:
2474                         self.report_download_page(playlist_id, pagenum)
2475                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2476                         try:
2477                                 page = urllib2.urlopen(request).read()
2478                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2479                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2480                                 return
2481
2482                         # Extract video identifiers
2483                         ids_in_page = []
2484                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2485                                 if mobj.group(1) not in ids_in_page:
2486                                         ids_in_page.append(mobj.group(1))
2487                         video_ids.extend(ids_in_page)
2488
2489                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2490                                 break
2491                         pagenum = pagenum + 1
2492
2493                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2494                 playlistend = self._downloader.params.get('playlistend', -1)
2495                 video_ids = video_ids[playliststart:playlistend]
2496
2497                 for id in video_ids:
2498                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2499                 return
2500
2501
2502 class YoutubeUserIE(InfoExtractor):
2503         """Information Extractor for YouTube users."""
2504
2505         _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2506         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2507         _GDATA_PAGE_SIZE = 50
2508         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2509         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2510         _youtube_ie = None
2511         IE_NAME = u'youtube:user'
2512
2513         def __init__(self, youtube_ie, downloader=None):
2514                 InfoExtractor.__init__(self, downloader)
2515                 self._youtube_ie = youtube_ie
2516
2517         def report_download_page(self, username, start_index):
2518                 """Report attempt to download user page."""
2519                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2520                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2521
2522         def _real_initialize(self):
2523                 self._youtube_ie.initialize()
2524
2525         def _real_extract(self, url):
2526                 # Extract username
2527                 mobj = re.match(self._VALID_URL, url)
2528                 if mobj is None:
2529                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2530                         return
2531
2532                 username = mobj.group(1)
2533
2534                 # Download video ids using YouTube Data API. Result size per
2535                 # query is limited (currently to 50 videos) so we need to query
2536                 # page by page until there are no video ids - it means we got
2537                 # all of them.
2538
2539                 video_ids = []
2540                 pagenum = 0
2541
2542                 while True:
2543                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2544                         self.report_download_page(username, start_index)
2545
2546                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2547
2548                         try:
2549                                 page = urllib2.urlopen(request).read()
2550                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2551                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2552                                 return
2553
2554                         # Extract video identifiers
2555                         ids_in_page = []
2556
2557                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2558                                 if mobj.group(1) not in ids_in_page:
2559                                         ids_in_page.append(mobj.group(1))
2560
2561                         video_ids.extend(ids_in_page)
2562
2563                         # A little optimization - if current page is not
2564                         # "full", ie. does not contain PAGE_SIZE video ids then
2565                         # we can assume that this page is the last one - there
2566                         # are no more ids on further pages - no need to query
2567                         # again.
2568
2569                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2570                                 break
2571
2572                         pagenum += 1
2573
2574                 all_ids_count = len(video_ids)
2575                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2576                 playlistend = self._downloader.params.get('playlistend', -1)
2577
2578                 if playlistend == -1:
2579                         video_ids = video_ids[playliststart:]
2580                 else:
2581                         video_ids = video_ids[playliststart:playlistend]
2582
2583                 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2584                                 (username, all_ids_count, len(video_ids)))
2585
2586                 for video_id in video_ids:
2587                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2588
2589
2590 class DepositFilesIE(InfoExtractor):
2591         """Information extractor for depositfiles.com"""
2592
2593         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2594         IE_NAME = u'DepositFiles'
2595
2596         def __init__(self, downloader=None):
2597                 InfoExtractor.__init__(self, downloader)
2598
2599         def report_download_webpage(self, file_id):
2600                 """Report webpage download."""
2601                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2602
2603         def report_extraction(self, file_id):
2604                 """Report information extraction."""
2605                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2606
2607         def _real_initialize(self):
2608                 return
2609
2610         def _real_extract(self, url):
2611                 # At this point we have a new file
2612                 self._downloader.increment_downloads()
2613
2614                 file_id = url.split('/')[-1]
2615                 # Rebuild url in english locale
2616                 url = 'http://depositfiles.com/en/files/' + file_id
2617
2618                 # Retrieve file webpage with 'Free download' button pressed
2619                 free_download_indication = { 'gateway_result' : '1' }
2620                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2621                 try:
2622                         self.report_download_webpage(file_id)
2623                         webpage = urllib2.urlopen(request).read()
2624                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2625                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2626                         return
2627
2628                 # Search for the real file URL
2629                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2630                 if (mobj is None) or (mobj.group(1) is None):
2631                         # Try to figure out reason of the error.
2632                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2633                         if (mobj is not None) and (mobj.group(1) is not None):
2634                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2635                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2636                         else:
2637                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2638                         return
2639
2640                 file_url = mobj.group(1)
2641                 file_extension = os.path.splitext(file_url)[1][1:]
2642
2643                 # Search for file title
2644                 mobj = re.search(r'<b title="(.*?)">', webpage)
2645                 if mobj is None:
2646                         self._downloader.trouble(u'ERROR: unable to extract title')
2647                         return
2648                 file_title = mobj.group(1).decode('utf-8')
2649
2650                 try:
2651                         # Process file information
2652                         self._downloader.process_info({
2653                                 'id':           file_id.decode('utf-8'),
2654                                 'url':          file_url.decode('utf-8'),
2655                                 'uploader':     u'NA',
2656                                 'upload_date':  u'NA',
2657                                 'title':        file_title,
2658                                 'stitle':       file_title,
2659                                 'ext':          file_extension.decode('utf-8'),
2660                                 'format':       u'NA',
2661                                 'player_url':   None,
2662                         })
2663                 except UnavailableVideoError, err:
2664                         self._downloader.trouble(u'ERROR: unable to download file')
2665
2666
2667 class FacebookIE(InfoExtractor):
2668         """Information Extractor for Facebook"""
2669
2670         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2671         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2672         _NETRC_MACHINE = 'facebook'
2673         _available_formats = ['highqual', 'lowqual']
2674         _video_extensions = {
2675                 'highqual': 'mp4',
2676                 'lowqual': 'mp4',
2677         }
2678         IE_NAME = u'facebook'
2679
2680         def __init__(self, downloader=None):
2681                 InfoExtractor.__init__(self, downloader)
2682
2683         def _reporter(self, message):
2684                 """Add header and report message."""
2685                 self._downloader.to_screen(u'[facebook] %s' % message)
2686
2687         def report_login(self):
2688                 """Report attempt to log in."""
2689                 self._reporter(u'Logging in')
2690
2691         def report_video_webpage_download(self, video_id):
2692                 """Report attempt to download video webpage."""
2693                 self._reporter(u'%s: Downloading video webpage' % video_id)
2694
2695         def report_information_extraction(self, video_id):
2696                 """Report attempt to extract video information."""
2697                 self._reporter(u'%s: Extracting video information' % video_id)
2698
2699         def _parse_page(self, video_webpage):
2700                 """Extract video information from page"""
2701                 # General data
2702                 data = {'title': r'class="video_title datawrap">(.*?)</',
2703                         'description': r'<div class="datawrap">(.*?)</div>',
2704                         'owner': r'\("video_owner_name", "(.*?)"\)',
2705                         'upload_date': r'data-date="(.*?)"',
2706                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2707                         }
2708                 video_info = {}
2709                 for piece in data.keys():
2710                         mobj = re.search(data[piece], video_webpage)
2711                         if mobj is not None:
2712                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2713
2714                 # Video urls
2715                 video_urls = {}
2716                 for fmt in self._available_formats:
2717                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2718                         if mobj is not None:
2719                                 # URL is in a Javascript segment inside an escaped Unicode format within
2720                                 # the generally utf-8 page
2721                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2722                 video_info['video_urls'] = video_urls
2723
2724                 return video_info
2725
2726         def _real_initialize(self):
2727                 if self._downloader is None:
2728                         return
2729
2730                 useremail = None
2731                 password = None
2732                 downloader_params = self._downloader.params
2733
2734                 # Attempt to use provided username and password or .netrc data
2735                 if downloader_params.get('username', None) is not None:
2736                         useremail = downloader_params['username']
2737                         password = downloader_params['password']
2738                 elif downloader_params.get('usenetrc', False):
2739                         try:
2740                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2741                                 if info is not None:
2742                                         useremail = info[0]
2743                                         password = info[2]
2744                                 else:
2745                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2746                         except (IOError, netrc.NetrcParseError), err:
2747                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2748                                 return
2749
2750                 if useremail is None:
2751                         return
2752
2753                 # Log in
2754                 login_form = {
2755                         'email': useremail,
2756                         'pass': password,
2757                         'login': 'Log+In'
2758                         }
2759                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2760                 try:
2761                         self.report_login()
2762                         login_results = urllib2.urlopen(request).read()
2763                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2764                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2765                                 return
2766                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2767                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2768                         return
2769
2770         def _real_extract(self, url):
2771                 mobj = re.match(self._VALID_URL, url)
2772                 if mobj is None:
2773                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2774                         return
2775                 video_id = mobj.group('ID')
2776
2777                 # Get video webpage
2778                 self.report_video_webpage_download(video_id)
2779                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2780                 try:
2781                         page = urllib2.urlopen(request)
2782                         video_webpage = page.read()
2783                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2784                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2785                         return
2786
2787                 # Start extracting information
2788                 self.report_information_extraction(video_id)
2789
2790                 # Extract information
2791                 video_info = self._parse_page(video_webpage)
2792
2793                 # uploader
2794                 if 'owner' not in video_info:
2795                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2796                         return
2797                 video_uploader = video_info['owner']
2798
2799                 # title
2800                 if 'title' not in video_info:
2801                         self._downloader.trouble(u'ERROR: unable to extract video title')
2802                         return
2803                 video_title = video_info['title']
2804                 video_title = video_title.decode('utf-8')
2805                 video_title = sanitize_title(video_title)
2806
2807                 # simplified title
2808                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2809                 simple_title = simple_title.strip(ur'_')
2810
2811                 # thumbnail image
2812                 if 'thumbnail' not in video_info:
2813                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2814                         video_thumbnail = ''
2815                 else:
2816                         video_thumbnail = video_info['thumbnail']
2817
2818                 # upload date
2819                 upload_date = u'NA'
2820                 if 'upload_date' in video_info:
2821                         upload_time = video_info['upload_date']
2822                         timetuple = email.utils.parsedate_tz(upload_time)
2823                         if timetuple is not None:
2824                                 try:
2825                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2826                                 except:
2827                                         pass
2828
2829                 # description
2830                 video_description = video_info.get('description', 'No description available.')
2831
2832                 url_map = video_info['video_urls']
2833                 if len(url_map.keys()) > 0:
2834                         # Decide which formats to download
2835                         req_format = self._downloader.params.get('format', None)
2836                         format_limit = self._downloader.params.get('format_limit', None)
2837
2838                         if format_limit is not None and format_limit in self._available_formats:
2839                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2840                         else:
2841                                 format_list = self._available_formats
2842                         existing_formats = [x for x in format_list if x in url_map]
2843                         if len(existing_formats) == 0:
2844                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2845                                 return
2846                         if req_format is None:
2847                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2848                         elif req_format == 'worst':
2849                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2850                         elif req_format == '-1':
2851                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2852                         else:
2853                                 # Specific format
2854                                 if req_format not in url_map:
2855                                         self._downloader.trouble(u'ERROR: requested format not available')
2856                                         return
2857                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2858
2859                 for format_param, video_real_url in video_url_list:
2860
2861                         # At this point we have a new video
2862                         self._downloader.increment_downloads()
2863
2864                         # Extension
2865                         video_extension = self._video_extensions.get(format_param, 'mp4')
2866
2867                         try:
2868                                 # Process video information
2869                                 self._downloader.process_info({
2870                                         'id':           video_id.decode('utf-8'),
2871                                         'url':          video_real_url.decode('utf-8'),
2872                                         'uploader':     video_uploader.decode('utf-8'),
2873                                         'upload_date':  upload_date,
2874                                         'title':        video_title,
2875                                         'stitle':       simple_title,
2876                                         'ext':          video_extension.decode('utf-8'),
2877                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2878                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2879                                         'description':  video_description.decode('utf-8'),
2880                                         'player_url':   None,
2881                                 })
2882                         except UnavailableVideoError, err:
2883                                 self._downloader.trouble(u'\nERROR: unable to download video')
2884
2885 class BlipTVIE(InfoExtractor):
2886         """Information extractor for blip.tv"""
2887
2888         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2889         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2890         IE_NAME = u'blip.tv'
2891
2892         def report_extraction(self, file_id):
2893                 """Report information extraction."""
2894                 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2895
2896         def _simplify_title(self, title):
2897                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2898                 res = res.strip(ur'_')
2899                 return res
2900
2901         def _real_extract(self, url):
2902                 mobj = re.match(self._VALID_URL, url)
2903                 if mobj is None:
2904                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2905                         return
2906
2907                 if '?' in url:
2908                         cchar = '&'
2909                 else:
2910                         cchar = '?'
2911                 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2912                 request = urllib2.Request(json_url)
2913                 self.report_extraction(mobj.group(1))
2914                 try:
2915                         json_code = urllib2.urlopen(request).read()
2916                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2917                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2918                         return
2919                 try:
2920                         json_data = json.loads(json_code)
2921                         if 'Post' in json_data:
2922                                 data = json_data['Post']
2923                         else:
2924                                 data = json_data
2925
2926                         upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2927                         video_url = data['media']['url']
2928                         umobj = re.match(self._URL_EXT, video_url)
2929                         if umobj is None:
2930                                 raise ValueError('Can not determine filename extension')
2931                         ext = umobj.group(1)
2932
2933                         self._downloader.increment_downloads()
2934
2935                         info = {
2936                                 'id': data['item_id'],
2937                                 'url': video_url,
2938                                 'uploader': data['display_name'],
2939                                 'upload_date': upload_date,
2940                                 'title': data['title'],
2941                                 'stitle': self._simplify_title(data['title']),
2942                                 'ext': ext,
2943                                 'format': data['media']['mimeType'],
2944                                 'thumbnail': data['thumbnailUrl'],
2945                                 'description': data['description'],
2946                                 'player_url': data['embedUrl']
2947                         }
2948                 except (ValueError,KeyError), err:
2949                         self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2950                         return
2951
2952                 try:
2953                         self._downloader.process_info(info)
2954                 except UnavailableVideoError, err:
2955                         self._downloader.trouble(u'\nERROR: unable to download video')
2956
2957
2958 class MyVideoIE(InfoExtractor):
2959         """Information Extractor for myvideo.de."""
2960
2961         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2962         IE_NAME = u'myvideo'
2963
	def __init__(self, downloader=None):
		# Plain InfoExtractor construction; no extractor-specific state.
		InfoExtractor.__init__(self, downloader)
2966         
2967         def report_download_webpage(self, video_id):
2968                 """Report webpage download."""
2969                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2970
2971         def report_extraction(self, video_id):
2972                 """Report information extraction."""
2973                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2974
	def _real_initialize(self):
		# myvideo.de requires no authentication or other setup.
		return
2977
2978         def _real_extract(self,url):
2979                 mobj = re.match(self._VALID_URL, url)
2980                 if mobj is None:
2981                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
2982                         return
2983
2984                 video_id = mobj.group(1)
2985                 simple_title = mobj.group(2).decode('utf-8')
2986                 # should actually not be necessary
2987                 simple_title = sanitize_title(simple_title)
2988                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title)
2989
2990                 # Get video webpage
2991                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2992                 try:
2993                         self.report_download_webpage(video_id)
2994                         webpage = urllib2.urlopen(request).read()
2995                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2996                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2997                         return
2998
2999                 self.report_extraction(video_id)
3000                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3001                                  webpage)
3002                 if mobj is None:
3003                         self._downloader.trouble(u'ERROR: unable to extract media URL')
3004                         return
3005                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3006
3007                 mobj = re.search('<title>([^<]+)</title>', webpage)
3008                 if mobj is None:
3009                         self._downloader.trouble(u'ERROR: unable to extract title')
3010                         return
3011
3012                 video_title = mobj.group(1)
3013                 video_title = sanitize_title(video_title)
3014
3015                 try:
3016                         print(video_url)
3017                         self._downloader.process_info({
3018                                 'id':           video_id,
3019                                 'url':          video_url,
3020                                 'uploader':     u'NA',
3021                                 'upload_date':  u'NA',
3022                                 'title':        video_title,
3023                                 'stitle':       simple_title,
3024                                 'ext':          u'flv',
3025                                 'format':       u'NA',
3026                                 'player_url':   None,
3027                         })
3028                 except UnavailableVideoError:
3029                         self._downloader.trouble(u'\nERROR: Unable to download video')
3030
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Accepts either a ":shortname" alias (e.g. ":tds") or a full-episodes URL
	# on thedailyshow.com / colbertnation.com. 'episode' may be empty, which
	# means "download the newest episode".
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	def report_extraction(self, episode_id):
		"""Report the start of information extraction."""
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

	def report_config_download(self, episode_id):
		"""Report the download of a per-media configuration XML."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		"""Report the download of the show's MRSS episode index."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		"""Report resolution of the Flash player URL."""
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _simplify_title(self, title):
		"""Return a filesystem-safe title (disallowed chars collapsed to '_')."""
		res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
		res = res.strip(ur'_')
		return res

	def _real_extract(self, url):
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Expand a ":shortname" alias into the show's full-episodes page and
		# re-match so the named groups below are populated.
		if mobj.group('shortname'):
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = 'http://www.thedailyshow.com/full-episodes/'
			else:
				url = 'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# An empty 'episode' group means: follow the site's redirect to the
		# newest episode.
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			return
		if dlNewest:
			# The generic page redirects to a concrete episode; take the
			# episode title from the final (post-redirect) URL.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		# Each match is (full player URL, mtvnservices URI) — group 1 and 2.
		mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
			return

		# Resolve the player URL through its redirect; the final URL is what
		# rtmpdump needs as swfUrl.
		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
			return

		# Fetch the MRSS index listing every media item of the episode.
		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
			return

		# One episode is split into several media items (acts); download each.
		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			# Per-item config XML lists the available renditions (bitrates).
			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
						urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
				return

			cdoc = xml.etree.ElementTree.fromstring(configXml)
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			# For now, just pick the highest bitrate
			format,video_url = turls[-1]

			self._downloader.increment_downloads()

			effTitle = showId + '-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'stitle': self._simplify_title(effTitle),
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': playerUrl
			}

			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				# Keep going: remaining acts of the episode may still work.
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
				continue
3170
3171
3172 class EscapistIE(InfoExtractor):
3173         """Information extractor for The Escapist """
3174
3175         _VALID_URL = r'^(https?://)?(www\.)escapistmagazine.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?].*$'
3176         IE_NAME = u'escapist'
3177
3178         def report_extraction(self, showName):
3179                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3180
3181         def report_config_download(self, showName):
3182                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3183
3184         def _simplify_title(self, title):
3185                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3186                 res = res.strip(ur'_')
3187                 return res
3188
3189         def _real_extract(self, url):
3190                 htmlParser = HTMLParser.HTMLParser()
3191
3192                 mobj = re.match(self._VALID_URL, url)
3193                 if mobj is None:
3194                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3195                         return
3196                 showName = mobj.group('showname')
3197                 videoId = mobj.group('episode')
3198
3199                 self.report_extraction(showName)
3200                 try:
3201                         webPage = urllib2.urlopen(url).read()
3202                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3203                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3204                         return
3205
3206                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3207                 description = htmlParser.unescape(descMatch.group(1))
3208                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3209                 imgUrl = htmlParser.unescape(imgMatch.group(1))
3210                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3211                 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3212                 configUrlMatch = re.search('config=(.*)$', playerUrl)
3213                 configUrl = urllib2.unquote(configUrlMatch.group(1))
3214
3215                 self.report_config_download(showName)
3216                 try:
3217                         configJSON = urllib2.urlopen(configUrl).read()
3218                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3219                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3220                         return
3221
3222                 # Technically, it's JavaScript, not JSON
3223                 configJSON = configJSON.replace("'", '"')
3224
3225                 try:
3226                         config = json.loads(configJSON)
3227                 except (ValueError,), err:
3228                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3229                         return
3230
3231                 playlist = config['playlist']
3232                 videoUrl = playlist[1]['url']
3233
3234                 self._downloader.increment_downloads()
3235                 info = {
3236                         'id': videoId,
3237                         'url': videoUrl,
3238                         'uploader': showName,
3239                         'upload_date': None,
3240                         'title': showName,
3241                         'stitle': self._simplify_title(showName),
3242                         'ext': 'flv',
3243                         'format': 'flv',
3244                         'thumbnail': imgUrl,
3245                         'description': description,
3246                         'player_url': playerUrl,
3247                 }
3248
3249                 try:
3250                         self._downloader.process_info(info)
3251                 except UnavailableVideoError, err:
3252                         self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3253
3254
3255
class PostProcessor(object):
	"""Base class for all post processors.

	A PostProcessor is attached to a downloader via the downloader's
	add_post_processor() method. After a successful download the
	downloader walks its chain of PostProcessors, invoking run() on each
	one: the first receives an initial info dictionary, and each
	subsequent one receives whatever the previous run() returned.

	Returning None from run() stops the chain; otherwise processing
	continues until the last PostProcessor has run.

	Like InfoExtractor, this class uses a "mutual registration" scheme:
	the downloader knows its post processors and each post processor
	keeps a reference back to its downloader.
	"""

	# Back-reference to the owning downloader (set on registration).
	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file.

		"information" is an InfoExtractor-style dictionary extended with
		a "filepath" key naming the downloaded file on disk.

		Return None to stop the postprocessing chain, or an information
		dictionary (possibly the same one, with fields adjusted) to pass
		along to the next PostProcessor. Implementations may also raise
		PostProcessingError, which the calling downloader handles.
		"""
		# Default behaviour: pass the information through untouched.
		return information
3301
3302
class FFmpegExtractAudioPP(PostProcessor):
	"""Post processor that extracts the audio track of a downloaded
	video into a standalone audio file using ffmpeg/ffprobe."""

	def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
		PostProcessor.__init__(self, downloader)
		# 'best' means: keep the source audio codec losslessly when possible.
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec
		self._preferredquality = preferredquality
		self._keepvideo = keepvideo

	@staticmethod
	def get_audio_codec(path):
		"""Return the audio codec name of the file at 'path' (via ffprobe),
		or None if it cannot be determined or ffprobe is unavailable."""
		try:
			cmd = ['ffprobe', '-show_streams', '--', path]
			handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
			output = handle.communicate()[0]
			if handle.wait() != 0:
				return None
		except (IOError, OSError):
			return None
		# ffprobe prints codec_name before codec_type within each stream
		# block, so the last codec_name seen belongs to the audio stream.
		audio_codec = None
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Run ffmpeg to transcode 'path' to 'out_path' with the given audio
		codec and extra options; return True on success, False otherwise."""
		try:
			cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
			ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
			return (ret == 0)
		except (IOError, OSError):
			return False

	def run(self, information):
		# PostProcessor entry point: extract the audio of
		# information['filepath'] and point 'filepath' at the new file.
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
			if filecodec == 'aac' or filecodec == 'mp3':
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					# Raw AAC needs the ADTS container to be playable.
					more_opts = ['-f', 'adts']
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = []
				if self._preferredquality is not None:
					more_opts += ['-ab', self._preferredquality]
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = []
			if self._preferredquality is not None:
				more_opts += ['-ab', self._preferredquality]
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']

		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		status = self.run_ffmpeg(path, new_path, acodec, more_opts)

		if not status:
			self._downloader.to_stderr(u'WARNING: error running ffmpeg')
			return None

		# Try to update the date time for extracted audio file.
		if information.get('filetime') is not None:
			try:
				os.utime(new_path, (time.time(), information['filetime']))
			except:
				self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

		# Remove the source video unless the user asked to keep it.
		if not self._keepvideo:
			try:
				os.remove(path)
			except (IOError, OSError):
				self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
				return None

		# Hand the audio file on to the next PostProcessor in the chain.
		information['filepath'] = new_path
		return information
3398
3399
def updateSelf(downloader, filename):
	''' Update the program file with the latest version from the repository '''
	# Note: downloader only used for options
	if not os.access(filename, os.W_OK):
		sys.exit('ERROR: no write permissions on %s' % filename)

	downloader.to_screen('Updating to latest version...')

	try:
		# Fixed: urlopen must happen BEFORE the try/finally. Previously a
		# failed urlopen left 'urlh' unbound, so the finally clause raised
		# NameError (not caught below) instead of exiting cleanly.
		urlh = urllib.urlopen(UPDATE_URL)
		try:
			newcontent = urlh.read()
		finally:
			urlh.close()
	except (IOError, OSError):
		sys.exit('ERROR: unable to download latest version')

	# Skip the rewrite when the published version matches ours.
	vmatch = re.search("__version__ = '([^']+)'", newcontent)
	if vmatch is not None and vmatch.group(1) == __version__:
		downloader.to_screen('youtube-dl is up-to-date (' + __version__ + ')')
		return

	try:
		outf = open(filename, 'wb')
		try:
			outf.write(newcontent)
		finally:
			outf.close()
	except (IOError, OSError):
		sys.exit('ERROR: unable to overwrite current version')

	downloader.to_screen('Updated youtube-dl. Restart to use the new version.')
3432
3433 def parseOpts():
3434         # Deferred imports
3435         import getpass
3436         import optparse
3437
3438         def _format_option_string(option):
3439                 ''' ('-o', '--option') -> -o, --format METAVAR'''
3440
3441                 opts = []
3442
3443                 if option._short_opts: opts.append(option._short_opts[0])
3444                 if option._long_opts: opts.append(option._long_opts[0])
3445                 if len(opts) > 1: opts.insert(1, ', ')
3446
3447                 if option.takes_value(): opts.append(' %s' % option.metavar)
3448
3449                 return "".join(opts)
3450
3451         def _find_term_columns():
3452                 columns = os.environ.get('COLUMNS', None)
3453                 if columns:
3454                         return int(columns)
3455
3456                 try:
3457                         sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
3458                         out,err = sp.communicate()
3459                         return int(out.split()[1])
3460                 except:
3461                         pass
3462                 return None
3463
3464         max_width = 80
3465         max_help_position = 80
3466
3467         # No need to wrap help messages if we're on a wide console
3468         columns = _find_term_columns()
3469         if columns: max_width = columns
3470
3471         fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
3472         fmt.format_option_strings = _format_option_string
3473
3474         kw = {
3475                 'version'   : __version__,
3476                 'formatter' : fmt,
3477                 'usage' : '%prog [options] url [url...]',
3478                 'conflict_handler' : 'resolve',
3479         }
3480
3481         parser = optparse.OptionParser(**kw)
3482
3483         # option groups
3484         general        = optparse.OptionGroup(parser, 'General Options')
3485         selection      = optparse.OptionGroup(parser, 'Video Selection')
3486         authentication = optparse.OptionGroup(parser, 'Authentication Options')
3487         video_format   = optparse.OptionGroup(parser, 'Video Format Options')
3488         postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
3489         filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
3490         verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
3491
3492         general.add_option('-h', '--help',
3493                         action='help', help='print this help text and exit')
3494         general.add_option('-v', '--version',
3495                         action='version', help='print program version and exit')
3496         general.add_option('-U', '--update',
3497                         action='store_true', dest='update_self', help='update this program to latest version')
3498         general.add_option('-i', '--ignore-errors',
3499                         action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
3500         general.add_option('-r', '--rate-limit',
3501                         dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
3502         general.add_option('-R', '--retries',
3503                         dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
3504         general.add_option('--dump-user-agent',
3505                         action='store_true', dest='dump_user_agent',
3506                         help='display the current browser identification', default=False)
3507         general.add_option('--list-extractors',
3508                         action='store_true', dest='list_extractors',
3509                         help='List all supported extractors and the URLs they would handle', default=False)
3510
3511         selection.add_option('--playlist-start',
3512                         dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
3513         selection.add_option('--playlist-end',
3514                         dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
3515         selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
3516         selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
3517
3518         authentication.add_option('-u', '--username',
3519                         dest='username', metavar='USERNAME', help='account username')
3520         authentication.add_option('-p', '--password',
3521                         dest='password', metavar='PASSWORD', help='account password')
3522         authentication.add_option('-n', '--netrc',
3523                         action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
3524
3525
3526         video_format.add_option('-f', '--format',
3527                         action='store', dest='format', metavar='FORMAT', help='video format code')
3528         video_format.add_option('--all-formats',
3529                         action='store_const', dest='format', help='download all available video formats', const='all')
3530         video_format.add_option('--max-quality',
3531                         action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
3532
3533
3534         verbosity.add_option('-q', '--quiet',
3535                         action='store_true', dest='quiet', help='activates quiet mode', default=False)
3536         verbosity.add_option('-s', '--simulate',
3537                         action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
3538         verbosity.add_option('--skip-download',
3539                         action='store_true', dest='skip_download', help='do not download the video', default=False)
3540         verbosity.add_option('-g', '--get-url',
3541                         action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
3542         verbosity.add_option('-e', '--get-title',
3543                         action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
3544         verbosity.add_option('--get-thumbnail',
3545                         action='store_true', dest='getthumbnail',
3546                         help='simulate, quiet but print thumbnail URL', default=False)
3547         verbosity.add_option('--get-description',
3548                         action='store_true', dest='getdescription',
3549                         help='simulate, quiet but print video description', default=False)
3550         verbosity.add_option('--get-filename',
3551                         action='store_true', dest='getfilename',
3552                         help='simulate, quiet but print output filename', default=False)
3553         verbosity.add_option('--get-format',
3554                         action='store_true', dest='getformat',
3555                         help='simulate, quiet but print output format', default=False)
3556         verbosity.add_option('--no-progress',
3557                         action='store_true', dest='noprogress', help='do not print progress bar', default=False)
3558         verbosity.add_option('--console-title',
3559                         action='store_true', dest='consoletitle',
3560                         help='display progress in console titlebar', default=False)
3561
3562
3563         filesystem.add_option('-t', '--title',
3564                         action='store_true', dest='usetitle', help='use title in file name', default=False)
3565         filesystem.add_option('-l', '--literal',
3566                         action='store_true', dest='useliteral', help='use literal title in file name', default=False)
3567         filesystem.add_option('-A', '--auto-number',
3568                         action='store_true', dest='autonumber',
3569                         help='number downloaded files starting from 00000', default=False)
3570         filesystem.add_option('-o', '--output',
3571                         dest='outtmpl', metavar='TEMPLATE', help='output filename template')
3572         filesystem.add_option('-a', '--batch-file',
3573                         dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
3574         filesystem.add_option('-w', '--no-overwrites',
3575                         action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
3576         filesystem.add_option('-c', '--continue',
3577                         action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
3578         filesystem.add_option('--no-continue',
3579                         action='store_false', dest='continue_dl',
3580                         help='do not resume partially downloaded files (restart from beginning)')
3581         filesystem.add_option('--cookies',
3582                         dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
3583         filesystem.add_option('--no-part',
3584                         action='store_true', dest='nopart', help='do not use .part files', default=False)
3585         filesystem.add_option('--no-mtime',
3586                         action='store_false', dest='updatetime',
3587                         help='do not use the Last-modified header to set the file modification time', default=True)
3588         filesystem.add_option('--write-description',
3589                         action='store_true', dest='writedescription',
3590                         help='write video description to a .description file', default=False)
3591         filesystem.add_option('--write-info-json',
3592                         action='store_true', dest='writeinfojson',
3593                         help='write video metadata to a .info.json file', default=False)
3594
3595
3596         postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
3597                         help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
3598         postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
3599                         help='"best", "aac" or "mp3"; best by default')
3600         postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
3601                         help='ffmpeg audio bitrate specification, 128k by default')
3602         postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
3603                         help='keeps the video file on disk after the post-processing; the video is erased by default')
3604
3605
3606         parser.add_option_group(general)
3607         parser.add_option_group(selection)
3608         parser.add_option_group(filesystem)
3609         parser.add_option_group(verbosity)
3610         parser.add_option_group(video_format)
3611         parser.add_option_group(authentication)
3612         parser.add_option_group(postproc)
3613
3614         opts, args = parser.parse_args()
3615
3616         return parser, opts, args
3617
def gen_extractors():
	""" Return a list of an instance of every supported extractor.
	The order does matter; the first extractor matched is the one handling the URL.
	"""
	# A few extractors delegate part of their work to a shared instance
	# of another extractor, so those are created up front.
	yt = YoutubeIE()
	goog = GoogleIE()
	yah = YahooIE()
	extractors = [
		yt,
		MetacafeIE(yt),
		DailymotionIE(),
		YoutubePlaylistIE(yt),
		YoutubeUserIE(yt),
		YoutubeSearchIE(yt),
		goog,
		GoogleSearchIE(goog),
		PhotobucketIE(),
		yah,
		YahooSearchIE(yah),
		DepositFilesIE(),
		FacebookIE(),
		BlipTVIE(),
		VimeoIE(),
		MyVideoIE(),
		ComedyCentralIE(),
		EscapistIE(),
	]
	# The generic extractor is the catch-all and must stay last.
	extractors.append(GenericIE())
	return extractors
3647
3648 def main():
3649         parser, opts, args = parseOpts()
3650
3651         # Open appropriate CookieJar
3652         if opts.cookiefile is None:
3653                 jar = cookielib.CookieJar()
3654         else:
3655                 try:
3656                         jar = cookielib.MozillaCookieJar(opts.cookiefile)
3657                         if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3658                                 jar.load()
3659                 except (IOError, OSError), err:
3660                         sys.exit(u'ERROR: unable to open cookie file')
3661
3662         # Dump user agent
3663         if opts.dump_user_agent:
3664                 print std_headers['User-Agent']
3665                 sys.exit(0)
3666
3667         # Batch file verification
3668         batchurls = []
3669         if opts.batchfile is not None:
3670                 try:
3671                         if opts.batchfile == '-':
3672                                 batchfd = sys.stdin
3673                         else:
3674                                 batchfd = open(opts.batchfile, 'r')
3675                         batchurls = batchfd.readlines()
3676                         batchurls = [x.strip() for x in batchurls]
3677                         batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3678                 except IOError:
3679                         sys.exit(u'ERROR: batch file could not be read')
3680         all_urls = batchurls + args
3681
3682         # General configuration
3683         cookie_processor = urllib2.HTTPCookieProcessor(jar)
3684         opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
3685         urllib2.install_opener(opener)
3686         socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3687
3688         extractors = gen_extractors()
3689
3690         if opts.list_extractors:
3691                 for ie in extractors:
3692                         print(ie.IE_NAME)
3693                         matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
3694                         all_urls = filter(lambda url: url not in matchedUrls, all_urls)
3695                         for mu in matchedUrls:
3696                                 print(u'  ' + mu)
3697                 sys.exit(0)
3698
3699         # Conflicting, missing and erroneous options
3700         if opts.usenetrc and (opts.username is not None or opts.password is not None):
3701                 parser.error(u'using .netrc conflicts with giving username/password')
3702         if opts.password is not None and opts.username is None:
3703                 parser.error(u'account username missing')
3704         if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3705                 parser.error(u'using output template conflicts with using title, literal title or auto number')
3706         if opts.usetitle and opts.useliteral:
3707                 parser.error(u'using title conflicts with using literal title')
3708         if opts.username is not None and opts.password is None:
3709                 opts.password = getpass.getpass(u'Type account password and press return:')
3710         if opts.ratelimit is not None:
3711                 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3712                 if numeric_limit is None:
3713                         parser.error(u'invalid rate limit specified')
3714                 opts.ratelimit = numeric_limit
3715         if opts.retries is not None:
3716                 try:
3717                         opts.retries = long(opts.retries)
3718                 except (TypeError, ValueError), err:
3719                         parser.error(u'invalid retry count specified')
3720         try:
3721                 opts.playliststart = int(opts.playliststart)
3722                 if opts.playliststart <= 0:
3723                         raise ValueError(u'Playlist start must be positive')
3724         except (TypeError, ValueError), err:
3725                 parser.error(u'invalid playlist start number specified')
3726         try:
3727                 opts.playlistend = int(opts.playlistend)
3728                 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3729                         raise ValueError(u'Playlist end must be greater than playlist start')
3730         except (TypeError, ValueError), err:
3731                 parser.error(u'invalid playlist end number specified')
3732         if opts.extractaudio:
3733                 if opts.audioformat not in ['best', 'aac', 'mp3']:
3734                         parser.error(u'invalid audio format specified')
3735
3736         # File downloader
3737         fd = FileDownloader({
3738                 'usenetrc': opts.usenetrc,
3739                 'username': opts.username,
3740                 'password': opts.password,
3741                 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
3742                 'forceurl': opts.geturl,
3743                 'forcetitle': opts.gettitle,
3744                 'forcethumbnail': opts.getthumbnail,
3745                 'forcedescription': opts.getdescription,
3746                 'forcefilename': opts.getfilename,
3747                 'forceformat': opts.getformat,
3748                 'simulate': opts.simulate,
3749                 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
3750                 'format': opts.format,
3751                 'format_limit': opts.format_limit,
3752                 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3753                         or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3754                         or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3755                         or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3756                         or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3757                         or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3758                         or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3759                         or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3760                         or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3761                         or u'%(id)s.%(ext)s'),
3762                 'ignoreerrors': opts.ignoreerrors,
3763                 'ratelimit': opts.ratelimit,
3764                 'nooverwrites': opts.nooverwrites,
3765                 'retries': opts.retries,
3766                 'continuedl': opts.continue_dl,
3767                 'noprogress': opts.noprogress,
3768                 'playliststart': opts.playliststart,
3769                 'playlistend': opts.playlistend,
3770                 'logtostderr': opts.outtmpl == '-',
3771                 'consoletitle': opts.consoletitle,
3772                 'nopart': opts.nopart,
3773                 'updatetime': opts.updatetime,
3774                 'writedescription': opts.writedescription,
3775                 'writeinfojson': opts.writeinfojson,
3776                 'matchtitle': opts.matchtitle,
3777                 'rejecttitle': opts.rejecttitle,
3778                 })
3779         for extractor in extractors:
3780                 fd.add_info_extractor(extractor)
3781
3782         # PostProcessors
3783         if opts.extractaudio:
3784                 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
3785
3786         # Update version
3787         if opts.update_self:
3788                 updateSelf(fd, sys.argv[0])
3789
3790         # Maybe do nothing
3791         if len(all_urls) < 1:
3792                 if not opts.update_self:
3793                         parser.error(u'you must provide at least one URL')
3794                 else:
3795                         sys.exit()
3796         retcode = fd.download(all_urls)
3797
3798         # Dump cookie jar if requested
3799         if opts.cookiefile is not None:
3800                 try:
3801                         jar.save()
3802                 except (IOError, OSError), err:
3803                         sys.exit(u'ERROR: unable to save cookie jar')
3804
3805         sys.exit(retcode)
3806
3807
if __name__ == '__main__':
	# Top-level entry point: translate the known fatal exceptions into a
	# process exit status/message instead of dumping a traceback.
	try:
		main()
	except DownloadError:
		# Exit with status 1 and no extra message.
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')
3817
3818 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: