Updated to stamp extracted audio file with HTTP last modified date.
[youtube-dl.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
# Contributors, roughly in order of first contribution.
__author__  = (
        'Ricardo Garcia Gonzalez',
        'Danny Colligan',
        'Benjamin Johnson',
        'Vasyl\' Vavrychuk',
        'Witold Baryluk',
        'Paweł Paprota',
        'Gergely Imreh',
        'Rogério Brito',
        'Philipp Hagemeister',
        'Sören Schulze',
        )

__license__ = 'Public Domain'
__version__ = '2011.09.15'

# Location of the latest released script, fetched by the self-update option.
UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
21
22 import cookielib
23 import datetime
24 import gzip
25 import htmlentitydefs
26 import HTMLParser
27 import httplib
28 import locale
29 import math
30 import netrc
31 import os
32 import os.path
33 import re
34 import socket
35 import string
36 import subprocess
37 import sys
38 import time
39 import urllib
40 import urllib2
41 import warnings
42 import zlib
43
44 if os.name == 'nt':
45         import ctypes
46
47 try:
48         import email.utils
49 except ImportError: # Python 2.4
50         import email.Utils
51 try:
52         import cStringIO as StringIO
53 except ImportError:
54         import StringIO
55
56 # parse_qs was moved from the cgi module to the urlparse module recently.
57 try:
58         from urlparse import parse_qs
59 except ImportError:
60         from cgi import parse_qs
61
62 try:
63         import lxml.etree
64 except ImportError:
65         pass # Handled below
66
67 try:
68         import xml.etree.ElementTree
69 except ImportError: # Python<2.5: Not officially supported, but let it slip
70         warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
71
# Default HTTP headers added to every request (see YoutubeDLHandler); a
# desktop-browser User-Agent avoids being served degraded/mobile pages.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-us,en;q=0.5',
}

# Characters considered safe for "simple" titles: ASCII letters and digits.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
81
# Use the stdlib json module when available (Python >= 2.6); otherwise fall
# back to the bundled minimal parser below ("trivialjson").
try:
        import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
        import re
        class json(object):
                """Minimal stand-in for the stdlib json module (decoding only)."""
                @staticmethod
                def loads(s):
                        # Decode a UTF-8 byte string into the corresponding Python value.
                        # The nested helpers all take/return an index into `s` and
                        # thread it through the recursive-descent parse.
                        s = s.decode('UTF-8')
                        def raiseError(msg, i):
                                # Report a parse error with the position and remaining input.
                                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
                        def skipSpace(i, expectMore=True):
                                # Advance past whitespace; optionally require more input to follow.
                                while i < len(s) and s[i] in ' \t\r\n':
                                        i += 1
                                if expectMore:
                                        if i >= len(s):
                                                raiseError('Premature end', i)
                                return i
                        def decodeEscape(match):
                                # Translate one backslash escape, including \uXXXX and
                                # UTF-16 surrogate pairs.
                                esc = match.group(1)
                                _STATIC = {
                                        '"': '"',
                                        '\\': '\\',
                                        '/': '/',
                                        'b': unichr(0x8),
                                        'f': unichr(0xc),
                                        'n': '\n',
                                        'r': '\r',
                                        't': '\t',
                                }
                                if esc in _STATIC:
                                        return _STATIC[esc]
                                if esc[0] == 'u':
                                        if len(esc) == 1+4:
                                                return unichr(int(esc[1:5], 16))
                                        # Surrogate pair: combine high and low halves.
                                        if len(esc) == 5+6 and esc[5:7] == '\\u':
                                                hi = int(esc[1:5], 16)
                                                low = int(esc[7:11], 16)
                                                return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                                raise ValueError('Unknown escape ' + str(esc))
                        def parseString(i):
                                # Parse a double-quoted string whose opening quote is at i.
                                i += 1
                                e = i
                                while True:
                                        # Find the closing quote, skipping quotes preceded by
                                        # an odd number of backslashes (i.e. escaped ones).
                                        e = s.index('"', e)
                                        bslashes = 0
                                        while s[e-bslashes-1] == '\\':
                                                bslashes += 1
                                        if bslashes % 2 == 1:
                                                e += 1
                                                continue
                                        break
                                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                                stri = rexp.sub(decodeEscape, s[i:e])
                                return (e+1,stri)
                        def parseObj(i):
                                # Parse an object ({...}) whose opening brace is at i.
                                i += 1
                                res = {}
                                i = skipSpace(i)
                                if s[i] == '}': # Empty dictionary
                                        return (i+1,res)
                                while True:
                                        if s[i] != '"':
                                                raiseError('Expected a string object key', i)
                                        i,key = parseString(i)
                                        i = skipSpace(i)
                                        if i >= len(s) or s[i] != ':':
                                                raiseError('Expected a colon', i)
                                        i,val = parse(i+1)
                                        res[key] = val
                                        i = skipSpace(i)
                                        if s[i] == '}':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected comma or closing curly brace', i)
                                        i = skipSpace(i+1)
                        def parseArray(i):
                                # Parse an array ([...]) whose opening bracket is at i.
                                res = []
                                i = skipSpace(i+1)
                                if s[i] == ']': # Empty array
                                        return (i+1,res)
                                while True:
                                        i,val = parse(i)
                                        res.append(val)
                                        i = skipSpace(i) # Raise exception if premature end
                                        if s[i] == ']':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected a comma or closing bracket', i)
                                        i = skipSpace(i+1)
                        def parseDiscrete(i):
                                # Parse the bare literals true/false/null.
                                for k,v in {'true': True, 'false': False, 'null': None}.items():
                                        if s.startswith(k, i):
                                                return (i+len(k), v)
                                raiseError('Not a boolean (or null)', i)
                        def parseNumber(i):
                                # Parse a JSON number; floats are detected by '.', 'e' or 'E'.
                                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                                if mobj is None:
                                        raiseError('Not a number', i)
                                nums = mobj.group(1)
                                if '.' in nums or 'e' in nums or 'E' in nums:
                                        return (i+len(nums), float(nums))
                                return (i+len(nums), int(nums))
                        # Dispatch on the first character of a value; anything not in
                        # the map is assumed to be a number.
                        CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
                        def parse(i):
                                # Parse any JSON value starting at index i.
                                i = skipSpace(i)
                                i,res = CHARMAP.get(s[i], parseNumber)(i)
                                i = skipSpace(i, False)
                                return (i,res)
                        i,res = parse(0)
                        if i < len(s):
                                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
                        return res
194
def preferredencoding():
        """Get preferred encoding.

        Returns the best encoding scheme for the system, based on
        locale.getpreferredencoding() and some further tweaks.
        """
        # The original implementation built a throwaway generator just to
        # compute this single value; a plain try/except is equivalent and
        # avoids the needless indirection. `except Exception` (rather than a
        # bare except) no longer swallows KeyboardInterrupt/SystemExit.
        try:
                pref = locale.getpreferredencoding()
                # Verify the reported encoding actually works before trusting it.
                u'TEST'.encode(pref)
        except Exception:
                pref = 'UTF-8'
        return pref
210
211
def htmlentity_transform(matchobj):
        """Transforms an HTML entity to a Unicode character.

        This function receives a match object and is intended to be used with
        the re.sub() function.
        """
        entity = matchobj.group(1)

        # Known non-numeric HTML entity
        if entity in htmlentitydefs.name2codepoint:
                return unichr(htmlentitydefs.name2codepoint[entity])

        # Numeric character reference: decimal (&#169;) or hex (&#xA9;).
        # NOTE: the previous pattern 'x?\d+' could never match hex digits
        # a-f, so hexadecimal entities fell through to the literal branch.
        mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|\\d+)', entity)
        if mobj is not None:
                numstr = mobj.group(1)
                if numstr.startswith(u'x'):
                        base = 16
                        # '0x...' is accepted by long() with base 16.
                        numstr = u'0%s' % numstr
                else:
                        base = 10
                return unichr(long(numstr, base))

        # Unknown entity in name, return its literal representation
        return (u'&%s;' % entity)
237
238
def sanitize_title(utitle):
        """Sanitizes a video title so it could be used as part of a filename."""
        # Decode HTML entities first, then neutralize the path separator so the
        # title cannot introduce extra directory components.
        utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
        return utitle.replace(unicode(os.sep), u'%')
243
244
def sanitize_open(filename, open_mode):
        """Try to open the given filename, and slightly tweak it if this fails.

        Attempts to open the given filename. If this fails, it tries to change
        the filename slightly, step by step, until it's either able to open it
        or it fails and raises a final exception, like the standard open()
        function.

        It returns the tuple (stream, definitive_file_name).
        """
        try:
                if filename == u'-':
                        # '-' means standard output; on Windows switch stdout to
                        # binary mode so the video data is not mangled by CRLF
                        # translation.
                        if sys.platform == 'win32':
                                import msvcrt
                                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
                        return (sys.stdout, filename)
                stream = open(filename, open_mode)
                return (stream, filename)
        except (IOError, OSError), err:
                # In case of error, try to remove win32 forbidden chars
                filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

                # An exception here should be caught in the caller
                stream = open(filename, open_mode)
                return (stream, filename)
270
271
def timeconvert(timestr):
        """Convert RFC 2822 defined time string into system timestamp"""
        # parsedate_tz() returns None for strings it cannot parse; propagate
        # that as the result, otherwise convert the parsed tuple to an epoch
        # timestamp honoring the timezone offset.
        parsed = email.utils.parsedate_tz(timestr)
        if parsed is None:
                return None
        return email.utils.mktime_tz(parsed)
279
280
class DownloadError(Exception):
        """Download Error exception.

        This exception may be thrown by FileDownloader objects if they are not
        configured to continue on errors. They will contain the appropriate
        error message.
        """
        # No extra state: the message passed to the constructor is enough.
        pass
289
290
class SameFileError(Exception):
        """Same File exception.

        This exception will be thrown by FileDownloader objects if they detect
        multiple files would have to be downloaded to the same file on disk.
        """
        pass
298
299
class PostProcessingError(Exception):
        """Post Processing exception.

        This exception may be raised by PostProcessor's .run() method to
        indicate an error in the postprocessing task.
        """
        pass
307
308
class UnavailableVideoError(Exception):
        """Unavailable Format exception.

        This exception will be thrown when a video is requested
        in a format that is not available for that video.
        """
        pass
316
317
class ContentTooShortError(Exception):
        """Content Too Short exception.

        This exception may be raised by FileDownloader objects when a file they
        download is too small for what the server announced first, indicating
        the connection was probably interrupted.
        """
        # Both in bytes
        downloaded = None
        expected = None

        def __init__(self, downloaded, expected):
                # downloaded: bytes actually received.
                # expected: bytes the server announced.
                self.downloaded = downloaded
                self.expected = expected
332
333
class YoutubeDLHandler(urllib2.HTTPHandler):
        """Handler for HTTP requests and responses.

        This class, when installed with an OpenerDirector, automatically adds
        the standard headers to every HTTP request and handles gzipped and
        deflated responses from web servers. If compression is to be avoided in
        a particular request, the original request in the program code only has
        to include the HTTP header "Youtubedl-No-Compression", which will be
        removed before making the real request.

        Part of this code was copied from:

        http://techknack.net/python-urllib2-handlers/

        Andrew Rowls, the author of that code, agreed to release it to the
        public domain.
        """

        @staticmethod
        def deflate(data):
                # Some servers send raw deflate streams, others wrap them in a
                # zlib header: try the raw form first, fall back to the wrapped one.
                try:
                        return zlib.decompress(data, -zlib.MAX_WBITS)
                except zlib.error:
                        return zlib.decompress(data)

        @staticmethod
        def addinfourl_wrapper(stream, headers, url, code):
                # On Pythons whose addinfourl supports getcode(), the status code
                # can be passed to the constructor; otherwise set it afterwards.
                if hasattr(urllib2.addinfourl, 'getcode'):
                        return urllib2.addinfourl(stream, headers, url, code)
                ret = urllib2.addinfourl(stream, headers, url)
                ret.code = code
                return ret

        def http_request(self, req):
                # Force our standard headers, replacing any the caller already set.
                for h in std_headers:
                        if h in req.headers:
                                del req.headers[h]
                        req.add_header(h, std_headers[h])
                # Honor the internal no-compression marker, then strip it so it
                # is never sent to the server.
                if 'Youtubedl-no-compression' in req.headers:
                        if 'Accept-encoding' in req.headers:
                                del req.headers['Accept-encoding']
                        del req.headers['Youtubedl-no-compression']
                return req

        def http_response(self, req, resp):
                # Transparently decompress the body while preserving the original
                # headers, URL, status code and message.
                old_resp = resp
                # gzip
                if resp.headers.get('Content-encoding', '') == 'gzip':
                        gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
                        resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
                        resp.msg = old_resp.msg
                # deflate
                if resp.headers.get('Content-encoding', '') == 'deflate':
                        gz = StringIO.StringIO(self.deflate(resp.read()))
                        resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
                        resp.msg = old_resp.msg
                return resp
391
392
393 class FileDownloader(object):
394         """File Downloader class.
395
396         File downloader objects are the ones responsible of downloading the
397         actual video file and writing it to disk if the user has requested
398         it, among some other tasks. In most cases there should be one per
399         program. As, given a video URL, the downloader doesn't know how to
400         extract all the needed information, task that InfoExtractors do, it
401         has to pass the URL to one of them.
402
403         For this, file downloader objects have a method that allows
404         InfoExtractors to be registered in a given order. When it is passed
405         a URL, the file downloader handles it to the first InfoExtractor it
406         finds that reports being able to handle it. The InfoExtractor extracts
407         all the information about the video or videos the URL refers to, and
408         asks the FileDownloader to process the video information, possibly
409         downloading the video.
410
411         File downloaders accept a lot of parameters. In order not to saturate
412         the object constructor with arguments, it receives a dictionary of
413         options instead. These options are available through the params
414         attribute for the InfoExtractors to use. The FileDownloader also
415         registers itself as the downloader in charge for the InfoExtractors
416         that are added to it, so this is a "mutual registration".
417
418         Available options:
419
420         username:         Username for authentication purposes.
421         password:         Password for authentication purposes.
422         usenetrc:         Use netrc for authentication instead.
423         quiet:            Do not print messages to stdout.
424         forceurl:         Force printing final URL.
425         forcetitle:       Force printing title.
426         forcethumbnail:   Force printing thumbnail URL.
427         forcedescription: Force printing description.
428         forcefilename:    Force printing final filename.
429         simulate:         Do not download the video files.
430         format:           Video format code.
431         format_limit:     Highest quality format to try.
432         outtmpl:          Template for output names.
433         ignoreerrors:     Do not stop on download errors.
434         ratelimit:        Download speed limit, in bytes/sec.
435         nooverwrites:     Prevent overwriting files.
436         retries:          Number of times to retry for HTTP error 5xx
437         continuedl:       Try to continue downloads if possible.
438         noprogress:       Do not print the progress bar.
439         playliststart:    Playlist item to start at.
440         playlistend:      Playlist item to end at.
441         matchtitle:       Download only matching titles.
442         rejecttitle:      Reject downloads for matching titles.
443         logtostderr:      Log messages to stderr instead of stdout.
444         consoletitle:     Display progress in console window's titlebar.
445         nopart:           Do not use temporary .part files.
446         updatetime:       Use the Last-modified header to set output file timestamps.
447         writedescription: Write the video description to a .description file
448         writeinfojson:    Write the video description to a .info.json file
449         """
450
        # NOTE(review): these class-level defaults are immediately shadowed by
        # instance attributes in __init__; they mainly document the attribute set.
        params = None
        _ies = []
        _pps = []
        _download_retcode = None
        _num_downloads = None
        _screen_file = None
457
458         def __init__(self, params):
459                 """Create a FileDownloader object with the given options."""
460                 self._ies = []
461                 self._pps = []
462                 self._download_retcode = 0
463                 self._num_downloads = 0
464                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
465                 self.params = params
466
467         @staticmethod
468         def format_bytes(bytes):
469                 if bytes is None:
470                         return 'N/A'
471                 if type(bytes) is str:
472                         bytes = float(bytes)
473                 if bytes == 0.0:
474                         exponent = 0
475                 else:
476                         exponent = long(math.log(bytes, 1024.0))
477                 suffix = 'bkMGTPEZY'[exponent]
478                 converted = float(bytes) / float(1024 ** exponent)
479                 return '%.2f%s' % (converted, suffix)
480
481         @staticmethod
482         def calc_percent(byte_counter, data_len):
483                 if data_len is None:
484                         return '---.-%'
485                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
486
487         @staticmethod
488         def calc_eta(start, now, total, current):
489                 if total is None:
490                         return '--:--'
491                 dif = now - start
492                 if current == 0 or dif < 0.001: # One millisecond
493                         return '--:--'
494                 rate = float(current) / dif
495                 eta = long((float(total) - float(current)) / rate)
496                 (eta_mins, eta_secs) = divmod(eta, 60)
497                 if eta_mins > 99:
498                         return '--:--'
499                 return '%02d:%02d' % (eta_mins, eta_secs)
500
501         @staticmethod
502         def calc_speed(start, now, bytes):
503                 dif = now - start
504                 if bytes == 0 or dif < 0.001: # One millisecond
505                         return '%10s' % '---b/s'
506                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
507
508         @staticmethod
509         def best_block_size(elapsed_time, bytes):
510                 new_min = max(bytes / 2.0, 1.0)
511                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
512                 if elapsed_time < 0.001:
513                         return long(new_max)
514                 rate = bytes / elapsed_time
515                 if rate > new_max:
516                         return long(new_max)
517                 if rate < new_min:
518                         return long(new_min)
519                 return long(rate)
520
521         @staticmethod
522         def parse_bytes(bytestr):
523                 """Parse a string indicating a byte quantity into a long integer."""
524                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
525                 if matchobj is None:
526                         return None
527                 number = float(matchobj.group(1))
528                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
529                 return long(round(number * multiplier))
530
        def add_info_extractor(self, ie):
                """Add an InfoExtractor object to the end of the list."""
                self._ies.append(ie)
                # Mutual registration: the extractor needs a reference back to us.
                ie.set_downloader(self)
535
        def add_post_processor(self, pp):
                """Add a PostProcessor object to the end of the chain."""
                self._pps.append(pp)
                # Mutual registration: the post processor needs a reference back to us.
                pp.set_downloader(self)
540
        def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
                """Print message to stdout if not in quiet mode."""
                try:
                        if not self.params.get('quiet', False):
                                # The trailing comma suppresses print's own newline; the
                                # terminator chosen here controls the line ending instead,
                                # so progress lines can be rewritten in place with '\r'.
                                terminator = [u'\n', u''][skip_eol]
                                print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
                        self._screen_file.flush()
                except (UnicodeEncodeError), err:
                        if not ignore_encoding_errors:
                                raise
551
        def to_stderr(self, message):
                """Print message to stderr."""
                # Encoded with the locale's preferred encoding, like to_screen().
                print >>sys.stderr, message.encode(preferredencoding())
555
        def to_cons_title(self, message):
                """Set console/terminal window title to message."""
                if not self.params.get('consoletitle', False):
                        return
                if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
                        # c_wchar_p() might not be necessary if `message` is
                        # already of type unicode()
                        ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
                elif 'TERM' in os.environ:
                        # Use the xterm escape sequence to set the terminal title.
                        sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
566
567         def fixed_template(self):
568                 """Checks if the output template is fixed."""
569                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
570
        def trouble(self, message=None):
                """Determine action to take when a download problem appears.

                Depending on if the downloader has been configured to ignore
                download errors or not, this method may throw an exception or
                not when errors are found, after printing the message.
                """
                if message is not None:
                        self.to_stderr(message)
                if not self.params.get('ignoreerrors', False):
                        raise DownloadError(message)
                # Only reached when errors are ignored: remember the failure for
                # the eventual process exit code.
                self._download_retcode = 1
583
584         def slow_down(self, start_time, byte_counter):
585                 """Sleep if the download speed is over the rate limit."""
586                 rate_limit = self.params.get('ratelimit', None)
587                 if rate_limit is None or byte_counter == 0:
588                         return
589                 now = time.time()
590                 elapsed = now - start_time
591                 if elapsed <= 0.0:
592                         return
593                 speed = float(byte_counter) / elapsed
594                 if speed > rate_limit:
595                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
596
597         def temp_name(self, filename):
598                 """Returns a temporary filename for the given filename."""
599                 if self.params.get('nopart', False) or filename == u'-' or \
600                                 (os.path.exists(filename) and not os.path.isfile(filename)):
601                         return filename
602                 return filename + u'.part'
603
604         def undo_temp_name(self, filename):
605                 if filename.endswith(u'.part'):
606                         return filename[:-len(u'.part')]
607                 return filename
608
        def try_rename(self, old_filename, new_filename):
                """Rename old_filename to new_filename, reporting (not raising) failures."""
                try:
                        # Renaming a file onto itself is a no-op.
                        if old_filename == new_filename:
                                return
                        os.rename(old_filename, new_filename)
                except (IOError, OSError), err:
                        self.trouble(u'ERROR: unable to rename file')
616
617         def try_utime(self, filename, last_modified_hdr):
618                 """Try to set the last-modified time of the given file."""
619                 if last_modified_hdr is None:
620                         return
621                 if not os.path.isfile(filename):
622                         return
623                 timestr = last_modified_hdr
624                 if timestr is None:
625                         return
626                 filetime = timeconvert(timestr)
627                 if filetime is None:
628                         return filetime
629                 try:
630                         os.utime(filename, (time.time(), filetime))
631                 except:
632                         pass
633                 return filetime
634
        def report_writedescription(self, descfn):
                """ Report that the description file is being written """
                self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
638
639         def report_writeinfojson(self, infofn):
640                 """ Report that the metadata file has been written """
641                 self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
642
        def report_destination(self, filename):
                """Report destination filename."""
                self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
646
647         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
648                 """Report download progress."""
649                 if self.params.get('noprogress', False):
650                         return
651                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
652                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
653                 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
654                                 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
655
        def report_resuming_byte(self, resume_len):
                """Report attempt to resume at given byte."""
                self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
659
        def report_retry(self, count, retries):
                """Report retry in case of HTTP error 5xx"""
                self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
663
        def report_file_already_downloaded(self, file_name):
                """Report file has already been fully downloaded."""
                try:
                        self.to_screen(u'[download] %s has already been downloaded' % file_name)
                except (UnicodeEncodeError), err:
                        # Fall back to a generic message when the filename cannot be
                        # encoded for the console.
                        self.to_screen(u'[download] The file has already been downloaded')
670
        def report_unable_to_resume(self):
                """Report it was impossible to resume download."""
                self.to_screen(u'[download] Unable to resume')
674
675         def report_finish(self):
676                 """Report download finished."""
677                 if self.params.get('noprogress', False):
678                         self.to_screen(u'[download] Download completed')
679                 else:
680                         self.to_screen(u'')
681
682         def increment_downloads(self):
683                 """Increment the ordinal that assigns a number to each file."""
684                 self._num_downloads += 1
685
686         def prepare_filename(self, info_dict):
687                 """Generate the output filename."""
688                 try:
689                         template_dict = dict(info_dict)
690                         template_dict['epoch'] = unicode(long(time.time()))
691                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
692                         filename = self.params['outtmpl'] % template_dict
693                         return filename
694                 except (ValueError, KeyError), err:
695                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
696                         return None
697
	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor.

		Handles forced printing, simulation mode, title match/reject
		filtering, overwrite protection, the optional description and
		JSON side files, the actual download and postprocessing.
		"""
		filename = self.prepare_filename(info_dict)

		# Forced printings
		if self.params.get('forcetitle', False):
			print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forceurl', False):
			print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
			print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcedescription', False) and 'description' in info_dict:
			print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcefilename', False) and filename is not None:
			print filename.encode(preferredencoding(), 'xmlcharrefreplace')

		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			return

		# prepare_filename() already reported its own error.
		if filename is None:
			return

		# Skip videos whose title fails the match/reject filters.
		matchtitle=self.params.get('matchtitle',False)
		rejecttitle=self.params.get('rejecttitle',False)
		title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
		if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
			self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
			return
		if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
			self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
			return

		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists and will be skipped')
			return

		# Make sure the target directory exists before opening any file.
		try:
			dn = os.path.dirname(filename)
			if dn != '' and not os.path.exists(dn):
				os.makedirs(dn)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to create directory ' + unicode(err))
			return

		# Optionally write the description to a side file.
		if self.params.get('writedescription', False):
			try:
				descfn = filename + '.description'
				self.report_writedescription(descfn)
				descfile = open(descfn, 'wb')
				try:
					descfile.write(info_dict['description'].encode('utf-8'))
				finally:
					descfile.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write description file ' + descfn)
				return

		# Optionally dump the whole info dictionary as JSON.
		if self.params.get('writeinfojson', False):
			infofn = filename + '.info.json'
			self.report_writeinfojson(infofn)
			try:
				# Probe for a usable JSON encoder; 'json' may be absent
				# on old interpreters without a substitute module.
				json.dump
			except (NameError,AttributeError):
				self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
				return
			try:
				infof = open(infofn, 'wb')
				try:
					json.dump(info_dict, infof)
				finally:
					infof.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
				return

		if not self.params.get('skip_download', False):
			try:
				# _do_download returns (success, extra_info); the extra
				# info (e.g. filetime) is merged back into info_dict.
				success,add_data = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
				info_dict.update(add_data)
			except (OSError, IOError), err:
				raise UnavailableVideoError
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self.trouble(u'ERROR: unable to download video data: %s' % str(err))
				return
			except (ContentTooShortError, ), err:
				self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
				return

			if success:
				try:
					self.post_process(filename, info_dict)
				except (PostProcessingError), err:
					self.trouble(u'ERROR: postprocessing: %s' % str(err))
					return
793
794         def download(self, url_list):
795                 """Download a given list of URLs."""
796                 if len(url_list) > 1 and self.fixed_template():
797                         raise SameFileError(self.params['outtmpl'])
798
799                 for url in url_list:
800                         suitable_found = False
801                         for ie in self._ies:
802                                 # Go to next InfoExtractor if not suitable
803                                 if not ie.suitable(url):
804                                         continue
805
806                                 # Suitable InfoExtractor found
807                                 suitable_found = True
808
809                                 # Extract information from URL and process it
810                                 ie.extract(url)
811
812                                 # Suitable InfoExtractor had been found; go to next URL
813                                 break
814
815                         if not suitable_found:
816                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
817
818                 return self._download_retcode
819
820         def post_process(self, filename, ie_info):
821                 """Run the postprocessing chain on the given file."""
822                 info = dict(ie_info)
823                 info['filepath'] = filename
824                 for pp in self._pps:
825                         info = pp.run(info)
826                         if info is None:
827                                 break
828
	def _download_with_rtmpdump(self, filename, url, player_url):
		"""Download an RTMP stream by shelling out to rtmpdump.

		Returns True on success, False on failure. Writes into a
		temporary file and renames it into place once complete.
		"""
		self.report_destination(filename)
		tmpfilename = self.temp_name(filename)

		# Check for rtmpdump first
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrumpted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(tmpfilename)
			self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			# Resume (-e); '-k 1' is added when the previous run exited
			# with code 1.
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(tmpfilename)
			# No progress and still exiting 1: give up resuming.
			if prevsize == cursize and retval == 1:
				break
			# Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
			if prevsize == cursize and retval == 2 and cursize > 1024:
				self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
				retval = 0
				break
		if retval == 0:
			self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
			self.try_rename(tmpfilename, filename)
			return True
		else:
			self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
			return False
865
866         def _do_download(self, filename, url, player_url):
867                 # Check file already present
868                 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
869                         self.report_file_already_downloaded(filename)
870                         return True
871
872                 # Attempt to download using rtmpdump
873                 if url.startswith('rtmp'):
874                         return self._download_with_rtmpdump(filename, url, player_url)
875
876                 tmpfilename = self.temp_name(filename)
877                 stream = None
878                 open_mode = 'wb'
879
880                 # Do not include the Accept-Encoding header
881                 headers = {'Youtubedl-no-compression': 'True'}
882                 basic_request = urllib2.Request(url, None, headers)
883                 request = urllib2.Request(url, None, headers)
884
885                 # Establish possible resume length
886                 if os.path.isfile(tmpfilename):
887                         resume_len = os.path.getsize(tmpfilename)
888                 else:
889                         resume_len = 0
890
891                 # Request parameters in case of being able to resume
892                 if self.params.get('continuedl', False) and resume_len != 0:
893                         self.report_resuming_byte(resume_len)
894                         request.add_header('Range', 'bytes=%d-' % resume_len)
895                         open_mode = 'ab'
896
897                 count = 0
898                 retries = self.params.get('retries', 0)
899                 while count <= retries:
900                         # Establish connection
901                         try:
902                                 data = urllib2.urlopen(request)
903                                 break
904                         except (urllib2.HTTPError, ), err:
905                                 if (err.code < 500 or err.code >= 600) and err.code != 416:
906                                         # Unexpected HTTP error
907                                         raise
908                                 elif err.code == 416:
909                                         # Unable to resume (requested range not satisfiable)
910                                         try:
911                                                 # Open the connection again without the range header
912                                                 data = urllib2.urlopen(basic_request)
913                                                 content_length = data.info()['Content-Length']
914                                         except (urllib2.HTTPError, ), err:
915                                                 if err.code < 500 or err.code >= 600:
916                                                         raise
917                                         else:
918                                                 # Examine the reported length
919                                                 if (content_length is not None and
920                                                                 (resume_len - 100 < long(content_length) < resume_len + 100)):
921                                                         # The file had already been fully downloaded.
922                                                         # Explanation to the above condition: in issue #175 it was revealed that
923                                                         # YouTube sometimes adds or removes a few bytes from the end of the file,
924                                                         # changing the file size slightly and causing problems for some users. So
925                                                         # I decided to implement a suggested change and consider the file
926                                                         # completely downloaded if the file size differs less than 100 bytes from
927                                                         # the one in the hard drive.
928                                                         self.report_file_already_downloaded(filename)
929                                                         self.try_rename(tmpfilename, filename)
930                                                         return True
931                                                 else:
932                                                         # The length does not match, we start the download over
933                                                         self.report_unable_to_resume()
934                                                         open_mode = 'wb'
935                                                         break
936                         # Retry
937                         count += 1
938                         if count <= retries:
939                                 self.report_retry(count, retries)
940
941                 if count > retries:
942                         self.trouble(u'ERROR: giving up after %s retries' % retries)
943                         return False
944
945                 data_len = data.info().get('Content-length', None)
946                 if data_len is not None:
947                         data_len = long(data_len) + resume_len
948                 data_len_str = self.format_bytes(data_len)
949                 byte_counter = 0 + resume_len
950                 block_size = 1024
951                 start = time.time()
952                 while True:
953                         # Download and write
954                         before = time.time()
955                         data_block = data.read(block_size)
956                         after = time.time()
957                         if len(data_block) == 0:
958                                 break
959                         byte_counter += len(data_block)
960
961                         # Open file just in time
962                         if stream is None:
963                                 try:
964                                         (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
965                                         assert stream is not None
966                                         filename = self.undo_temp_name(tmpfilename)
967                                         self.report_destination(filename)
968                                 except (OSError, IOError), err:
969                                         self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
970                                         return False
971                         try:
972                                 stream.write(data_block)
973                         except (IOError, OSError), err:
974                                 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
975                                 return False
976                         block_size = self.best_block_size(after - before, len(data_block))
977
978                         # Progress message
979                         percent_str = self.calc_percent(byte_counter, data_len)
980                         eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
981                         speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
982                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
983
984                         # Apply rate limit
985                         self.slow_down(start, byte_counter - resume_len)
986
987                 if stream is None:
988                         self.trouble(u'\nERROR: Did not get any data blocks')
989                         return False
990                 stream.close()
991                 self.report_finish()
992                 if data_len is not None and byte_counter != data_len:
993                         raise ContentTooShortError(byte_counter, long(data_len))
994                 self.try_rename(tmpfilename, filename)
995
996                 # Update file modification time
997                 filetime = None
998                 if self.params.get('updatetime', True):
999                         filetime = self.try_utime(filename, data.info().get('last-modified', None))
1000
1001                 return True, {'filetime': filetime}
1002
1003
class InfoExtractor(object):
	"""Information Extractor class.

	An information extractor takes a URL and pulls out everything the
	FileDownloader needs to act on the video (or videos) behind it: the
	real media URL, the video title, a simplified title, the author and
	more. The result is a dictionary which is handed to the
	FileDownloader, which may then download the video to the file
	system, among other possible outcomes. The dictionaries must include
	the following fields:

	id:		Video identifier.
	url:		Final video URL.
	uploader:	Nickname of the video uploader.
	title:		Literal title.
	stitle:		Simplified title.
	ext:		Video filename extension.
	format:		Video format.
	player_url:	SWF Player URL (may be None).

	The following fields are optional. Their primary purpose is to allow
	youtube-dl to serve as the backend for a video search function, such
	as the one in youtube2mp3.  They are only used when their respective
	forced printing functions are called:

	thumbnail:	Full URL to a video thumbnail image.
	description:	One-line video description.

	Subclasses of this one should re-define the _real_initialize() and
	_real_extract() methods and define a _VALID_URL regexp.
	Probably, they should also be added to the list of extractors.
	"""

	_ready = False
	_downloader = None

	def __init__(self, downloader=None):
		"""Create the extractor, optionally attaching a downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Return True if this extractor can handle the given URL."""
		return bool(re.match(self._VALID_URL, url))

	def initialize(self):
		"""Perform one-time setup (authentication, etc) at most once."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extract URL information and return it in a list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Attach the FileDownloader this extractor reports to."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
1072
1073
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Matches the many watch/embed/short URL forms; group 2 is the video id
	# (see _real_extract, which reads mobj.group(2)).
	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
	# Requests English/US pages so later page scraping sees stable text.
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	# Machine name used to look up credentials in ~/.netrc.
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '45', '22', '43', '35', '34', '18', '6', '5', '17', '13']
	# Maps itag format codes to container file extensions.
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'45': 'webm',
	}
	IE_NAME = u'youtube'
1095
1096         def report_lang(self):
1097                 """Report attempt to set language."""
1098                 self._downloader.to_screen(u'[youtube] Setting language')
1099
1100         def report_login(self):
1101                 """Report attempt to log in."""
1102                 self._downloader.to_screen(u'[youtube] Logging in')
1103
1104         def report_age_confirmation(self):
1105                 """Report attempt to confirm age."""
1106                 self._downloader.to_screen(u'[youtube] Confirming age')
1107
1108         def report_video_webpage_download(self, video_id):
1109                 """Report attempt to download video webpage."""
1110                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1111
1112         def report_video_info_webpage_download(self, video_id):
1113                 """Report attempt to download video info webpage."""
1114                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1115
1116         def report_information_extraction(self, video_id):
1117                 """Report attempt to extract video information."""
1118                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1119
1120         def report_unavailable_format(self, video_id, format):
1121                 """Report extracted video URL."""
1122                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1123
1124         def report_rtmp_download(self):
1125                 """Indicate the download will use the RTMP protocol."""
1126                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1127
	def _real_initialize(self):
		"""Set site language and optionally log in and confirm age.

		Credentials come from the downloader params or, when 'usenetrc'
		is set, from ~/.netrc under the 'youtube' machine entry.
		Failures here are reported as warnings/errors but never raise.
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':		'/',
				'action_login':	'Log In',
				'username':	username,
				'password':	password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the response still contains the login form, the
			# credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':		'/',
				'action_confirm':	'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return
1196
1197         def _real_extract(self, url):
1198                 # Extract video id from URL
1199                 mobj = re.match(self._VALID_URL, url)
1200                 if mobj is None:
1201                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1202                         return
1203                 video_id = mobj.group(2)
1204
1205                 # Get video webpage
1206                 self.report_video_webpage_download(video_id)
1207                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id)
1208                 try:
1209                         video_webpage = urllib2.urlopen(request).read()
1210                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1211                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1212                         return
1213
1214                 # Attempt to extract SWF player URL
1215                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1216                 if mobj is not None:
1217                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1218                 else:
1219                         player_url = None
1220
1221                 # Get video info
1222                 self.report_video_info_webpage_download(video_id)
1223                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1224                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1225                                         % (video_id, el_type))
1226                         request = urllib2.Request(video_info_url)
1227                         try:
1228                                 video_info_webpage = urllib2.urlopen(request).read()
1229                                 video_info = parse_qs(video_info_webpage)
1230                                 if 'token' in video_info:
1231                                         break
1232                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1233                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1234                                 return
1235                 if 'token' not in video_info:
1236                         if 'reason' in video_info:
1237                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1238                         else:
1239                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1240                         return
1241
1242                 # Start extracting information
1243                 self.report_information_extraction(video_id)
1244
1245                 # uploader
1246                 if 'author' not in video_info:
1247                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1248                         return
1249                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1250
1251                 # title
1252                 if 'title' not in video_info:
1253                         self._downloader.trouble(u'ERROR: unable to extract video title')
1254                         return
1255                 video_title = urllib.unquote_plus(video_info['title'][0])
1256                 video_title = video_title.decode('utf-8')
1257                 video_title = sanitize_title(video_title)
1258
1259                 # simplified title
1260                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1261                 simple_title = simple_title.strip(ur'_')
1262
1263                 # thumbnail image
1264                 if 'thumbnail_url' not in video_info:
1265                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1266                         video_thumbnail = ''
1267                 else:   # don't panic if we can't find it
1268                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1269
1270                 # upload date
1271                 upload_date = u'NA'
1272                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1273                 if mobj is not None:
1274                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1275                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1276                         for expression in format_expressions:
1277                                 try:
1278                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1279                                 except:
1280                                         pass
1281
1282                 # description
1283                 try:
1284                         lxml.etree
1285                 except NameError:
1286                         video_description = u'No description available.'
1287                         if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1288                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1289                                 if mobj is not None:
1290                                         video_description = mobj.group(1).decode('utf-8')
1291                 else:
1292                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1293                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1294                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1295                         # TODO use another parser
1296
1297                 # token
1298                 video_token = urllib.unquote_plus(video_info['token'][0])
1299
1300                 # Decide which formats to download
1301                 req_format = self._downloader.params.get('format', None)
1302
1303                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1304                         self.report_rtmp_download()
1305                         video_url_list = [(None, video_info['conn'][0])]
1306                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1307                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1308                         url_data = [parse_qs(uds) for uds in url_data_strs]
1309                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1310                         url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1311
1312                         format_limit = self._downloader.params.get('format_limit', None)
1313                         if format_limit is not None and format_limit in self._available_formats:
1314                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1315                         else:
1316                                 format_list = self._available_formats
1317                         existing_formats = [x for x in format_list if x in url_map]
1318                         if len(existing_formats) == 0:
1319                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1320                                 return
1321                         if req_format is None:
1322                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1323                         elif req_format == '-1':
1324                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1325                         else:
1326                                 # Specific format
1327                                 if req_format not in url_map:
1328                                         self._downloader.trouble(u'ERROR: requested format not available')
1329                                         return
1330                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
1331                 else:
1332                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1333                         return
1334
1335                 for format_param, video_real_url in video_url_list:
1336                         # At this point we have a new video
1337                         self._downloader.increment_downloads()
1338
1339                         # Extension
1340                         video_extension = self._video_extensions.get(format_param, 'flv')
1341
1342                         try:
1343                                 # Process video information
1344                                 self._downloader.process_info({
1345                                         'id':           video_id.decode('utf-8'),
1346                                         'url':          video_real_url.decode('utf-8'),
1347                                         'uploader':     video_uploader.decode('utf-8'),
1348                                         'upload_date':  upload_date,
1349                                         'title':        video_title,
1350                                         'stitle':       simple_title,
1351                                         'ext':          video_extension.decode('utf-8'),
1352                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1353                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1354                                         'description':  video_description,
1355                                         'player_url':   player_url,
1356                                 })
1357                         except UnavailableVideoError, err:
1358                                 self._downloader.trouble(u'\nERROR: unable to download video')
1359
1360
1361 class MetacafeIE(InfoExtractor):
1362         """Information Extractor for metacafe.com."""
1363
1364         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1365         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1366         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1367         _youtube_ie = None
1368         IE_NAME = u'metacafe'
1369
1370         def __init__(self, youtube_ie, downloader=None):
1371                 InfoExtractor.__init__(self, downloader)
1372                 self._youtube_ie = youtube_ie
1373
1374         def report_disclaimer(self):
1375                 """Report disclaimer retrieval."""
1376                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1377
1378         def report_age_confirmation(self):
1379                 """Report attempt to confirm age."""
1380                 self._downloader.to_screen(u'[metacafe] Confirming age')
1381
1382         def report_download_webpage(self, video_id):
1383                 """Report webpage download."""
1384                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1385
1386         def report_extraction(self, video_id):
1387                 """Report information extraction."""
1388                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1389
1390         def _real_initialize(self):
1391                 # Retrieve disclaimer
1392                 request = urllib2.Request(self._DISCLAIMER)
1393                 try:
1394                         self.report_disclaimer()
1395                         disclaimer = urllib2.urlopen(request).read()
1396                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1397                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1398                         return
1399
1400                 # Confirm age
1401                 disclaimer_form = {
1402                         'filters': '0',
1403                         'submit': "Continue - I'm over 18",
1404                         }
1405                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1406                 try:
1407                         self.report_age_confirmation()
1408                         disclaimer = urllib2.urlopen(request).read()
1409                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1410                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1411                         return
1412
1413         def _real_extract(self, url):
1414                 # Extract id and simplified title from URL
1415                 mobj = re.match(self._VALID_URL, url)
1416                 if mobj is None:
1417                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1418                         return
1419
1420                 video_id = mobj.group(1)
1421
1422                 # Check if video comes from YouTube
1423                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1424                 if mobj2 is not None:
1425                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1426                         return
1427
1428                 # At this point we have a new video
1429                 self._downloader.increment_downloads()
1430
1431                 simple_title = mobj.group(2).decode('utf-8')
1432
1433                 # Retrieve video webpage to extract further information
1434                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1435                 try:
1436                         self.report_download_webpage(video_id)
1437                         webpage = urllib2.urlopen(request).read()
1438                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1439                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1440                         return
1441
1442                 # Extract URL, uploader and title from webpage
1443                 self.report_extraction(video_id)
1444                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1445                 if mobj is not None:
1446                         mediaURL = urllib.unquote(mobj.group(1))
1447                         video_extension = mediaURL[-3:]
1448
1449                         # Extract gdaKey if available
1450                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1451                         if mobj is None:
1452                                 video_url = mediaURL
1453                         else:
1454                                 gdaKey = mobj.group(1)
1455                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1456                 else:
1457                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1458                         if mobj is None:
1459                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1460                                 return
1461                         vardict = parse_qs(mobj.group(1))
1462                         if 'mediaData' not in vardict:
1463                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1464                                 return
1465                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1466                         if mobj is None:
1467                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1468                                 return
1469                         mediaURL = mobj.group(1).replace('\\/', '/')
1470                         video_extension = mediaURL[-3:]
1471                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1472
1473                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1474                 if mobj is None:
1475                         self._downloader.trouble(u'ERROR: unable to extract title')
1476                         return
1477                 video_title = mobj.group(1).decode('utf-8')
1478                 video_title = sanitize_title(video_title)
1479
1480                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1481                 if mobj is None:
1482                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1483                         return
1484                 video_uploader = mobj.group(1)
1485
1486                 try:
1487                         # Process video information
1488                         self._downloader.process_info({
1489                                 'id':           video_id.decode('utf-8'),
1490                                 'url':          video_url.decode('utf-8'),
1491                                 'uploader':     video_uploader.decode('utf-8'),
1492                                 'upload_date':  u'NA',
1493                                 'title':        video_title,
1494                                 'stitle':       simple_title,
1495                                 'ext':          video_extension.decode('utf-8'),
1496                                 'format':       u'NA',
1497                                 'player_url':   None,
1498                         })
1499                 except UnavailableVideoError:
1500                         self._downloader.trouble(u'\nERROR: unable to download video')
1501
1502
1503 class DailymotionIE(InfoExtractor):
1504         """Information Extractor for Dailymotion"""
1505
1506         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1507         IE_NAME = u'dailymotion'
1508
1509         def __init__(self, downloader=None):
1510                 InfoExtractor.__init__(self, downloader)
1511
1512         def report_download_webpage(self, video_id):
1513                 """Report webpage download."""
1514                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1515
1516         def report_extraction(self, video_id):
1517                 """Report information extraction."""
1518                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1519
1520         def _real_initialize(self):
1521                 return
1522
1523         def _real_extract(self, url):
1524                 # Extract id and simplified title from URL
1525                 mobj = re.match(self._VALID_URL, url)
1526                 if mobj is None:
1527                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1528                         return
1529
1530                 # At this point we have a new video
1531                 self._downloader.increment_downloads()
1532                 video_id = mobj.group(1)
1533
1534                 simple_title = mobj.group(2).decode('utf-8')
1535                 video_extension = 'flv'
1536
1537                 # Retrieve video webpage to extract further information
1538                 request = urllib2.Request(url)
1539                 request.add_header('Cookie', 'family_filter=off')
1540                 try:
1541                         self.report_download_webpage(video_id)
1542                         webpage = urllib2.urlopen(request).read()
1543                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1544                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1545                         return
1546
1547                 # Extract URL, uploader and title from webpage
1548                 self.report_extraction(video_id)
1549                 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1550                 if mobj is None:
1551                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1552                         return
1553                 sequence = urllib.unquote(mobj.group(1))
1554                 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1555                 if mobj is None:
1556                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1557                         return
1558                 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1559
1560                 # if needed add http://www.dailymotion.com/ if relative URL
1561
1562                 video_url = mediaURL
1563
1564                 mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
1565                 if mobj is None:
1566                         self._downloader.trouble(u'ERROR: unable to extract title')
1567                         return
1568                 video_title = mobj.group(1).decode('utf-8')
1569                 video_title = sanitize_title(video_title)
1570
1571                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1572                 if mobj is None:
1573                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1574                         return
1575                 video_uploader = mobj.group(1)
1576
1577                 try:
1578                         # Process video information
1579                         self._downloader.process_info({
1580                                 'id':           video_id.decode('utf-8'),
1581                                 'url':          video_url.decode('utf-8'),
1582                                 'uploader':     video_uploader.decode('utf-8'),
1583                                 'upload_date':  u'NA',
1584                                 'title':        video_title,
1585                                 'stitle':       simple_title,
1586                                 'ext':          video_extension.decode('utf-8'),
1587                                 'format':       u'NA',
1588                                 'player_url':   None,
1589                         })
1590                 except UnavailableVideoError:
1591                         self._downloader.trouble(u'\nERROR: unable to download video')
1592
1593
1594 class GoogleIE(InfoExtractor):
1595         """Information extractor for video.google.com."""
1596
1597         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1598         IE_NAME = u'video.google'
1599
1600         def __init__(self, downloader=None):
1601                 InfoExtractor.__init__(self, downloader)
1602
1603         def report_download_webpage(self, video_id):
1604                 """Report webpage download."""
1605                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1606
1607         def report_extraction(self, video_id):
1608                 """Report information extraction."""
1609                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1610
1611         def _real_initialize(self):
1612                 return
1613
1614         def _real_extract(self, url):
1615                 # Extract id from URL
1616                 mobj = re.match(self._VALID_URL, url)
1617                 if mobj is None:
1618                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1619                         return
1620
1621                 # At this point we have a new video
1622                 self._downloader.increment_downloads()
1623                 video_id = mobj.group(1)
1624
1625                 video_extension = 'mp4'
1626
1627                 # Retrieve video webpage to extract further information
1628                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1629                 try:
1630                         self.report_download_webpage(video_id)
1631                         webpage = urllib2.urlopen(request).read()
1632                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1633                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1634                         return
1635
1636                 # Extract URL, uploader, and title from webpage
1637                 self.report_extraction(video_id)
1638                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1639                 if mobj is None:
1640                         video_extension = 'flv'
1641                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1642                 if mobj is None:
1643                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1644                         return
1645                 mediaURL = urllib.unquote(mobj.group(1))
1646                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1647                 mediaURL = mediaURL.replace('\\x26', '\x26')
1648
1649                 video_url = mediaURL
1650
1651                 mobj = re.search(r'<title>(.*)</title>', webpage)
1652                 if mobj is None:
1653                         self._downloader.trouble(u'ERROR: unable to extract title')
1654                         return
1655                 video_title = mobj.group(1).decode('utf-8')
1656                 video_title = sanitize_title(video_title)
1657                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1658
1659                 # Extract video description
1660                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1661                 if mobj is None:
1662                         self._downloader.trouble(u'ERROR: unable to extract video description')
1663                         return
1664                 video_description = mobj.group(1).decode('utf-8')
1665                 if not video_description:
1666                         video_description = 'No description available.'
1667
1668                 # Extract video thumbnail
1669                 if self._downloader.params.get('forcethumbnail', False):
1670                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1671                         try:
1672                                 webpage = urllib2.urlopen(request).read()
1673                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1674                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1675                                 return
1676                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1677                         if mobj is None:
1678                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1679                                 return
1680                         video_thumbnail = mobj.group(1)
1681                 else:   # we need something to pass to process_info
1682                         video_thumbnail = ''
1683
1684                 try:
1685                         # Process video information
1686                         self._downloader.process_info({
1687                                 'id':           video_id.decode('utf-8'),
1688                                 'url':          video_url.decode('utf-8'),
1689                                 'uploader':     u'NA',
1690                                 'upload_date':  u'NA',
1691                                 'title':        video_title,
1692                                 'stitle':       simple_title,
1693                                 'ext':          video_extension.decode('utf-8'),
1694                                 'format':       u'NA',
1695                                 'player_url':   None,
1696                         })
1697                 except UnavailableVideoError:
1698                         self._downloader.trouble(u'\nERROR: unable to download video')
1699
1700
1701 class PhotobucketIE(InfoExtractor):
1702         """Information extractor for photobucket.com."""
1703
1704         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1705         IE_NAME = u'photobucket'
1706
1707         def __init__(self, downloader=None):
1708                 InfoExtractor.__init__(self, downloader)
1709
1710         def report_download_webpage(self, video_id):
1711                 """Report webpage download."""
1712                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1713
1714         def report_extraction(self, video_id):
1715                 """Report information extraction."""
1716                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1717
1718         def _real_initialize(self):
1719                 return
1720
1721         def _real_extract(self, url):
1722                 # Extract id from URL
1723                 mobj = re.match(self._VALID_URL, url)
1724                 if mobj is None:
1725                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1726                         return
1727
1728                 # At this point we have a new video
1729                 self._downloader.increment_downloads()
1730                 video_id = mobj.group(1)
1731
1732                 video_extension = 'flv'
1733
1734                 # Retrieve video webpage to extract further information
1735                 request = urllib2.Request(url)
1736                 try:
1737                         self.report_download_webpage(video_id)
1738                         webpage = urllib2.urlopen(request).read()
1739                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1740                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1741                         return
1742
1743                 # Extract URL, uploader, and title from webpage
1744                 self.report_extraction(video_id)
1745                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1746                 if mobj is None:
1747                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1748                         return
1749                 mediaURL = urllib.unquote(mobj.group(1))
1750
1751                 video_url = mediaURL
1752
1753                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1754                 if mobj is None:
1755                         self._downloader.trouble(u'ERROR: unable to extract title')
1756                         return
1757                 video_title = mobj.group(1).decode('utf-8')
1758                 video_title = sanitize_title(video_title)
1759                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1760
1761                 video_uploader = mobj.group(2).decode('utf-8')
1762
1763                 try:
1764                         # Process video information
1765                         self._downloader.process_info({
1766                                 'id':           video_id.decode('utf-8'),
1767                                 'url':          video_url.decode('utf-8'),
1768                                 'uploader':     video_uploader,
1769                                 'upload_date':  u'NA',
1770                                 'title':        video_title,
1771                                 'stitle':       simple_title,
1772                                 'ext':          video_extension.decode('utf-8'),
1773                                 'format':       u'NA',
1774                                 'player_url':   None,
1775                         })
1776                 except UnavailableVideoError:
1777                         self._downloader.trouble(u'\nERROR: unable to download video')
1778
1779
1780 class YahooIE(InfoExtractor):
1781         """Information extractor for video.yahoo.com."""
1782
1783         # _VALID_URL matches all Yahoo! Video URLs
1784         # _VPAGE_URL matches only the extractable '/watch/' URLs
1785         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1786         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1787         IE_NAME = u'video.yahoo'
1788
	def __init__(self, downloader=None):
		# Delegate common setup (downloader wiring) to the base extractor.
		InfoExtractor.__init__(self, downloader)
1791
1792         def report_download_webpage(self, video_id):
1793                 """Report webpage download."""
1794                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1795
1796         def report_extraction(self, video_id):
1797                 """Report information extraction."""
1798                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1799
1800         def _real_initialize(self):
1801                 return
1802
1803         def _real_extract(self, url, new_video=True):
1804                 # Extract ID from URL
1805                 mobj = re.match(self._VALID_URL, url)
1806                 if mobj is None:
1807                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1808                         return
1809
1810                 # At this point we have a new video
1811                 self._downloader.increment_downloads()
1812                 video_id = mobj.group(2)
1813                 video_extension = 'flv'
1814
1815                 # Rewrite valid but non-extractable URLs as
1816                 # extractable English language /watch/ URLs
1817                 if re.match(self._VPAGE_URL, url) is None:
1818                         request = urllib2.Request(url)
1819                         try:
1820                                 webpage = urllib2.urlopen(request).read()
1821                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1822                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1823                                 return
1824
1825                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1826                         if mobj is None:
1827                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1828                                 return
1829                         yahoo_id = mobj.group(1)
1830
1831                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1832                         if mobj is None:
1833                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1834                                 return
1835                         yahoo_vid = mobj.group(1)
1836
1837                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1838                         return self._real_extract(url, new_video=False)
1839
1840                 # Retrieve video webpage to extract further information
1841                 request = urllib2.Request(url)
1842                 try:
1843                         self.report_download_webpage(video_id)
1844                         webpage = urllib2.urlopen(request).read()
1845                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1846                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1847                         return
1848
1849                 # Extract uploader and title from webpage
1850                 self.report_extraction(video_id)
1851                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1852                 if mobj is None:
1853                         self._downloader.trouble(u'ERROR: unable to extract video title')
1854                         return
1855                 video_title = mobj.group(1).decode('utf-8')
1856                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1857
1858                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1859                 if mobj is None:
1860                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1861                         return
1862                 video_uploader = mobj.group(1).decode('utf-8')
1863
1864                 # Extract video thumbnail
1865                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1866                 if mobj is None:
1867                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1868                         return
1869                 video_thumbnail = mobj.group(1).decode('utf-8')
1870
1871                 # Extract video description
1872                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1873                 if mobj is None:
1874                         self._downloader.trouble(u'ERROR: unable to extract video description')
1875                         return
1876                 video_description = mobj.group(1).decode('utf-8')
1877                 if not video_description:
1878                         video_description = 'No description available.'
1879
1880                 # Extract video height and width
1881                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1882                 if mobj is None:
1883                         self._downloader.trouble(u'ERROR: unable to extract video height')
1884                         return
1885                 yv_video_height = mobj.group(1)
1886
1887                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1888                 if mobj is None:
1889                         self._downloader.trouble(u'ERROR: unable to extract video width')
1890                         return
1891                 yv_video_width = mobj.group(1)
1892
1893                 # Retrieve video playlist to extract media URL
1894                 # I'm not completely sure what all these options are, but we
1895                 # seem to need most of them, otherwise the server sends a 401.
1896                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1897                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1898                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1899                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1900                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1901                 try:
1902                         self.report_download_webpage(video_id)
1903                         webpage = urllib2.urlopen(request).read()
1904                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1905                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1906                         return
1907
1908                 # Extract media URL from playlist XML
1909                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1910                 if mobj is None:
1911                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1912                         return
1913                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1914                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1915
1916                 try:
1917                         # Process video information
1918                         self._downloader.process_info({
1919                                 'id':           video_id.decode('utf-8'),
1920                                 'url':          video_url,
1921                                 'uploader':     video_uploader,
1922                                 'upload_date':  u'NA',
1923                                 'title':        video_title,
1924                                 'stitle':       simple_title,
1925                                 'ext':          video_extension.decode('utf-8'),
1926                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1927                                 'description':  video_description,
1928                                 'thumbnail':    video_thumbnail,
1929                                 'player_url':   None,
1930                         })
1931                 except UnavailableVideoError:
1932                         self._downloader.trouble(u'\nERROR: unable to download video')
1933
1934
1935 class VimeoIE(InfoExtractor):
1936         """Information extractor for vimeo.com."""
1937
1938         # _VALID_URL matches Vimeo URLs
1939         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1940         IE_NAME = u'vimeo'
1941
1942         def __init__(self, downloader=None):
1943                 InfoExtractor.__init__(self, downloader)
1944
1945         def report_download_webpage(self, video_id):
1946                 """Report webpage download."""
1947                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1948
1949         def report_extraction(self, video_id):
1950                 """Report information extraction."""
1951                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1952
1953         def _real_initialize(self):
1954                 return
1955
1956         def _real_extract(self, url, new_video=True):
1957                 # Extract ID from URL
1958                 mobj = re.match(self._VALID_URL, url)
1959                 if mobj is None:
1960                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1961                         return
1962
1963                 # At this point we have a new video
1964                 self._downloader.increment_downloads()
1965                 video_id = mobj.group(1)
1966
1967                 # Retrieve video webpage to extract further information
1968                 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1969                 try:
1970                         self.report_download_webpage(video_id)
1971                         webpage = urllib2.urlopen(request).read()
1972                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1973                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1974                         return
1975
1976                 # Now we begin extracting as much information as we can from what we
1977                 # retrieved. First we extract the information common to all extractors,
1978                 # and latter we extract those that are Vimeo specific.
1979                 self.report_extraction(video_id)
1980
1981                 # Extract title
1982                 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
1983                 if mobj is None:
1984                         self._downloader.trouble(u'ERROR: unable to extract video title')
1985                         return
1986                 video_title = mobj.group(1).decode('utf-8')
1987                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1988
1989                 # Extract uploader
1990                 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
1991                 if mobj is None:
1992                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1993                         return
1994                 video_uploader = mobj.group(1).decode('utf-8')
1995
1996                 # Extract video thumbnail
1997                 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
1998                 if mobj is None:
1999                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2000                         return
2001                 video_thumbnail = mobj.group(1).decode('utf-8')
2002
2003                 # # Extract video description
2004                 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2005                 # if mobj is None:
2006                 #       self._downloader.trouble(u'ERROR: unable to extract video description')
2007                 #       return
2008                 # video_description = mobj.group(1).decode('utf-8')
2009                 # if not video_description: video_description = 'No description available.'
2010                 video_description = 'Foo.'
2011
2012                 # Vimeo specific: extract request signature
2013                 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2014                 if mobj is None:
2015                         self._downloader.trouble(u'ERROR: unable to extract request signature')
2016                         return
2017                 sig = mobj.group(1).decode('utf-8')
2018
2019                 # Vimeo specific: Extract request signature expiration
2020                 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2021                 if mobj is None:
2022                         self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2023                         return
2024                 sig_exp = mobj.group(1).decode('utf-8')
2025
2026                 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2027
2028                 try:
2029                         # Process video information
2030                         self._downloader.process_info({
2031                                 'id':           video_id.decode('utf-8'),
2032                                 'url':          video_url,
2033                                 'uploader':     video_uploader,
2034                                 'upload_date':  u'NA',
2035                                 'title':        video_title,
2036                                 'stitle':       simple_title,
2037                                 'ext':          u'mp4',
2038                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2039                                 'description':  video_description,
2040                                 'thumbnail':    video_thumbnail,
2041                                 'description':  video_description,
2042                                 'player_url':   None,
2043                         })
2044                 except UnavailableVideoError:
2045                         self._downloader.trouble(u'ERROR: unable to download video')
2046
2047
2048 class GenericIE(InfoExtractor):
2049         """Generic last-resort information extractor."""
2050
2051         _VALID_URL = r'.*'
2052         IE_NAME = u'generic'
2053
2054         def __init__(self, downloader=None):
2055                 InfoExtractor.__init__(self, downloader)
2056
2057         def report_download_webpage(self, video_id):
2058                 """Report webpage download."""
2059                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2060                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2061
2062         def report_extraction(self, video_id):
2063                 """Report information extraction."""
2064                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2065
2066         def _real_initialize(self):
2067                 return
2068
2069         def _real_extract(self, url):
2070                 # At this point we have a new video
2071                 self._downloader.increment_downloads()
2072
2073                 video_id = url.split('/')[-1]
2074                 request = urllib2.Request(url)
2075                 try:
2076                         self.report_download_webpage(video_id)
2077                         webpage = urllib2.urlopen(request).read()
2078                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2079                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2080                         return
2081                 except ValueError, err:
2082                         # since this is the last-resort InfoExtractor, if
2083                         # this error is thrown, it'll be thrown here
2084                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2085                         return
2086
2087                 self.report_extraction(video_id)
2088                 # Start with something easy: JW Player in SWFObject
2089                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2090                 if mobj is None:
2091                         # Broaden the search a little bit
2092                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2093                 if mobj is None:
2094                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2095                         return
2096
2097                 # It's possible that one of the regexes
2098                 # matched, but returned an empty group:
2099                 if mobj.group(1) is None:
2100                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2101                         return
2102
2103                 video_url = urllib.unquote(mobj.group(1))
2104                 video_id = os.path.basename(video_url)
2105
2106                 # here's a fun little line of code for you:
2107                 video_extension = os.path.splitext(video_id)[1][1:]
2108                 video_id = os.path.splitext(video_id)[0]
2109
2110                 # it's tempting to parse this further, but you would
2111                 # have to take into account all the variations like
2112                 #   Video Title - Site Name
2113                 #   Site Name | Video Title
2114                 #   Video Title - Tagline | Site Name
2115                 # and so on and so forth; it's just not practical
2116                 mobj = re.search(r'<title>(.*)</title>', webpage)
2117                 if mobj is None:
2118                         self._downloader.trouble(u'ERROR: unable to extract title')
2119                         return
2120                 video_title = mobj.group(1).decode('utf-8')
2121                 video_title = sanitize_title(video_title)
2122                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2123
2124                 # video uploader is domain name
2125                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2126                 if mobj is None:
2127                         self._downloader.trouble(u'ERROR: unable to extract title')
2128                         return
2129                 video_uploader = mobj.group(1).decode('utf-8')
2130
2131                 try:
2132                         # Process video information
2133                         self._downloader.process_info({
2134                                 'id':           video_id.decode('utf-8'),
2135                                 'url':          video_url.decode('utf-8'),
2136                                 'uploader':     video_uploader,
2137                                 'upload_date':  u'NA',
2138                                 'title':        video_title,
2139                                 'stitle':       simple_title,
2140                                 'ext':          video_extension.decode('utf-8'),
2141                                 'format':       u'NA',
2142                                 'player_url':   None,
2143                         })
2144                 except UnavailableVideoError, err:
2145                         self._downloader.trouble(u'\nERROR: unable to download video')
2146
2147
2148 class YoutubeSearchIE(InfoExtractor):
2149         """Information Extractor for YouTube search queries."""
2150         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2151         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2152         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2153         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2154         _youtube_ie = None
2155         _max_youtube_results = 1000
2156         IE_NAME = u'youtube:search'
2157
2158         def __init__(self, youtube_ie, downloader=None):
2159                 InfoExtractor.__init__(self, downloader)
2160                 self._youtube_ie = youtube_ie
2161
2162         def report_download_page(self, query, pagenum):
2163                 """Report attempt to download playlist page with given number."""
2164                 query = query.decode(preferredencoding())
2165                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2166
2167         def _real_initialize(self):
2168                 self._youtube_ie.initialize()
2169
2170         def _real_extract(self, query):
2171                 mobj = re.match(self._VALID_URL, query)
2172                 if mobj is None:
2173                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2174                         return
2175
2176                 prefix, query = query.split(':')
2177                 prefix = prefix[8:]
2178                 query = query.encode('utf-8')
2179                 if prefix == '':
2180                         self._download_n_results(query, 1)
2181                         return
2182                 elif prefix == 'all':
2183                         self._download_n_results(query, self._max_youtube_results)
2184                         return
2185                 else:
2186                         try:
2187                                 n = long(prefix)
2188                                 if n <= 0:
2189                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2190                                         return
2191                                 elif n > self._max_youtube_results:
2192                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2193                                         n = self._max_youtube_results
2194                                 self._download_n_results(query, n)
2195                                 return
2196                         except ValueError: # parsing prefix as integer fails
2197                                 self._download_n_results(query, 1)
2198                                 return
2199
2200         def _download_n_results(self, query, n):
2201                 """Downloads a specified number of results for a query"""
2202
2203                 video_ids = []
2204                 already_seen = set()
2205                 pagenum = 1
2206
2207                 while True:
2208                         self.report_download_page(query, pagenum)
2209                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2210                         request = urllib2.Request(result_url)
2211                         try:
2212                                 page = urllib2.urlopen(request).read()
2213                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2214                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2215                                 return
2216
2217                         # Extract video identifiers
2218                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2219                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2220                                 if video_id not in already_seen:
2221                                         video_ids.append(video_id)
2222                                         already_seen.add(video_id)
2223                                         if len(video_ids) == n:
2224                                                 # Specified n videos reached
2225                                                 for id in video_ids:
2226                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2227                                                 return
2228
2229                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2230                                 for id in video_ids:
2231                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2232                                 return
2233
2234                         pagenum = pagenum + 1
2235
2236
2237 class GoogleSearchIE(InfoExtractor):
2238         """Information Extractor for Google Video search queries."""
2239         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2240         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2241         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2242         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2243         _google_ie = None
2244         _max_google_results = 1000
2245         IE_NAME = u'video.google:search'
2246
2247         def __init__(self, google_ie, downloader=None):
2248                 InfoExtractor.__init__(self, downloader)
2249                 self._google_ie = google_ie
2250
2251         def report_download_page(self, query, pagenum):
2252                 """Report attempt to download playlist page with given number."""
2253                 query = query.decode(preferredencoding())
2254                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2255
2256         def _real_initialize(self):
2257                 self._google_ie.initialize()
2258
2259         def _real_extract(self, query):
2260                 mobj = re.match(self._VALID_URL, query)
2261                 if mobj is None:
2262                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2263                         return
2264
2265                 prefix, query = query.split(':')
2266                 prefix = prefix[8:]
2267                 query = query.encode('utf-8')
2268                 if prefix == '':
2269                         self._download_n_results(query, 1)
2270                         return
2271                 elif prefix == 'all':
2272                         self._download_n_results(query, self._max_google_results)
2273                         return
2274                 else:
2275                         try:
2276                                 n = long(prefix)
2277                                 if n <= 0:
2278                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2279                                         return
2280                                 elif n > self._max_google_results:
2281                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2282                                         n = self._max_google_results
2283                                 self._download_n_results(query, n)
2284                                 return
2285                         except ValueError: # parsing prefix as integer fails
2286                                 self._download_n_results(query, 1)
2287                                 return
2288
2289         def _download_n_results(self, query, n):
2290                 """Downloads a specified number of results for a query"""
2291
2292                 video_ids = []
2293                 already_seen = set()
2294                 pagenum = 1
2295
2296                 while True:
2297                         self.report_download_page(query, pagenum)
2298                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2299                         request = urllib2.Request(result_url)
2300                         try:
2301                                 page = urllib2.urlopen(request).read()
2302                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2303                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2304                                 return
2305
2306                         # Extract video identifiers
2307                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2308                                 video_id = mobj.group(1)
2309                                 if video_id not in already_seen:
2310                                         video_ids.append(video_id)
2311                                         already_seen.add(video_id)
2312                                         if len(video_ids) == n:
2313                                                 # Specified n videos reached
2314                                                 for id in video_ids:
2315                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2316                                                 return
2317
2318                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2319                                 for id in video_ids:
2320                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2321                                 return
2322
2323                         pagenum = pagenum + 1
2324
2325
2326 class YahooSearchIE(InfoExtractor):
2327         """Information Extractor for Yahoo! Video search queries."""
2328         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2329         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2330         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2331         _MORE_PAGES_INDICATOR = r'\s*Next'
2332         _yahoo_ie = None
2333         _max_yahoo_results = 1000
2334         IE_NAME = u'video.yahoo:search'
2335
2336         def __init__(self, yahoo_ie, downloader=None):
2337                 InfoExtractor.__init__(self, downloader)
2338                 self._yahoo_ie = yahoo_ie
2339
2340         def report_download_page(self, query, pagenum):
2341                 """Report attempt to download playlist page with given number."""
2342                 query = query.decode(preferredencoding())
2343                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2344
2345         def _real_initialize(self):
2346                 self._yahoo_ie.initialize()
2347
2348         def _real_extract(self, query):
2349                 mobj = re.match(self._VALID_URL, query)
2350                 if mobj is None:
2351                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2352                         return
2353
2354                 prefix, query = query.split(':')
2355                 prefix = prefix[8:]
2356                 query = query.encode('utf-8')
2357                 if prefix == '':
2358                         self._download_n_results(query, 1)
2359                         return
2360                 elif prefix == 'all':
2361                         self._download_n_results(query, self._max_yahoo_results)
2362                         return
2363                 else:
2364                         try:
2365                                 n = long(prefix)
2366                                 if n <= 0:
2367                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2368                                         return
2369                                 elif n > self._max_yahoo_results:
2370                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2371                                         n = self._max_yahoo_results
2372                                 self._download_n_results(query, n)
2373                                 return
2374                         except ValueError: # parsing prefix as integer fails
2375                                 self._download_n_results(query, 1)
2376                                 return
2377
2378         def _download_n_results(self, query, n):
2379                 """Downloads a specified number of results for a query"""
2380
2381                 video_ids = []
2382                 already_seen = set()
2383                 pagenum = 1
2384
2385                 while True:
2386                         self.report_download_page(query, pagenum)
2387                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2388                         request = urllib2.Request(result_url)
2389                         try:
2390                                 page = urllib2.urlopen(request).read()
2391                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2392                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2393                                 return
2394
2395                         # Extract video identifiers
2396                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2397                                 video_id = mobj.group(1)
2398                                 if video_id not in already_seen:
2399                                         video_ids.append(video_id)
2400                                         already_seen.add(video_id)
2401                                         if len(video_ids) == n:
2402                                                 # Specified n videos reached
2403                                                 for id in video_ids:
2404                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2405                                                 return
2406
2407                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2408                                 for id in video_ids:
2409                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2410                                 return
2411
2412                         pagenum = pagenum + 1
2413
2414
2415 class YoutubePlaylistIE(InfoExtractor):
2416         """Information Extractor for YouTube playlists."""
2417
2418         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2419         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2420         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2421         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2422         _youtube_ie = None
2423         IE_NAME = u'youtube:playlist'
2424
2425         def __init__(self, youtube_ie, downloader=None):
2426                 InfoExtractor.__init__(self, downloader)
2427                 self._youtube_ie = youtube_ie
2428
2429         def report_download_page(self, playlist_id, pagenum):
2430                 """Report attempt to download playlist page with given number."""
2431                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2432
2433         def _real_initialize(self):
2434                 self._youtube_ie.initialize()
2435
2436         def _real_extract(self, url):
2437                 # Extract playlist id
2438                 mobj = re.match(self._VALID_URL, url)
2439                 if mobj is None:
2440                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2441                         return
2442
2443                 # Single video case
2444                 if mobj.group(3) is not None:
2445                         self._youtube_ie.extract(mobj.group(3))
2446                         return
2447
2448                 # Download playlist pages
2449                 # prefix is 'p' as default for playlists but there are other types that need extra care
2450                 playlist_prefix = mobj.group(1)
2451                 if playlist_prefix == 'a':
2452                         playlist_access = 'artist'
2453                 else:
2454                         playlist_prefix = 'p'
2455                         playlist_access = 'view_play_list'
2456                 playlist_id = mobj.group(2)
2457                 video_ids = []
2458                 pagenum = 1
2459
2460                 while True:
2461                         self.report_download_page(playlist_id, pagenum)
2462                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2463                         try:
2464                                 page = urllib2.urlopen(request).read()
2465                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2466                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2467                                 return
2468
2469                         # Extract video identifiers
2470                         ids_in_page = []
2471                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2472                                 if mobj.group(1) not in ids_in_page:
2473                                         ids_in_page.append(mobj.group(1))
2474                         video_ids.extend(ids_in_page)
2475
2476                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2477                                 break
2478                         pagenum = pagenum + 1
2479
2480                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2481                 playlistend = self._downloader.params.get('playlistend', -1)
2482                 video_ids = video_ids[playliststart:playlistend]
2483
2484                 for id in video_ids:
2485                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2486                 return
2487
2488
2489 class YoutubeUserIE(InfoExtractor):
2490         """Information Extractor for YouTube users."""
2491
2492         _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2493         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2494         _GDATA_PAGE_SIZE = 50
2495         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2496         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2497         _youtube_ie = None
2498         IE_NAME = u'youtube:user'
2499
2500         def __init__(self, youtube_ie, downloader=None):
2501                 InfoExtractor.__init__(self, downloader)
2502                 self._youtube_ie = youtube_ie
2503
2504         def report_download_page(self, username, start_index):
2505                 """Report attempt to download user page."""
2506                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2507                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2508
2509         def _real_initialize(self):
2510                 self._youtube_ie.initialize()
2511
2512         def _real_extract(self, url):
2513                 # Extract username
2514                 mobj = re.match(self._VALID_URL, url)
2515                 if mobj is None:
2516                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2517                         return
2518
2519                 username = mobj.group(1)
2520
2521                 # Download video ids using YouTube Data API. Result size per
2522                 # query is limited (currently to 50 videos) so we need to query
2523                 # page by page until there are no video ids - it means we got
2524                 # all of them.
2525
2526                 video_ids = []
2527                 pagenum = 0
2528
2529                 while True:
2530                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2531                         self.report_download_page(username, start_index)
2532
2533                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2534
2535                         try:
2536                                 page = urllib2.urlopen(request).read()
2537                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2538                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2539                                 return
2540
2541                         # Extract video identifiers
2542                         ids_in_page = []
2543
2544                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2545                                 if mobj.group(1) not in ids_in_page:
2546                                         ids_in_page.append(mobj.group(1))
2547
2548                         video_ids.extend(ids_in_page)
2549
2550                         # A little optimization - if current page is not
2551                         # "full", ie. does not contain PAGE_SIZE video ids then
2552                         # we can assume that this page is the last one - there
2553                         # are no more ids on further pages - no need to query
2554                         # again.
2555
2556                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2557                                 break
2558
2559                         pagenum += 1
2560
2561                 all_ids_count = len(video_ids)
2562                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2563                 playlistend = self._downloader.params.get('playlistend', -1)
2564
2565                 if playlistend == -1:
2566                         video_ids = video_ids[playliststart:]
2567                 else:
2568                         video_ids = video_ids[playliststart:playlistend]
2569
2570                 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2571                                 (username, all_ids_count, len(video_ids)))
2572
2573                 for video_id in video_ids:
2574                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2575
2576
2577 class DepositFilesIE(InfoExtractor):
2578         """Information extractor for depositfiles.com"""
2579
2580         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2581         IE_NAME = u'DepositFiles'
2582
2583         def __init__(self, downloader=None):
2584                 InfoExtractor.__init__(self, downloader)
2585
2586         def report_download_webpage(self, file_id):
2587                 """Report webpage download."""
2588                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2589
2590         def report_extraction(self, file_id):
2591                 """Report information extraction."""
2592                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2593
2594         def _real_initialize(self):
2595                 return
2596
2597         def _real_extract(self, url):
2598                 # At this point we have a new file
2599                 self._downloader.increment_downloads()
2600
2601                 file_id = url.split('/')[-1]
2602                 # Rebuild url in english locale
2603                 url = 'http://depositfiles.com/en/files/' + file_id
2604
2605                 # Retrieve file webpage with 'Free download' button pressed
2606                 free_download_indication = { 'gateway_result' : '1' }
2607                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2608                 try:
2609                         self.report_download_webpage(file_id)
2610                         webpage = urllib2.urlopen(request).read()
2611                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2612                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2613                         return
2614
2615                 # Search for the real file URL
2616                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2617                 if (mobj is None) or (mobj.group(1) is None):
2618                         # Try to figure out reason of the error.
2619                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2620                         if (mobj is not None) and (mobj.group(1) is not None):
2621                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2622                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2623                         else:
2624                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2625                         return
2626
2627                 file_url = mobj.group(1)
2628                 file_extension = os.path.splitext(file_url)[1][1:]
2629
2630                 # Search for file title
2631                 mobj = re.search(r'<b title="(.*?)">', webpage)
2632                 if mobj is None:
2633                         self._downloader.trouble(u'ERROR: unable to extract title')
2634                         return
2635                 file_title = mobj.group(1).decode('utf-8')
2636
2637                 try:
2638                         # Process file information
2639                         self._downloader.process_info({
2640                                 'id':           file_id.decode('utf-8'),
2641                                 'url':          file_url.decode('utf-8'),
2642                                 'uploader':     u'NA',
2643                                 'upload_date':  u'NA',
2644                                 'title':        file_title,
2645                                 'stitle':       file_title,
2646                                 'ext':          file_extension.decode('utf-8'),
2647                                 'format':       u'NA',
2648                                 'player_url':   None,
2649                         })
2650                 except UnavailableVideoError, err:
2651                         self._downloader.trouble(u'ERROR: unable to download file')
2652
2653
2654 class FacebookIE(InfoExtractor):
2655         """Information Extractor for Facebook"""
2656
2657         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2658         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2659         _NETRC_MACHINE = 'facebook'
2660         _available_formats = ['highqual', 'lowqual']
2661         _video_extensions = {
2662                 'highqual': 'mp4',
2663                 'lowqual': 'mp4',
2664         }
2665         IE_NAME = u'facebook'
2666
2667         def __init__(self, downloader=None):
2668                 InfoExtractor.__init__(self, downloader)
2669
2670         def _reporter(self, message):
2671                 """Add header and report message."""
2672                 self._downloader.to_screen(u'[facebook] %s' % message)
2673
2674         def report_login(self):
2675                 """Report attempt to log in."""
2676                 self._reporter(u'Logging in')
2677
2678         def report_video_webpage_download(self, video_id):
2679                 """Report attempt to download video webpage."""
2680                 self._reporter(u'%s: Downloading video webpage' % video_id)
2681
2682         def report_information_extraction(self, video_id):
2683                 """Report attempt to extract video information."""
2684                 self._reporter(u'%s: Extracting video information' % video_id)
2685
2686         def _parse_page(self, video_webpage):
2687                 """Extract video information from page"""
2688                 # General data
2689                 data = {'title': r'class="video_title datawrap">(.*?)</',
2690                         'description': r'<div class="datawrap">(.*?)</div>',
2691                         'owner': r'\("video_owner_name", "(.*?)"\)',
2692                         'upload_date': r'data-date="(.*?)"',
2693                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2694                         }
2695                 video_info = {}
2696                 for piece in data.keys():
2697                         mobj = re.search(data[piece], video_webpage)
2698                         if mobj is not None:
2699                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2700
2701                 # Video urls
2702                 video_urls = {}
2703                 for fmt in self._available_formats:
2704                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2705                         if mobj is not None:
2706                                 # URL is in a Javascript segment inside an escaped Unicode format within
2707                                 # the generally utf-8 page
2708                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2709                 video_info['video_urls'] = video_urls
2710
2711                 return video_info
2712
2713         def _real_initialize(self):
2714                 if self._downloader is None:
2715                         return
2716
2717                 useremail = None
2718                 password = None
2719                 downloader_params = self._downloader.params
2720
2721                 # Attempt to use provided username and password or .netrc data
2722                 if downloader_params.get('username', None) is not None:
2723                         useremail = downloader_params['username']
2724                         password = downloader_params['password']
2725                 elif downloader_params.get('usenetrc', False):
2726                         try:
2727                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2728                                 if info is not None:
2729                                         useremail = info[0]
2730                                         password = info[2]
2731                                 else:
2732                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2733                         except (IOError, netrc.NetrcParseError), err:
2734                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2735                                 return
2736
2737                 if useremail is None:
2738                         return
2739
2740                 # Log in
2741                 login_form = {
2742                         'email': useremail,
2743                         'pass': password,
2744                         'login': 'Log+In'
2745                         }
2746                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2747                 try:
2748                         self.report_login()
2749                         login_results = urllib2.urlopen(request).read()
2750                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2751                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2752                                 return
2753                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2754                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2755                         return
2756
2757         def _real_extract(self, url):
2758                 mobj = re.match(self._VALID_URL, url)
2759                 if mobj is None:
2760                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2761                         return
2762                 video_id = mobj.group('ID')
2763
2764                 # Get video webpage
2765                 self.report_video_webpage_download(video_id)
2766                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2767                 try:
2768                         page = urllib2.urlopen(request)
2769                         video_webpage = page.read()
2770                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2771                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2772                         return
2773
2774                 # Start extracting information
2775                 self.report_information_extraction(video_id)
2776
2777                 # Extract information
2778                 video_info = self._parse_page(video_webpage)
2779
2780                 # uploader
2781                 if 'owner' not in video_info:
2782                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2783                         return
2784                 video_uploader = video_info['owner']
2785
2786                 # title
2787                 if 'title' not in video_info:
2788                         self._downloader.trouble(u'ERROR: unable to extract video title')
2789                         return
2790                 video_title = video_info['title']
2791                 video_title = video_title.decode('utf-8')
2792                 video_title = sanitize_title(video_title)
2793
2794                 # simplified title
2795                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2796                 simple_title = simple_title.strip(ur'_')
2797
2798                 # thumbnail image
2799                 if 'thumbnail' not in video_info:
2800                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2801                         video_thumbnail = ''
2802                 else:
2803                         video_thumbnail = video_info['thumbnail']
2804
2805                 # upload date
2806                 upload_date = u'NA'
2807                 if 'upload_date' in video_info:
2808                         upload_time = video_info['upload_date']
2809                         timetuple = email.utils.parsedate_tz(upload_time)
2810                         if timetuple is not None:
2811                                 try:
2812                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2813                                 except:
2814                                         pass
2815
2816                 # description
2817                 video_description = video_info.get('description', 'No description available.')
2818
2819                 url_map = video_info['video_urls']
2820                 if len(url_map.keys()) > 0:
2821                         # Decide which formats to download
2822                         req_format = self._downloader.params.get('format', None)
2823                         format_limit = self._downloader.params.get('format_limit', None)
2824
2825                         if format_limit is not None and format_limit in self._available_formats:
2826                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2827                         else:
2828                                 format_list = self._available_formats
2829                         existing_formats = [x for x in format_list if x in url_map]
2830                         if len(existing_formats) == 0:
2831                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2832                                 return
2833                         if req_format is None:
2834                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2835                         elif req_format == '-1':
2836                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2837                         else:
2838                                 # Specific format
2839                                 if req_format not in url_map:
2840                                         self._downloader.trouble(u'ERROR: requested format not available')
2841                                         return
2842                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2843
2844                 for format_param, video_real_url in video_url_list:
2845
2846                         # At this point we have a new video
2847                         self._downloader.increment_downloads()
2848
2849                         # Extension
2850                         video_extension = self._video_extensions.get(format_param, 'mp4')
2851
2852                         try:
2853                                 # Process video information
2854                                 self._downloader.process_info({
2855                                         'id':           video_id.decode('utf-8'),
2856                                         'url':          video_real_url.decode('utf-8'),
2857                                         'uploader':     video_uploader.decode('utf-8'),
2858                                         'upload_date':  upload_date,
2859                                         'title':        video_title,
2860                                         'stitle':       simple_title,
2861                                         'ext':          video_extension.decode('utf-8'),
2862                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2863                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2864                                         'description':  video_description.decode('utf-8'),
2865                                         'player_url':   None,
2866                                 })
2867                         except UnavailableVideoError, err:
2868                                 self._downloader.trouble(u'\nERROR: unable to download video')
2869
class BlipTVIE(InfoExtractor):
	"""Information extractor for blip.tv"""

	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
	_URL_EXT = r'^.*\.([a-z0-9]+)$'
	IE_NAME = u'blip.tv'

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)

	def _simplify_title(self, title):
		"""Return title reduced to characters from simple_title_chars, joined by '_'."""
		res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
		res = res.strip(ur'_')
		return res

	def _real_extract(self, url):
		"""Fetch blip.tv's JSON view of the page and build the video info dict from it."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Append the JSON-API query parameters with the correct separator
		if '?' in url:
			cchar = '&'
		else:
			cchar = '?'
		json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
		request = urllib2.Request(json_url)
		self.report_extraction(mobj.group(1))
		try:
			json_code = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
			return
		try:
			json_data = json.loads(json_code)
			# Some responses wrap the payload in a 'Post' object, others don't
			if 'Post' in json_data:
				data = json_data['Post']
			else:
				data = json_data

			# 'datestamp' comes back like '08-24-11 10:35AM'; normalize to YYYYMMDD.
			# NOTE(review): the format uses %H with %p -- presumably the API emits
			# 24-hour times with a trailing AM/PM marker; confirm against live data.
			upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
			video_url = data['media']['url']
			# Derive the file extension from the media URL
			umobj = re.match(self._URL_EXT, video_url)
			if umobj is None:
				raise ValueError('Can not determine filename extension')
			ext = umobj.group(1)

			self._downloader.increment_downloads()

			info = {
				'id': data['item_id'],
				'url': video_url,
				'uploader': data['display_name'],
				'upload_date': upload_date,
				'title': data['title'],
				'stitle': self._simplify_title(data['title']),
				'ext': ext,
				'format': data['media']['mimeType'],
				'thumbnail': data['thumbnailUrl'],
				'description': data['description'],
				'player_url': data['embedUrl']
			}
		except (ValueError,KeyError), err:
			# Covers both malformed JSON values and missing schema keys
			self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
			return

		try:
			self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download video')
2942
2943 class MyVideoIE(InfoExtractor):
2944         """Information Extractor for myvideo.de."""
2945
2946         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2947         IE_NAME = u'myvideo'
2948
2949         def __init__(self, downloader=None):
2950                 InfoExtractor.__init__(self, downloader)
2951         
2952         def report_download_webpage(self, video_id):
2953                 """Report webpage download."""
2954                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2955
2956         def report_extraction(self, video_id):
2957                 """Report information extraction."""
2958                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2959
2960         def _real_initialize(self):
2961                 return
2962
2963         def _real_extract(self,url):
2964                 mobj = re.match(self._VALID_URL, url)
2965                 if mobj is None:
2966                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
2967                         return
2968
2969                 video_id = mobj.group(1)
2970                 simple_title = mobj.group(2).decode('utf-8')
2971                 # should actually not be necessary
2972                 simple_title = sanitize_title(simple_title)
2973                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title)
2974
2975                 # Get video webpage
2976                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2977                 try:
2978                         self.report_download_webpage(video_id)
2979                         webpage = urllib2.urlopen(request).read()
2980                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2981                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2982                         return
2983
2984                 self.report_extraction(video_id)
2985                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2986                                  webpage)
2987                 if mobj is None:
2988                         self._downloader.trouble(u'ERROR: unable to extract media URL')
2989                         return
2990                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2991
2992                 mobj = re.search('<title>([^<]+)</title>', webpage)
2993                 if mobj is None:
2994                         self._downloader.trouble(u'ERROR: unable to extract title')
2995                         return
2996
2997                 video_title = mobj.group(1)
2998                 video_title = sanitize_title(video_title)
2999
3000                 try:
3001                         print(video_url)
3002                         self._downloader.process_info({
3003                                 'id':           video_id,
3004                                 'url':          video_url,
3005                                 'uploader':     u'NA',
3006                                 'upload_date':  u'NA',
3007                                 'title':        video_title,
3008                                 'stitle':       simple_title,
3009                                 'ext':          u'flv',
3010                                 'format':       u'NA',
3011                                 'player_url':   None,
3012                         })
3013                 except UnavailableVideoError:
3014                         self._downloader.trouble(u'\nERROR: Unable to download video')
3015
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Accepts either a shortname alias (":tds", ":colbert", ...) or a
	# full-episodes URL on thedailyshow.com / colbertnation.com.
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	def report_extraction(self, episode_id):
		# Status line: starting information extraction.
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

	def report_config_download(self, episode_id):
		# Status line: downloading the per-video mediaGen configuration.
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		# Status line: downloading the show's MRSS episode index.
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		# Status line: resolving the Flash player URL.
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _simplify_title(self, title):
		# Collapse every run of characters outside simple_title_chars to a
		# single '_', then trim leading/trailing underscores.
		res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
		res = res.strip(ur'_')
		return res

	def _real_extract(self, url):
		"""Download every media item referenced by the episode's index feed."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# A shortname alias is rewritten to the show's full-episodes page
		# and re-matched so the named groups are populated.
		if mobj.group('shortname'):
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = 'http://www.thedailyshow.com/full-episodes/'
			else:
				url = 'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# No explicit episode: the site redirects to the newest one.
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			return
		if dlNewest:
			# Follow the redirect target to learn the concrete episode title.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		# The embedded Flash <param> carries both the player URL and the
		# mtvn media URI (second capture group).
		mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
			return

		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			# Resolve redirects so process_info receives the final player URL.
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
			return

		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
			return

		# One <item> per media segment of the episode.
		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
						urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				# NOTE(review): this `return` aborts all remaining segments,
				# whereas the other per-item failures below `continue` —
				# confirm whether that asymmetry is intended.
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
				return

			# Each <rendition> advertises one bitrate/URL pair.
			cdoc = xml.etree.ElementTree.fromstring(configXml)
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			# For now, just pick the highest bitrate
			format,video_url = turls[-1]

			self._downloader.increment_downloads()

			effTitle = showId + '-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'stitle': self._simplify_title(effTitle),
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': playerUrl
			}

			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
				continue
3156
class EscapistIE(InfoExtractor):
	"""Information extractor for The Escapist """

	_VALID_URL = r'^(https?://)?(www\.)escapistmagazine.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?].*$'
	IE_NAME = u'escapist'

	def report_extraction(self, showName):
		# Status line: starting information extraction.
		self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

	def report_config_download(self, showName):
		# Status line: downloading the player configuration.
		self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

	def _simplify_title(self, title):
		# Collapse runs of characters outside simple_title_chars to '_' and
		# trim leading/trailing underscores.
		res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
		res = res.strip(ur'_')
		return res

	def _real_extract(self, url):
		"""Scrape the episode page metadata and queue the video download."""
		# Used to decode HTML entities in the scraped meta tags.
		htmlParser = HTMLParser.HTMLParser()

		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		showName = mobj.group('showname')
		videoId = mobj.group('episode')

		self.report_extraction(showName)
		try:
			webPage = urllib2.urlopen(url).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
			return

		# NOTE(review): each re.search result below is used without a None
		# check; a page-layout change would surface as AttributeError rather
		# than a clean trouble() message.
		descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
		description = htmlParser.unescape(descMatch.group(1))
		imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
		imgUrl = htmlParser.unescape(imgMatch.group(1))
		playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
		playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
		# The player URL embeds the config location in its query string.
		configUrlMatch = re.search('config=(.*)$', playerUrl)
		configUrl = urllib2.unquote(configUrlMatch.group(1))

		self.report_config_download(showName)
		try:
			configJSON = urllib2.urlopen(configUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
			return

		# Technically, it's JavaScript, not JSON
		configJSON = configJSON.replace("'", '"')

		try:
			# NOTE(review): relies on a module-level `json` that is not in
			# the import block at the top of this file — confirm it is
			# imported or defined elsewhere in the file.
			config = json.loads(configJSON)
		except (ValueError,), err:
			self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
			return

		playlist = config['playlist']
		# Entry [1] is taken as the actual episode; entry [0] is presumably
		# a lead-in/preroll — TODO confirm against a live config.
		videoUrl = playlist[1]['url']

		self._downloader.increment_downloads()
		info = {
			'id': videoId,
			'url': videoUrl,
			'uploader': showName,
			'upload_date': None,
			'title': showName,
			'stitle': self._simplify_title(showName),
			'ext': 'flv',
			'format': 'flv',
			'thumbnail': imgUrl,
			'description': description,
			'player_url': playerUrl,
		}

		try:
			self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3238
3239
3240
class PostProcessor(object):
	"""Base class for postprocessing steps.

	Instances are registered on a FileDownloader through its
	add_post_processor() method (a "mutual registration" scheme like the
	one InfoExtractor uses). After each successful download, the
	downloader feeds an information dictionary — the InfoExtractor dict
	extended with a 'filepath' key naming the downloaded file — through
	its chain of processors, handing each run() result to the next one.

	The chain stops when a processor's run() returns None or when the
	last processor has run. A processor may also raise
	PostProcessingError, which the calling downloader handles.
	"""

	# Downloader this processor reports through; set at construction or
	# via set_downloader().
	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach the FileDownloader this processor belongs to."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file.

		`information` is an InfoExtractor-style dictionary with an extra
		'filepath' entry pointing at the downloaded file. Returning a
		(possibly modified) dictionary continues the chain; returning
		None stops it. The default implementation passes the dictionary
		through unchanged.
		"""
		return information
3286
3287
class FFmpegExtractAudioPP(PostProcessor):
	"""Post processor that replaces a downloaded video with its audio track.

	Requires the external ffmpeg and ffprobe programs. preferredcodec may
	be 'best' (keep the source codec when it is aac or mp3, otherwise
	transcode to mp3), 'aac' or 'mp3'.
	"""

	def __init__(self, downloader=None, preferredcodec=None):
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec

	@staticmethod
	def get_audio_codec(path):
		"""Return the audio codec name of the file at path, or None.

		Runs `ffprobe -show_streams` and scans its output; returns None
		when ffprobe is missing, exits non-zero, or reports no audio
		stream.
		"""
		try:
			cmd = ['ffprobe', '-show_streams', '--', path]
			handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
			output = handle.communicate()[0]
			if handle.wait() != 0:
				return None
		except (IOError, OSError):
			return None
		audio_codec = None
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				# codec_name precedes codec_type inside a stream block, so
				# the last codec_name seen belongs to this audio stream.
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Transcode path into out_path with ffmpeg; return True on success."""
		try:
			cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
			ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
			return (ret == 0)
		except (IOError, OSError):
			return False

	def run(self, information):
		"""Extract audio from information['filepath'].

		Writes the audio next to the source file (same name, new
		extension), propagates the 'filetime' timestamp when present,
		removes the original video, and returns the updated information
		dict. Returns None (stopping the PP chain) on any failure.
		"""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
			if filecodec == 'aac' or filecodec == 'mp3':
				# Lossless if possible: copy the stream instead of re-encoding.
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					# Raw AAC needs the ADTS container to be playable.
					more_opts = ['-f', 'adts']
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = ['-ab', '128k']
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = ['-ab', '128k']
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']

		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		status = self.run_ffmpeg(path, new_path, acodec, more_opts)

		if not status:
			self._downloader.to_stderr(u'WARNING: error running ffmpeg')
			return None

		# Try to update the date time for extracted audio file.
		if information.get('filetime') is not None:
			try:
				os.utime(new_path, (time.time(), information['filetime']))
			except (IOError, OSError):
				# Bug fix: this was a bare `except:`, which also swallowed
				# KeyboardInterrupt/SystemExit. Only filesystem errors are
				# expected here, and the timestamp update is best-effort.
				self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

		try:
			os.remove(path)
		except (IOError, OSError):
			self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
			return None

		information['filepath'] = new_path
		return information
3376
3377
def updateSelf(downloader, filename):
	"""Overwrite this program file with the latest version from UPDATE_URL.

	`downloader` is used only for screen output. Exits the process with a
	message when the file is not writable, the download fails, or the
	file cannot be rewritten.
	"""
	if not os.access(filename, os.W_OK):
		sys.exit('ERROR: no write permissions on %s' % filename)

	downloader.to_screen('Updating to latest version...')

	try:
		# Bug fix: urlopen used to sit inside the try/finally, so a failed
		# open made the finally clause reference an unbound `urlh` and
		# raise NameError — masking the real error and skipping the clean
		# sys.exit below. Open first, then guard only the read.
		urlh = urllib.urlopen(UPDATE_URL)
		try:
			newcontent = urlh.read()
		finally:
			urlh.close()
	except (IOError, OSError):
		sys.exit('ERROR: unable to download latest version')

	try:
		outf = open(filename, 'wb')
		try:
			outf.write(newcontent)
		finally:
			outf.close()
	except (IOError, OSError):
		sys.exit('ERROR: unable to overwrite current version')

	downloader.to_screen('Updated youtube-dl. Restart to use the new version.')
3405
def parseOpts():
	"""Build the optparse command-line parser and parse sys.argv.

	Returns (parser, opts, args) where args is the list of URLs left
	after option parsing.
	"""
	# Deferred imports
	import getpass
	import optparse

	def _format_option_string(option):
		''' ('-o', '--option') -> -o, --format METAVAR'''

		opts = []

		if option._short_opts: opts.append(option._short_opts[0])
		if option._long_opts: opts.append(option._long_opts[0])
		if len(opts) > 1: opts.insert(1, ', ')

		if option.takes_value(): opts.append(' %s' % option.metavar)

		return "".join(opts)

	def _find_term_columns():
		# Terminal width: COLUMNS env var first, then `stty size`, else None.
		columns = os.environ.get('COLUMNS', None)
		if columns:
			return int(columns)

		try:
			sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			out,err = sp.communicate()
			return int(out.split()[1])
		except:
			# NOTE(review): bare except (best-effort probe) also swallows
			# KeyboardInterrupt — consider narrowing.
			pass
		return None

	max_width = 80
	max_help_position = 80

	# No need to wrap help messages if we're on a wide console
	columns = _find_term_columns()
	if columns: max_width = columns

	fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
	fmt.format_option_strings = _format_option_string

	kw = {
		'version'   : __version__,
		'formatter' : fmt,
		'usage' : '%prog [options] url [url...]',
		'conflict_handler' : 'resolve',
	}

	parser = optparse.OptionParser(**kw)

	# option groups
	general        = optparse.OptionGroup(parser, 'General Options')
	selection      = optparse.OptionGroup(parser, 'Video Selection')
	authentication = optparse.OptionGroup(parser, 'Authentication Options')
	video_format   = optparse.OptionGroup(parser, 'Video Format Options')
	postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
	filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
	verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

	general.add_option('-h', '--help',
			action='help', help='print this help text and exit')
	general.add_option('-v', '--version',
			action='version', help='print program version and exit')
	general.add_option('-U', '--update',
			action='store_true', dest='update_self', help='update this program to latest version')
	general.add_option('-i', '--ignore-errors',
			action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
	general.add_option('-r', '--rate-limit',
			dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
	general.add_option('-R', '--retries',
			dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
	general.add_option('--dump-user-agent',
			action='store_true', dest='dump_user_agent',
			help='display the current browser identification', default=False)
	general.add_option('--list-extractors',
			action='store_true', dest='list_extractors',
			help='List all supported extractors and the URLs they would handle', default=False)

	selection.add_option('--playlist-start',
			dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
	selection.add_option('--playlist-end',
			dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
	selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
	selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')

	authentication.add_option('-u', '--username',
			dest='username', metavar='USERNAME', help='account username')
	authentication.add_option('-p', '--password',
			dest='password', metavar='PASSWORD', help='account password')
	authentication.add_option('-n', '--netrc',
			action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)


	video_format.add_option('-f', '--format',
			action='store', dest='format', metavar='FORMAT', help='video format code')
	video_format.add_option('--all-formats',
			action='store_const', dest='format', help='download all available video formats', const='-1')
	video_format.add_option('--max-quality',
			action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')


	verbosity.add_option('-q', '--quiet',
			action='store_true', dest='quiet', help='activates quiet mode', default=False)
	verbosity.add_option('-s', '--simulate',
			action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
	verbosity.add_option('--skip-download',
			action='store_true', dest='skip_download', help='do not download the video', default=False)
	verbosity.add_option('-g', '--get-url',
			action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
	verbosity.add_option('-e', '--get-title',
			action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
	verbosity.add_option('--get-thumbnail',
			action='store_true', dest='getthumbnail',
			help='simulate, quiet but print thumbnail URL', default=False)
	verbosity.add_option('--get-description',
			action='store_true', dest='getdescription',
			help='simulate, quiet but print video description', default=False)
	verbosity.add_option('--get-filename',
			action='store_true', dest='getfilename',
			help='simulate, quiet but print output filename', default=False)
	verbosity.add_option('--no-progress',
			action='store_true', dest='noprogress', help='do not print progress bar', default=False)
	verbosity.add_option('--console-title',
			action='store_true', dest='consoletitle',
			help='display progress in console titlebar', default=False)


	filesystem.add_option('-t', '--title',
			action='store_true', dest='usetitle', help='use title in file name', default=False)
	filesystem.add_option('-l', '--literal',
			action='store_true', dest='useliteral', help='use literal title in file name', default=False)
	filesystem.add_option('-A', '--auto-number',
			action='store_true', dest='autonumber',
			help='number downloaded files starting from 00000', default=False)
	filesystem.add_option('-o', '--output',
			dest='outtmpl', metavar='TEMPLATE', help='output filename template')
	filesystem.add_option('-a', '--batch-file',
			dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
	filesystem.add_option('-w', '--no-overwrites',
			action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
	filesystem.add_option('-c', '--continue',
			action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
	filesystem.add_option('--cookies',
			dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
	filesystem.add_option('--no-part',
			action='store_true', dest='nopart', help='do not use .part files', default=False)
	filesystem.add_option('--no-mtime',
			action='store_false', dest='updatetime',
			help='do not use the Last-modified header to set the file modification time', default=True)
	filesystem.add_option('--write-description',
			action='store_true', dest='writedescription',
			help='write video description to a .description file', default=False)
	filesystem.add_option('--write-info-json',
			action='store_true', dest='writeinfojson',
			help='write video metadata to a .info.json file', default=False)


	postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
			help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
	postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
			help='"best", "aac" or "mp3"; best by default')


	# Group registration order determines the --help layout.
	parser.add_option_group(general)
	parser.add_option_group(selection)
	parser.add_option_group(filesystem)
	parser.add_option_group(verbosity)
	parser.add_option_group(video_format)
	parser.add_option_group(authentication)
	parser.add_option_group(postproc)

	opts, args = parser.parse_args()

	return parser, opts, args
3580
def gen_extractors():
	"""Instantiate every supported info extractor, in matching order.

	The order is significant: a URL is handled by the first extractor
	whose suitable() check accepts it, so the catch-all GenericIE is
	kept at the very end as the fallback.
	"""
	yt = YoutubeIE()
	google = GoogleIE()
	yahoo = YahooIE()

	extractors = [
		yt,
		MetacafeIE(yt),
		DailymotionIE(),
		YoutubePlaylistIE(yt),
		YoutubeUserIE(yt),
		YoutubeSearchIE(yt),
		google,
		GoogleSearchIE(google),
		PhotobucketIE(),
		yahoo,
		YahooSearchIE(yahoo),
		DepositFilesIE(),
		FacebookIE(),
		BlipTVIE(),
		VimeoIE(),
		MyVideoIE(),
		ComedyCentralIE(),
		EscapistIE(),
	]
	# Fallback extractor: must come last so it only matches what nothing
	# else claimed.
	extractors.append(GenericIE())
	return extractors
3610
3611 def main():
3612         parser, opts, args = parseOpts()
3613
3614         # Open appropriate CookieJar
3615         if opts.cookiefile is None:
3616                 jar = cookielib.CookieJar()
3617         else:
3618                 try:
3619                         jar = cookielib.MozillaCookieJar(opts.cookiefile)
3620                         if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3621                                 jar.load()
3622                 except (IOError, OSError), err:
3623                         sys.exit(u'ERROR: unable to open cookie file')
3624
3625         # Dump user agent
3626         if opts.dump_user_agent:
3627                 print std_headers['User-Agent']
3628                 sys.exit(0)
3629
3630         # Batch file verification
3631         batchurls = []
3632         if opts.batchfile is not None:
3633                 try:
3634                         if opts.batchfile == '-':
3635                                 batchfd = sys.stdin
3636                         else:
3637                                 batchfd = open(opts.batchfile, 'r')
3638                         batchurls = batchfd.readlines()
3639                         batchurls = [x.strip() for x in batchurls]
3640                         batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3641                 except IOError:
3642                         sys.exit(u'ERROR: batch file could not be read')
3643         all_urls = batchurls + args
3644
3645         # General configuration
3646         cookie_processor = urllib2.HTTPCookieProcessor(jar)
3647         opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
3648         urllib2.install_opener(opener)
3649         socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3650
3651         extractors = gen_extractors()
3652
3653         if opts.list_extractors:
3654                 for ie in extractors:
3655                         print(ie.IE_NAME)
3656                         matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
3657                         all_urls = filter(lambda url: url not in matchedUrls, all_urls)
3658                         for mu in matchedUrls:
3659                                 print(u'  ' + mu)
3660                 sys.exit(0)
3661
3662         # Conflicting, missing and erroneous options
3663         if opts.usenetrc and (opts.username is not None or opts.password is not None):
3664                 parser.error(u'using .netrc conflicts with giving username/password')
3665         if opts.password is not None and opts.username is None:
3666                 parser.error(u'account username missing')
3667         if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3668                 parser.error(u'using output template conflicts with using title, literal title or auto number')
3669         if opts.usetitle and opts.useliteral:
3670                 parser.error(u'using title conflicts with using literal title')
3671         if opts.username is not None and opts.password is None:
3672                 opts.password = getpass.getpass(u'Type account password and press return:')
3673         if opts.ratelimit is not None:
3674                 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3675                 if numeric_limit is None:
3676                         parser.error(u'invalid rate limit specified')
3677                 opts.ratelimit = numeric_limit
3678         if opts.retries is not None:
3679                 try:
3680                         opts.retries = long(opts.retries)
3681                 except (TypeError, ValueError), err:
3682                         parser.error(u'invalid retry count specified')
3683         try:
3684                 opts.playliststart = int(opts.playliststart)
3685                 if opts.playliststart <= 0:
3686                         raise ValueError(u'Playlist start must be positive')
3687         except (TypeError, ValueError), err:
3688                 parser.error(u'invalid playlist start number specified')
3689         try:
3690                 opts.playlistend = int(opts.playlistend)
3691                 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3692                         raise ValueError(u'Playlist end must be greater than playlist start')
3693         except (TypeError, ValueError), err:
3694                 parser.error(u'invalid playlist end number specified')
3695         if opts.extractaudio:
3696                 if opts.audioformat not in ['best', 'aac', 'mp3']:
3697                         parser.error(u'invalid audio format specified')
3698
3699         # File downloader
3700         fd = FileDownloader({
3701                 'usenetrc': opts.usenetrc,
3702                 'username': opts.username,
3703                 'password': opts.password,
3704                 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3705                 'forceurl': opts.geturl,
3706                 'forcetitle': opts.gettitle,
3707                 'forcethumbnail': opts.getthumbnail,
3708                 'forcedescription': opts.getdescription,
3709                 'forcefilename': opts.getfilename,
3710                 'simulate': opts.simulate,
3711                 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3712                 'format': opts.format,
3713                 'format_limit': opts.format_limit,
3714                 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3715                         or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3716                         or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3717                         or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3718                         or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3719                         or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3720                         or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3721                         or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3722                         or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3723                         or u'%(id)s.%(ext)s'),
3724                 'ignoreerrors': opts.ignoreerrors,
3725                 'ratelimit': opts.ratelimit,
3726                 'nooverwrites': opts.nooverwrites,
3727                 'retries': opts.retries,
3728                 'continuedl': opts.continue_dl,
3729                 'noprogress': opts.noprogress,
3730                 'playliststart': opts.playliststart,
3731                 'playlistend': opts.playlistend,
3732                 'logtostderr': opts.outtmpl == '-',
3733                 'consoletitle': opts.consoletitle,
3734                 'nopart': opts.nopart,
3735                 'updatetime': opts.updatetime,
3736                 'writedescription': opts.writedescription,
3737                 'writeinfojson': opts.writeinfojson,
3738                 'matchtitle': opts.matchtitle,
3739                 'rejecttitle': opts.rejecttitle,
3740                 })
3741         for extractor in extractors:
3742                 fd.add_info_extractor(extractor)
3743
3744         # PostProcessors
3745         if opts.extractaudio:
3746                 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
3747
3748         # Update version
3749         if opts.update_self:
3750                 updateSelf(fd, sys.argv[0])
3751
3752         # Maybe do nothing
3753         if len(all_urls) < 1:
3754                 if not opts.update_self:
3755                         parser.error(u'you must provide at least one URL')
3756                 else:
3757                         sys.exit()
3758         retcode = fd.download(all_urls)
3759
3760         # Dump cookie jar if requested
3761         if opts.cookiefile is not None:
3762                 try:
3763                         jar.save()
3764                 except (IOError, OSError), err:
3765                         sys.exit(u'ERROR: unable to save cookie jar')
3766
3767         sys.exit(retcode)
3768
3769
# Script entry point: run main() and translate the download-level
# exceptions defined earlier in this file into process exit statuses.
if __name__ == '__main__':
	try:
		main()
	except DownloadError:
		# Presumably the error details were already reported where the
		# exception was raised; exit with a plain failure status.
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		# Ctrl-C: leading newline keeps the message off the progress line.
		sys.exit(u'\nERROR: Interrupted by user')

# vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: