Added ability to download worst quality video file only. (Closes #113)
[youtube-dl.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
# People who have contributed code to this program.
__author__  = (
        'Ricardo Garcia Gonzalez',
        'Danny Colligan',
        'Benjamin Johnson',
        'Vasyl\' Vavrychuk',
        'Witold Baryluk',
        'Paweł Paprota',
        'Gergely Imreh',
        'Rogério Brito',
        'Philipp Hagemeister',
        'Sören Schulze',
        )

__license__ = 'Public Domain'
# Date-based version string (YYYY.MM.DD).
__version__ = '2011.09.15'

# Canonical location of the newest version of this script.
UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
21
22 import cookielib
23 import datetime
24 import gzip
25 import htmlentitydefs
26 import HTMLParser
27 import httplib
28 import locale
29 import math
30 import netrc
31 import os
32 import os.path
33 import re
34 import socket
35 import string
36 import subprocess
37 import sys
38 import time
39 import urllib
40 import urllib2
41 import warnings
42 import zlib
43
44 if os.name == 'nt':
45         import ctypes
46
47 try:
48         import email.utils
49 except ImportError: # Python 2.4
50         import email.Utils
51 try:
52         import cStringIO as StringIO
53 except ImportError:
54         import StringIO
55
56 # parse_qs was moved from the cgi module to the urlparse module recently.
57 try:
58         from urlparse import parse_qs
59 except ImportError:
60         from cgi import parse_qs
61
62 try:
63         import lxml.etree
64 except ImportError:
65         pass # Handled below
66
67 try:
68         import xml.etree.ElementTree
69 except ImportError: # Python<2.5: Not officially supported, but let it slip
70         warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
71
# Default HTTP headers attached to every request (see YoutubeDLHandler);
# they mimic a desktop Firefox browser so sites serve the same pages they
# would to a real user.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-us,en;q=0.5',
}

# Characters considered "simple" for title sanitizing: ASCII letters and
# digits, as a unicode string.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
81
# The json module is only available from Python 2.6 on; on older
# interpreters fall back to trivialjson, a minimal pure-Python JSON parser
# that only provides the json.loads() entry point used by this program.
try:
        import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
        import re
        class json(object):
                @staticmethod
                def loads(s):
                        # Decode to unicode once; every helper below indexes into s.
                        # Each parse* helper takes an index i and returns the tuple
                        # (index just past the value, parsed value).
                        s = s.decode('UTF-8')
                        def raiseError(msg, i):
                                # Uniform error reporting with position and remaining input.
                                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
                        def skipSpace(i, expectMore=True):
                                # Advance i past JSON whitespace; optionally require
                                # that more input follows.
                                while i < len(s) and s[i] in ' \t\r\n':
                                        i += 1
                                if expectMore:
                                        if i >= len(s):
                                                raiseError('Premature end', i)
                                return i
                        def decodeEscape(match):
                                # Translate one backslash escape sequence (including
                                # \uXXXX and surrogate pairs) to its character.
                                esc = match.group(1)
                                _STATIC = {
                                        '"': '"',
                                        '\\': '\\',
                                        '/': '/',
                                        'b': unichr(0x8),
                                        'f': unichr(0xc),
                                        'n': '\n',
                                        'r': '\r',
                                        't': '\t',
                                }
                                if esc in _STATIC:
                                        return _STATIC[esc]
                                if esc[0] == 'u':
                                        if len(esc) == 1+4:
                                                return unichr(int(esc[1:5], 16))
                                        if len(esc) == 5+6 and esc[5:7] == '\\u':
                                                # Surrogate pair: combine high and low halves
                                                # into one code point.
                                                hi = int(esc[1:5], 16)
                                                low = int(esc[7:11], 16)
                                                return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                                raise ValueError('Unknown escape ' + str(esc))
                        def parseString(i):
                                # i points at the opening quote; find the closing quote,
                                # skipping over backslash-escaped ones.
                                i += 1
                                e = i
                                while True:
                                        e = s.index('"', e)
                                        bslashes = 0
                                        while s[e-bslashes-1] == '\\':
                                                bslashes += 1
                                        if bslashes % 2 == 1:
                                                # An odd number of backslashes means the quote
                                                # itself is escaped; keep searching.
                                                e += 1
                                                continue
                                        break
                                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                                stri = rexp.sub(decodeEscape, s[i:e])
                                return (e+1,stri)
                        def parseObj(i):
                                # i points at '{'.
                                i += 1
                                res = {}
                                i = skipSpace(i)
                                if s[i] == '}': # Empty dictionary
                                        return (i+1,res)
                                while True:
                                        if s[i] != '"':
                                                raiseError('Expected a string object key', i)
                                        i,key = parseString(i)
                                        i = skipSpace(i)
                                        if i >= len(s) or s[i] != ':':
                                                raiseError('Expected a colon', i)
                                        i,val = parse(i+1)
                                        res[key] = val
                                        i = skipSpace(i)
                                        if s[i] == '}':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected comma or closing curly brace', i)
                                        i = skipSpace(i+1)
                        def parseArray(i):
                                # i points at '['.
                                res = []
                                i = skipSpace(i+1)
                                if s[i] == ']': # Empty array
                                        return (i+1,res)
                                while True:
                                        i,val = parse(i)
                                        res.append(val)
                                        i = skipSpace(i) # Raise exception if premature end
                                        if s[i] == ']':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected a comma or closing bracket', i)
                                        i = skipSpace(i+1)
                        def parseDiscrete(i):
                                # The three bare literals: true, false, null.
                                for k,v in {'true': True, 'false': False, 'null': None}.items():
                                        if s.startswith(k, i):
                                                return (i+len(k), v)
                                raiseError('Not a boolean (or null)', i)
                        def parseNumber(i):
                                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                                if mobj is None:
                                        raiseError('Not a number', i)
                                nums = mobj.group(1)
                                # A fractional part or an exponent makes it a float.
                                if '.' in nums or 'e' in nums or 'E' in nums:
                                        return (i+len(nums), float(nums))
                                return (i+len(nums), int(nums))
                        # Dispatch on a value's first character; anything not listed
                        # here must be a number.
                        CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
                        def parse(i):
                                i = skipSpace(i)
                                i,res = CHARMAP.get(s[i], parseNumber)(i)
                                i = skipSpace(i, False)
                                return (i,res)
                        i,res = parse(0)
                        if i < len(s):
                                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
                        return res
194
def preferredencoding():
        """Get preferred encoding.

        Returns the best encoding scheme for the system, based on
        locale.getpreferredencoding() and some further tweaks.
        """
        # The original wrapped this in a single-use generator
        # (yield_preferredencoding().next()) for no benefit; a plain
        # try/except computes the same value.
        try:
                pref = locale.getpreferredencoding()
                # Verify the reported codec actually exists and can encode
                # text; otherwise fall back to UTF-8.
                u'TEST'.encode(pref)
        except Exception:
                pref = 'UTF-8'
        return pref
210
211
def htmlentity_transform(matchobj):
        """Transforms an HTML entity to a Unicode character.

        This function receives a match object and is intended to be used with
        the re.sub() function.
        """
        entity = matchobj.group(1)

        # Known non-numeric HTML entity
        if entity in htmlentitydefs.name2codepoint:
                return unichr(htmlentitydefs.name2codepoint[entity])

        # Numeric character reference: decimal (&#160;) or hexadecimal
        # (&#xA0;). The previous pattern '#(x?\d+)' rejected the hex digits
        # a-f, so hexadecimal entities were returned literally instead of
        # being decoded.
        mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
        if mobj is not None:
                numstr = mobj.group(1)
                if numstr.startswith(u'x'):
                        base = 16
                        # Prepend '0' so the string reads '0x...' for long().
                        numstr = u'0%s' % numstr
                else:
                        base = 10
                return unichr(long(numstr, base))

        # Unknown entity in name, return its literal representation
        return (u'&%s;' % entity)
237
238
def sanitize_title(utitle):
        """Sanitizes a video title so it could be used as part of a filename."""
        # Resolve HTML entities first, then neutralize the path separator so
        # the title cannot escape into other directories.
        utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
        return utitle.replace(unicode(os.sep), u'%')
243
244
def sanitize_open(filename, open_mode):
        """Try to open the given filename, and slightly tweak it if this fails.

        Attempts to open the given filename. If this fails, it tries to change
        the filename slightly, step by step, until it's either able to open it
        or it fails and raises a final exception, like the standard open()
        function.

        It returns the tuple (stream, definitive_file_name).
        """
        try:
                if filename == u'-':
                        # '-' means standard output; on Windows switch stdout to
                        # binary mode so video data is not newline-mangled.
                        if sys.platform == 'win32':
                                import msvcrt
                                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
                        return (sys.stdout, filename)
                stream = open(filename, open_mode)
                return (stream, filename)
        except (IOError, OSError), err:
                # In case of error, try to remove win32 forbidden chars
                filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

                # An exception here should be caught in the caller
                stream = open(filename, open_mode)
                return (stream, filename)
270
271
def timeconvert(timestr):
        """Convert RFC 2822 defined time string into system timestamp.

        Returns None when the string cannot be parsed as an RFC 2822 date.
        """
        parsed = email.utils.parsedate_tz(timestr)
        if parsed is None:
                return None
        return email.utils.mktime_tz(parsed)
279
280
class DownloadError(Exception):
        """Download Error exception.

        This exception may be thrown by FileDownloader objects if they are not
        configured to continue on errors. They will contain the appropriate
        error message (see FileDownloader.trouble()).
        """
        pass
289
290
class SameFileError(Exception):
        """Same File exception.

        This exception will be thrown by FileDownloader objects if they detect
        multiple files would have to be downloaded to the same file on disk.
        """
        pass
298
299
class PostProcessingError(Exception):
        """Post Processing exception.

        This exception may be raised by PostProcessor's .run() method to
        indicate an error in the postprocessing task.
        """
        pass
307
308
class UnavailableVideoError(Exception):
        """Unavailable video format exception.

        This exception will be thrown when a video is requested
        in a format that is not available for that video.
        """
        pass
316
317
class ContentTooShortError(Exception):
        """Content Too Short exception.

        Raised by FileDownloader objects when a download delivers fewer bytes
        than the server initially announced, which usually indicates that the
        connection was interrupted.
        """
        # Byte counts describing the mismatch (both in bytes).
        downloaded = None
        expected = None

        def __init__(self, downloaded, expected):
                # Record how many bytes arrived versus how many were promised.
                self.downloaded = downloaded
                self.expected = expected
333
class YoutubeDLHandler(urllib2.HTTPHandler):
        """Handler for HTTP requests and responses.

        This class, when installed with an OpenerDirector, automatically adds
        the standard headers to every HTTP request and handles gzipped and
        deflated responses from web servers. If compression is to be avoided in
        a particular request, the original request in the program code only has
        to include the HTTP header "Youtubedl-No-Compression", which will be
        removed before making the real request.

        Part of this code was copied from:

        http://techknack.net/python-urllib2-handlers/

        Andrew Rowls, the author of that code, agreed to release it to the
        public domain.
        """

        @staticmethod
        def deflate(data):
                # Some servers send raw deflate streams (no zlib header): try
                # that first, then fall back to a regular zlib stream.
                try:
                        return zlib.decompress(data, -zlib.MAX_WBITS)
                except zlib.error:
                        return zlib.decompress(data)

        @staticmethod
        def addinfourl_wrapper(stream, headers, url, code):
                # Newer Pythons accept the response code as a constructor
                # argument (detected via the getcode attribute); on older ones
                # set the .code attribute by hand.
                if hasattr(urllib2.addinfourl, 'getcode'):
                        return urllib2.addinfourl(stream, headers, url, code)
                ret = urllib2.addinfourl(stream, headers, url)
                ret.code = code
                return ret

        def http_request(self, req):
                # Install the standard headers, replacing any caller-supplied
                # duplicates so std_headers always wins.
                for h in std_headers:
                        if h in req.headers:
                                del req.headers[h]
                        req.add_header(h, std_headers[h])
                # Honor the internal no-compression marker: drop the
                # Accept-encoding header and remove the marker itself before
                # the request goes on the wire.
                if 'Youtubedl-no-compression' in req.headers:
                        if 'Accept-encoding' in req.headers:
                                del req.headers['Accept-encoding']
                        del req.headers['Youtubedl-no-compression']
                return req

        def http_response(self, req, resp):
                # Transparently decompress the body while preserving the
                # original response's headers, URL, code and msg.
                old_resp = resp
                # gzip
                if resp.headers.get('Content-encoding', '') == 'gzip':
                        gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
                        resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
                        resp.msg = old_resp.msg
                # deflate
                if resp.headers.get('Content-encoding', '') == 'deflate':
                        gz = StringIO.StringIO(self.deflate(resp.read()))
                        resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
                        resp.msg = old_resp.msg
                return resp
391
392
393 class FileDownloader(object):
394         """File Downloader class.
395
396         File downloader objects are the ones responsible of downloading the
397         actual video file and writing it to disk if the user has requested
398         it, among some other tasks. In most cases there should be one per
399         program. As, given a video URL, the downloader doesn't know how to
400         extract all the needed information, task that InfoExtractors do, it
401         has to pass the URL to one of them.
402
403         For this, file downloader objects have a method that allows
404         InfoExtractors to be registered in a given order. When it is passed
405         a URL, the file downloader handles it to the first InfoExtractor it
406         finds that reports being able to handle it. The InfoExtractor extracts
407         all the information about the video or videos the URL refers to, and
408         asks the FileDownloader to process the video information, possibly
409         downloading the video.
410
411         File downloaders accept a lot of parameters. In order not to saturate
412         the object constructor with arguments, it receives a dictionary of
413         options instead. These options are available through the params
414         attribute for the InfoExtractors to use. The FileDownloader also
415         registers itself as the downloader in charge for the InfoExtractors
416         that are added to it, so this is a "mutual registration".
417
418         Available options:
419
420         username:         Username for authentication purposes.
421         password:         Password for authentication purposes.
422         usenetrc:         Use netrc for authentication instead.
423         quiet:            Do not print messages to stdout.
424         forceurl:         Force printing final URL.
425         forcetitle:       Force printing title.
426         forcethumbnail:   Force printing thumbnail URL.
427         forcedescription: Force printing description.
428         forcefilename:    Force printing final filename.
429         simulate:         Do not download the video files.
430         format:           Video format code.
431         format_limit:     Highest quality format to try.
432         outtmpl:          Template for output names.
433         ignoreerrors:     Do not stop on download errors.
434         ratelimit:        Download speed limit, in bytes/sec.
435         nooverwrites:     Prevent overwriting files.
436         retries:          Number of times to retry for HTTP error 5xx
437         continuedl:       Try to continue downloads if possible.
438         noprogress:       Do not print the progress bar.
439         playliststart:    Playlist item to start at.
440         playlistend:      Playlist item to end at.
441         matchtitle:       Download only matching titles.
442         rejecttitle:      Reject downloads for matching titles.
443         logtostderr:      Log messages to stderr instead of stdout.
444         consoletitle:     Display progress in console window's titlebar.
445         nopart:           Do not use temporary .part files.
446         updatetime:       Use the Last-modified header to set output file timestamps.
447         writedescription: Write the video description to a .description file
448         writeinfojson:    Write the video description to a .info.json file
449         """
450
        # Class-level defaults so these attributes always exist; every
        # instance gets fresh values in __init__ (which rebinds the lists, so
        # the mutable defaults here are never shared between instances).
        params = None
        _ies = []
        _pps = []
        _download_retcode = None
        _num_downloads = None
        _screen_file = None
457
        def __init__(self, params):
                """Create a FileDownloader object with the given options.

                params is the options dictionary described in the class
                docstring; it is stored as-is in self.params.
                """
                self._ies = []
                self._pps = []
                self._download_retcode = 0
                self._num_downloads = 0
                # Route screen output to stderr when 'logtostderr' is set
                # (boolean indexes the two-element list).
                self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
                self.params = params
466
        @staticmethod
        def format_bytes(bytes):
                """Return a human-readable string (e.g. '1.23M') for a byte count."""
                if bytes is None:
                        return 'N/A'
                if type(bytes) is str:
                        bytes = float(bytes)
                if bytes == 0.0:
                        # math.log(0) is undefined; zero bytes is simply '0.00b'.
                        exponent = 0
                else:
                        exponent = long(math.log(bytes, 1024.0))
                # Suffix table indexed by the 1024-based exponent.
                suffix = 'bkMGTPEZY'[exponent]
                converted = float(bytes) / float(1024 ** exponent)
                return '%.2f%s' % (converted, suffix)
480
481         @staticmethod
482         def calc_percent(byte_counter, data_len):
483                 if data_len is None:
484                         return '---.-%'
485                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
486
487         @staticmethod
488         def calc_eta(start, now, total, current):
489                 if total is None:
490                         return '--:--'
491                 dif = now - start
492                 if current == 0 or dif < 0.001: # One millisecond
493                         return '--:--'
494                 rate = float(current) / dif
495                 eta = long((float(total) - float(current)) / rate)
496                 (eta_mins, eta_secs) = divmod(eta, 60)
497                 if eta_mins > 99:
498                         return '--:--'
499                 return '%02d:%02d' % (eta_mins, eta_secs)
500
501         @staticmethod
502         def calc_speed(start, now, bytes):
503                 dif = now - start
504                 if bytes == 0 or dif < 0.001: # One millisecond
505                         return '%10s' % '---b/s'
506                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
507
508         @staticmethod
509         def best_block_size(elapsed_time, bytes):
510                 new_min = max(bytes / 2.0, 1.0)
511                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
512                 if elapsed_time < 0.001:
513                         return long(new_max)
514                 rate = bytes / elapsed_time
515                 if rate > new_max:
516                         return long(new_max)
517                 if rate < new_min:
518                         return long(new_min)
519                 return long(rate)
520
        @staticmethod
        def parse_bytes(bytestr):
                """Parse a string indicating a byte quantity into a long integer.

                Accepts an optional decimal fraction and an optional one-letter
                binary suffix (k, M, G, ..., case-insensitive); returns None
                when the string does not match.
                """
                matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
                if matchobj is None:
                        return None
                number = float(matchobj.group(1))
                # An empty suffix indexes at position 0, i.e. multiplier 1.
                multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
                return long(round(number * multiplier))
530
        def add_info_extractor(self, ie):
                """Add an InfoExtractor object to the end of the list."""
                self._ies.append(ie)
                # Mutual registration: the extractor also learns its downloader.
                ie.set_downloader(self)
535
        def add_post_processor(self, pp):
                """Add a PostProcessor object to the end of the chain."""
                self._pps.append(pp)
                # Mutual registration: the postprocessor also learns its downloader.
                pp.set_downloader(self)
540
        def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
                """Print message to stdout if not in quiet mode.

                skip_eol suppresses the trailing newline (used by the progress
                bar); encoding errors are re-raised unless
                ignore_encoding_errors is set.
                """
                try:
                        if not self.params.get('quiet', False):
                                terminator = [u'\n', u''][skip_eol]
                                # Trailing comma suppresses print's own newline; the
                                # terminator above decides whether one is emitted.
                                print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
                        self._screen_file.flush()
                except (UnicodeEncodeError), err:
                        if not ignore_encoding_errors:
                                raise
551
        def to_stderr(self, message):
                """Print message to stderr (always, regardless of quiet mode)."""
                print >>sys.stderr, message.encode(preferredencoding())
555
        def to_cons_title(self, message):
                """Set console/terminal window title to message."""
                if not self.params.get('consoletitle', False):
                        return
                if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
                        # c_wchar_p() might not be necessary if `message` is
                        # already of type unicode()
                        ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
                elif 'TERM' in os.environ:
                        # xterm-style escape sequence for setting the window title.
                        sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
566
        def fixed_template(self):
                """Checks if the output template is fixed.

                A template is "fixed" when it contains no %(field)s
                placeholders, i.e. every download would get the same filename.
                """
                return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
570
        def trouble(self, message=None):
                """Determine action to take when a download problem appears.

                Depending on if the downloader has been configured to ignore
                download errors or not, this method may throw an exception or
                not when errors are found, after printing the message.
                """
                if message is not None:
                        self.to_stderr(message)
                if not self.params.get('ignoreerrors', False):
                        raise DownloadError(message)
                # Only reached with ignoreerrors set: remember the failure for
                # the eventual process exit code.
                self._download_retcode = 1
583
        def slow_down(self, start_time, byte_counter):
                """Sleep if the download speed is over the rate limit."""
                rate_limit = self.params.get('ratelimit', None)
                if rate_limit is None or byte_counter == 0:
                        return
                now = time.time()
                elapsed = now - start_time
                if elapsed <= 0.0:
                        return
                speed = float(byte_counter) / elapsed
                if speed > rate_limit:
                        # Sleep just long enough for the average speed to fall
                        # back to the configured limit.
                        time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
596
        def temp_name(self, filename):
                """Returns a temporary filename for the given filename.

                Downloads go straight to the real name when .part files are
                disabled, when writing to stdout ('-'), or when the target
                exists but is not a regular file (e.g. a device or FIFO).
                """
                if self.params.get('nopart', False) or filename == u'-' or \
                                (os.path.exists(filename) and not os.path.isfile(filename)):
                        return filename
                return filename + u'.part'
603
604         def undo_temp_name(self, filename):
605                 if filename.endswith(u'.part'):
606                         return filename[:-len(u'.part')]
607                 return filename
608
        def try_rename(self, old_filename, new_filename):
                """Rename the .part file to its final name, reporting failures
                through self.trouble() instead of raising directly."""
                try:
                        if old_filename == new_filename:
                                return
                        os.rename(old_filename, new_filename)
                except (IOError, OSError), err:
                        self.trouble(u'ERROR: unable to rename file')
616
617         def try_utime(self, filename, last_modified_hdr):
618                 """Try to set the last-modified time of the given file."""
619                 if last_modified_hdr is None:
620                         return
621                 if not os.path.isfile(filename):
622                         return
623                 timestr = last_modified_hdr
624                 if timestr is None:
625                         return
626                 filetime = timeconvert(timestr)
627                 if filetime is None:
628                         return filetime
629                 try:
630                         os.utime(filename, (time.time(), filetime))
631                 except:
632                         pass
633                 return filetime
634
        def report_writedescription(self, descfn):
                """ Report that the description file is being written """
                self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)

        def report_writeinfojson(self, infofn):
                """ Report that the metadata file has been written """
                self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)

        def report_destination(self, filename):
                """Report destination filename."""
                self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)

        def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
                """Report download progress."""
                if self.params.get('noprogress', False):
                        return
                # The leading '\r' rewrites the current line so the progress
                # bar updates in place.
                self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
                                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
                self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
                                (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))

        def report_resuming_byte(self, resume_len):
                """Report attempt to resume at given byte."""
                self.to_screen(u'[download] Resuming download at byte %s' % resume_len)

        def report_retry(self, count, retries):
                """Report retry in case of HTTP error 5xx"""
                self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))

        def report_file_already_downloaded(self, file_name):
                """Report file has already been fully downloaded."""
                try:
                        self.to_screen(u'[download] %s has already been downloaded' % file_name)
                except (UnicodeEncodeError), err:
                        # The filename cannot be encoded for the console; report
                        # without it.
                        self.to_screen(u'[download] The file has already been downloaded')

        def report_unable_to_resume(self):
                """Report it was impossible to resume download."""
                self.to_screen(u'[download] Unable to resume')

        def report_finish(self):
                """Report download finished."""
                if self.params.get('noprogress', False):
                        self.to_screen(u'[download] Download completed')
                else:
                        # The progress line is still on screen; just terminate it.
                        self.to_screen(u'')
681
682         def increment_downloads(self):
683                 """Increment the ordinal that assigns a number to each file."""
684                 self._num_downloads += 1
685
686         def prepare_filename(self, info_dict):
687                 """Generate the output filename."""
688                 try:
689                         template_dict = dict(info_dict)
690                         template_dict['epoch'] = unicode(long(time.time()))
691                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
692                         filename = self.params['outtmpl'] % template_dict
693                         return filename
694                 except (ValueError, KeyError), err:
695                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
696                         return None
697
        def process_info(self, info_dict):
                """Process a single dictionary returned by an InfoExtractor.

                Runs forced metadata printing, honours simulate mode, applies
                title match/reject filters and overwrite protection, writes the
                optional description/JSON sidecar files, downloads the video
                and finally triggers the postprocessing chain.
                """
                filename = self.prepare_filename(info_dict)

                # Forced printings (--get-title, --get-url, ...): emit the
                # requested fields on stdout in the preferred encoding.
                if self.params.get('forcetitle', False):
                        print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
                if self.params.get('forceurl', False):
                        print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
                if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
                        print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
                if self.params.get('forcedescription', False) and 'description' in info_dict:
                        print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
                if self.params.get('forcefilename', False) and filename is not None:
                        print filename.encode(preferredencoding(), 'xmlcharrefreplace')

                # Do nothing else if in simulate mode
                if self.params.get('simulate', False):
                        return

                # prepare_filename() already reported the problem.
                if filename is None:
                        return

                # Title-based filtering (--match-title / --reject-title).
                matchtitle=self.params.get('matchtitle',False)
                rejecttitle=self.params.get('rejecttitle',False)
                title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
                if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
                        self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
                        return
                if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
                        self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
                        return

                if self.params.get('nooverwrites', False) and os.path.exists(filename):
                        self.to_stderr(u'WARNING: file exists and will be skipped')
                        return

                # Create the containing directory if it does not exist yet.
                try:
                        dn = os.path.dirname(filename)
                        if dn != '' and not os.path.exists(dn):
                                os.makedirs(dn)
                except (OSError, IOError), err:
                        self.trouble(u'ERROR: unable to create directory ' + unicode(err))
                        return

                # Optional sidecar file holding the plain-text description.
                if self.params.get('writedescription', False):
                        try:
                                descfn = filename + '.description'
                                self.report_writedescription(descfn)
                                descfile = open(descfn, 'wb')
                                try:
                                        descfile.write(info_dict['description'].encode('utf-8'))
                                finally:
                                        descfile.close()
                        except (OSError, IOError):
                                self.trouble(u'ERROR: Cannot write description file ' + descfn)
                                return

                # Optional sidecar file holding the full metadata as JSON.
                if self.params.get('writeinfojson', False):
                        infofn = filename + '.info.json'
                        self.report_writeinfojson(infofn)
                        # The json module may be missing on Python < 2.6; probe for it.
                        try:
                                json.dump
                        except (NameError,AttributeError):
                                self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
                                return
                        try:
                                infof = open(infofn, 'wb')
                                try:
                                        json.dump(info_dict, infof)
                                finally:
                                        infof.close()
                        except (OSError, IOError):
                                self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
                                return

                if not self.params.get('skip_download', False):
                        try:
                                # _do_download may return extra metadata (e.g. the file
                                # modification time) which is merged into info_dict.
                                success,add_data = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
                                info_dict.update(add_data)
                        except (OSError, IOError), err:
                                raise UnavailableVideoError
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                                self.trouble(u'ERROR: unable to download video data: %s' % str(err))
                                return
                        except (ContentTooShortError, ), err:
                                self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
                                return

                        if success:
                                try:
                                        self.post_process(filename, info_dict)
                                except (PostProcessingError), err:
                                        self.trouble(u'ERROR: postprocessing: %s' % str(err))
                                        return
793
794         def download(self, url_list):
795                 """Download a given list of URLs."""
796                 if len(url_list) > 1 and self.fixed_template():
797                         raise SameFileError(self.params['outtmpl'])
798
799                 for url in url_list:
800                         suitable_found = False
801                         for ie in self._ies:
802                                 # Go to next InfoExtractor if not suitable
803                                 if not ie.suitable(url):
804                                         continue
805
806                                 # Suitable InfoExtractor found
807                                 suitable_found = True
808
809                                 # Extract information from URL and process it
810                                 ie.extract(url)
811
812                                 # Suitable InfoExtractor had been found; go to next URL
813                                 break
814
815                         if not suitable_found:
816                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
817
818                 return self._download_retcode
819
820         def post_process(self, filename, ie_info):
821                 """Run the postprocessing chain on the given file."""
822                 info = dict(ie_info)
823                 info['filepath'] = filename
824                 for pp in self._pps:
825                         info = pp.run(info)
826                         if info is None:
827                                 break
828
        def _download_with_rtmpdump(self, filename, url, player_url):
                """Download an rtmp:// URL by shelling out to rtmpdump.

                Returns True on success and False on failure.  Interrupted
                downloads are resumed in a loop for as long as rtmpdump keeps
                making progress.
                """
                self.report_destination(filename)
                tmpfilename = self.temp_name(filename)

                # Check for rtmpdump first
                try:
                        subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
                except (OSError, IOError):
                        self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
                        return False

                # Download using rtmpdump. rtmpdump returns exit code 2 when
                # the connection was interrumpted and resuming appears to be
                # possible. This is part of rtmpdump's normal usage, AFAIK.
                # -W passes the SWF player URL when one was extracted.
                basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
                retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
                while retval == 2 or retval == 1:
                        prevsize = os.path.getsize(tmpfilename)
                        self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
                        time.sleep(5.0) # This seems to be needed
                        retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
                        cursize = os.path.getsize(tmpfilename)
                        # No progress since the last attempt: stop retrying.
                        if prevsize == cursize and retval == 1:
                                break
                         # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
                        if prevsize == cursize and retval == 2 and cursize > 1024:
                                self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
                                retval = 0
                                break
                if retval == 0:
                        self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
                        self.try_rename(tmpfilename, filename)
                        return True
                else:
                        self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
                        return False
865
        def _do_download(self, filename, url, player_url):
                """Download url to filename, resuming and retrying when possible.

                Returns False on failure.  On success the HTTP path returns a
                (True, {'filetime': ...}) tuple, while the rtmpdump and
                already-downloaded paths return a bare True — callers that
                unpack the result (see process_info) rely on the tuple form.
                """
                # Check file already present
                if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
                        self.report_file_already_downloaded(filename)
                        return True

                # Attempt to download using rtmpdump
                if url.startswith('rtmp'):
                        return self._download_with_rtmpdump(filename, url, player_url)

                tmpfilename = self.temp_name(filename)
                stream = None
                open_mode = 'wb'

                # Do not include the Accept-Encoding header
                headers = {'Youtubedl-no-compression': 'True'}
                # basic_request is kept without a Range header as a fallback
                # for servers that reject the resume request.
                basic_request = urllib2.Request(url, None, headers)
                request = urllib2.Request(url, None, headers)

                # Establish possible resume length
                if os.path.isfile(tmpfilename):
                        resume_len = os.path.getsize(tmpfilename)
                else:
                        resume_len = 0

                # Request parameters in case of being able to resume
                if self.params.get('continuedl', False) and resume_len != 0:
                        self.report_resuming_byte(resume_len)
                        request.add_header('Range', 'bytes=%d-' % resume_len)
                        open_mode = 'ab'

                # Retry loop: tolerate HTTP 5xx errors up to --retries times.
                count = 0
                retries = self.params.get('retries', 0)
                while count <= retries:
                        # Establish connection
                        try:
                                data = urllib2.urlopen(request)
                                break
                        except (urllib2.HTTPError, ), err:
                                if (err.code < 500 or err.code >= 600) and err.code != 416:
                                        # Unexpected HTTP error
                                        raise
                                elif err.code == 416:
                                        # Unable to resume (requested range not satisfiable)
                                        try:
                                                # Open the connection again without the range header
                                                data = urllib2.urlopen(basic_request)
                                                content_length = data.info()['Content-Length']
                                        except (urllib2.HTTPError, ), err:
                                                if err.code < 500 or err.code >= 600:
                                                        raise
                                        else:
                                                # Examine the reported length
                                                if (content_length is not None and
                                                                (resume_len - 100 < long(content_length) < resume_len + 100)):
                                                        # The file had already been fully downloaded.
                                                        # Explanation to the above condition: in issue #175 it was revealed that
                                                        # YouTube sometimes adds or removes a few bytes from the end of the file,
                                                        # changing the file size slightly and causing problems for some users. So
                                                        # I decided to implement a suggested change and consider the file
                                                        # completely downloaded if the file size differs less than 100 bytes from
                                                        # the one in the hard drive.
                                                        self.report_file_already_downloaded(filename)
                                                        self.try_rename(tmpfilename, filename)
                                                        return True
                                                else:
                                                        # The length does not match, we start the download over
                                                        self.report_unable_to_resume()
                                                        open_mode = 'wb'
                                                        break
                        # Retry
                        count += 1
                        if count <= retries:
                                self.report_retry(count, retries)

                if count > retries:
                        self.trouble(u'ERROR: giving up after %s retries' % retries)
                        return False

                data_len = data.info().get('Content-length', None)
                if data_len is not None:
                        data_len = long(data_len) + resume_len
                data_len_str = self.format_bytes(data_len)
                byte_counter = 0 + resume_len
                block_size = 1024
                start = time.time()
                # Main read/write loop; block_size adapts to throughput.
                while True:
                        # Download and write
                        before = time.time()
                        data_block = data.read(block_size)
                        after = time.time()
                        if len(data_block) == 0:
                                break
                        byte_counter += len(data_block)

                        # Open file just in time
                        if stream is None:
                                try:
                                        (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
                                        assert stream is not None
                                        filename = self.undo_temp_name(tmpfilename)
                                        self.report_destination(filename)
                                except (OSError, IOError), err:
                                        self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
                                        return False
                        try:
                                stream.write(data_block)
                        except (IOError, OSError), err:
                                self.trouble(u'\nERROR: unable to write data: %s' % str(err))
                                return False
                        block_size = self.best_block_size(after - before, len(data_block))

                        # Progress message
                        percent_str = self.calc_percent(byte_counter, data_len)
                        eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
                        speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
                        self.report_progress(percent_str, data_len_str, speed_str, eta_str)

                        # Apply rate limit
                        self.slow_down(start, byte_counter - resume_len)

                if stream is None:
                        self.trouble(u'\nERROR: Did not get any data blocks')
                        return False
                stream.close()
                self.report_finish()
                if data_len is not None and byte_counter != data_len:
                        raise ContentTooShortError(byte_counter, long(data_len))
                self.try_rename(tmpfilename, filename)

                # Update file modification time
                filetime = None
                if self.params.get('updatetime', True):
                        filetime = self.try_utime(filename, data.info().get('last-modified', None))

                return True, {'filetime': filetime}
1002
1003
class InfoExtractor(object):
        """Base class for all information extractors.

        An information extractor takes a URL and digs out the data youtube-dl
        needs about the video (or videos) behind it: the real media URL, the
        title, a simplified title, the uploader and so on.  The result is a
        dictionary handed to the FileDownloader, which may then download the
        file, print metadata, etc.  Mandatory dictionary fields:

        id:             Video identifier.
        url:            Final video URL.
        uploader:       Nickname of the video uploader.
        title:          Literal title.
        stitle:         Simplified title.
        ext:            Video filename extension.
        format:         Video format.
        player_url:     SWF Player URL (may be None).

        Optional fields, used only by the forced-printing helpers (they let
        youtube-dl act as the backend of a video search tool such as
        youtube2mp3):

        thumbnail:      Full URL to a video thumbnail image.
        description:    One-line video description.

        Subclasses should override _real_initialize() and _real_extract(),
        define a _VALID_URL regexp, and usually be added to the list of
        extractors.
        """

        # Whether _real_initialize() has already run for this instance.
        _ready = False
        # The FileDownloader this extractor reports to (may be None).
        _downloader = None

        def __init__(self, downloader=None):
                """Constructor. Receives an optional downloader."""
                self._ready = False
                self.set_downloader(downloader)

        def suitable(self, url):
                """Receives a URL and returns True if suitable for this IE."""
                match = re.match(self._VALID_URL, url)
                return match is not None

        def initialize(self):
                """Initializes an instance (authentication, etc) exactly once."""
                if self._ready:
                        return
                self._real_initialize()
                self._ready = True

        def extract(self, url):
                """Extracts URL information and returns it in list of dicts."""
                self.initialize()
                return self._real_extract(url)

        def set_downloader(self, downloader):
                """Sets the downloader for this IE."""
                self._downloader = downloader

        def _real_initialize(self):
                """Real initialization process. Redefine in subclasses."""
                pass

        def _real_extract(self, url):
                """Real extraction process. Redefine in subclasses."""
                pass
1072
1073
class YoutubeIE(InfoExtractor):
        """Information extractor for youtube.com."""

        # Accepts bare video IDs as well as the watch/embed/youtu.be/nocookie
        # URL forms; the video ID is captured in group 2 (see _real_extract).
        _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
        # Visited during initialization to force the site language to English.
        _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
        _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
        _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
        # Machine name used to look up credentials in ~/.netrc.
        _NETRC_MACHINE = 'youtube'
        # Listed in order of quality
        _available_formats = ['38', '37', '45', '22', '43', '35', '34', '18', '6', '5', '17', '13']
        # File extension for each known format code; codes absent here are
        # presumably handled at the use site — TODO confirm.
        _video_extensions = {
                '13': '3gp',
                '17': 'mp4',
                '18': 'mp4',
                '22': 'mp4',
                '37': 'mp4',
                '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
                '43': 'webm',
                '45': 'webm',
        }
        IE_NAME = u'youtube'
1095
1096         def report_lang(self):
1097                 """Report attempt to set language."""
1098                 self._downloader.to_screen(u'[youtube] Setting language')
1099
1100         def report_login(self):
1101                 """Report attempt to log in."""
1102                 self._downloader.to_screen(u'[youtube] Logging in')
1103
1104         def report_age_confirmation(self):
1105                 """Report attempt to confirm age."""
1106                 self._downloader.to_screen(u'[youtube] Confirming age')
1107
1108         def report_video_webpage_download(self, video_id):
1109                 """Report attempt to download video webpage."""
1110                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1111
1112         def report_video_info_webpage_download(self, video_id):
1113                 """Report attempt to download video info webpage."""
1114                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1115
1116         def report_information_extraction(self, video_id):
1117                 """Report attempt to extract video information."""
1118                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1119
1120         def report_unavailable_format(self, video_id, format):
1121                 """Report extracted video URL."""
1122                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1123
1124         def report_rtmp_download(self):
1125                 """Indicate the download will use the RTMP protocol."""
1126                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1127
        def _real_initialize(self):
                """Set the site language to English and optionally log in and confirm age.

                Credentials come from --username/--password or, with
                --netrc, from the 'youtube' machine entry in ~/.netrc.
                All failures here are soft (warnings), except age
                confirmation which reports trouble.
                """
                if self._downloader is None:
                        return

                username = None
                password = None
                downloader_params = self._downloader.params

                # Attempt to use provided username and password or .netrc data
                if downloader_params.get('username', None) is not None:
                        username = downloader_params['username']
                        password = downloader_params['password']
                elif downloader_params.get('usenetrc', False):
                        try:
                                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                                if info is not None:
                                        username = info[0]
                                        password = info[2]
                                else:
                                        raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
                        except (IOError, netrc.NetrcParseError), err:
                                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                                return

                # Set language
                request = urllib2.Request(self._LANG_URL)
                try:
                        self.report_lang()
                        urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
                        return

                # No authentication to be performed
                if username is None:
                        return

                # Log in
                login_form = {
                                'current_form': 'loginForm',
                                'next':         '/',
                                'action_login': 'Log In',
                                'username':     username,
                                'password':     password,
                                }
                request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
                try:
                        self.report_login()
                        login_results = urllib2.urlopen(request).read()
                        # The login form reappearing in the response means the
                        # credentials were rejected.
                        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                                return
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
                        return

                # Confirm age
                age_form = {
                                'next_url':             '/',
                                'action_confirm':       'Confirm',
                                }
                request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
                try:
                        self.report_age_confirmation()
                        age_results = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
                        return
1196
1197         def _real_extract(self, url):
1198                 # Extract video id from URL
1199                 mobj = re.match(self._VALID_URL, url)
1200                 if mobj is None:
1201                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1202                         return
1203                 video_id = mobj.group(2)
1204
1205                 # Get video webpage
1206                 self.report_video_webpage_download(video_id)
1207                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id)
1208                 try:
1209                         video_webpage = urllib2.urlopen(request).read()
1210                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1211                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1212                         return
1213
1214                 # Attempt to extract SWF player URL
1215                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1216                 if mobj is not None:
1217                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1218                 else:
1219                         player_url = None
1220
1221                 # Get video info
1222                 self.report_video_info_webpage_download(video_id)
1223                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1224                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1225                                         % (video_id, el_type))
1226                         request = urllib2.Request(video_info_url)
1227                         try:
1228                                 video_info_webpage = urllib2.urlopen(request).read()
1229                                 video_info = parse_qs(video_info_webpage)
1230                                 if 'token' in video_info:
1231                                         break
1232                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1233                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1234                                 return
1235                 if 'token' not in video_info:
1236                         if 'reason' in video_info:
1237                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1238                         else:
1239                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1240                         return
1241
1242                 # Start extracting information
1243                 self.report_information_extraction(video_id)
1244
1245                 # uploader
1246                 if 'author' not in video_info:
1247                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1248                         return
1249                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1250
1251                 # title
1252                 if 'title' not in video_info:
1253                         self._downloader.trouble(u'ERROR: unable to extract video title')
1254                         return
1255                 video_title = urllib.unquote_plus(video_info['title'][0])
1256                 video_title = video_title.decode('utf-8')
1257                 video_title = sanitize_title(video_title)
1258
1259                 # simplified title
1260                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1261                 simple_title = simple_title.strip(ur'_')
1262
1263                 # thumbnail image
1264                 if 'thumbnail_url' not in video_info:
1265                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1266                         video_thumbnail = ''
1267                 else:   # don't panic if we can't find it
1268                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1269
1270                 # upload date
1271                 upload_date = u'NA'
1272                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1273                 if mobj is not None:
1274                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1275                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1276                         for expression in format_expressions:
1277                                 try:
1278                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1279                                 except:
1280                                         pass
1281
1282                 # description
1283                 try:
1284                         lxml.etree
1285                 except NameError:
1286                         video_description = u'No description available.'
1287                         if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1288                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1289                                 if mobj is not None:
1290                                         video_description = mobj.group(1).decode('utf-8')
1291                 else:
1292                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1293                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1294                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1295                         # TODO use another parser
1296
1297                 # token
1298                 video_token = urllib.unquote_plus(video_info['token'][0])
1299
1300                 # Decide which formats to download
1301                 req_format = self._downloader.params.get('format', None)
1302
1303                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1304                         self.report_rtmp_download()
1305                         video_url_list = [(None, video_info['conn'][0])]
1306                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1307                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1308                         url_data = [parse_qs(uds) for uds in url_data_strs]
1309                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1310                         url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1311
1312                         format_limit = self._downloader.params.get('format_limit', None)
1313                         if format_limit is not None and format_limit in self._available_formats:
1314                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1315                         else:
1316                                 format_list = self._available_formats
1317                         existing_formats = [x for x in format_list if x in url_map]
1318                         if len(existing_formats) == 0:
1319                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1320                                 return
1321                         if req_format is None:
1322                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1323                         elif req_format == 'worst':
1324                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1325                         elif req_format == '-1':
1326                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1327                         else:
1328                                 # Specific format
1329                                 if req_format not in url_map:
1330                                         self._downloader.trouble(u'ERROR: requested format not available')
1331                                         return
1332                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
1333                 else:
1334                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1335                         return
1336
1337                 for format_param, video_real_url in video_url_list:
1338                         # At this point we have a new video
1339                         self._downloader.increment_downloads()
1340
1341                         # Extension
1342                         video_extension = self._video_extensions.get(format_param, 'flv')
1343
1344                         try:
1345                                 # Process video information
1346                                 self._downloader.process_info({
1347                                         'id':           video_id.decode('utf-8'),
1348                                         'url':          video_real_url.decode('utf-8'),
1349                                         'uploader':     video_uploader.decode('utf-8'),
1350                                         'upload_date':  upload_date,
1351                                         'title':        video_title,
1352                                         'stitle':       simple_title,
1353                                         'ext':          video_extension.decode('utf-8'),
1354                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1355                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1356                                         'description':  video_description,
1357                                         'player_url':   player_url,
1358                                 })
1359                         except UnavailableVideoError, err:
1360                                 self._downloader.trouble(u'\nERROR: unable to download video')
1361
1362
1363 class MetacafeIE(InfoExtractor):
1364         """Information Extractor for metacafe.com."""
1365
1366         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1367         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1368         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1369         _youtube_ie = None
1370         IE_NAME = u'metacafe'
1371
1372         def __init__(self, youtube_ie, downloader=None):
1373                 InfoExtractor.__init__(self, downloader)
1374                 self._youtube_ie = youtube_ie
1375
1376         def report_disclaimer(self):
1377                 """Report disclaimer retrieval."""
1378                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1379
1380         def report_age_confirmation(self):
1381                 """Report attempt to confirm age."""
1382                 self._downloader.to_screen(u'[metacafe] Confirming age')
1383
1384         def report_download_webpage(self, video_id):
1385                 """Report webpage download."""
1386                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1387
1388         def report_extraction(self, video_id):
1389                 """Report information extraction."""
1390                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1391
1392         def _real_initialize(self):
1393                 # Retrieve disclaimer
1394                 request = urllib2.Request(self._DISCLAIMER)
1395                 try:
1396                         self.report_disclaimer()
1397                         disclaimer = urllib2.urlopen(request).read()
1398                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1399                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1400                         return
1401
1402                 # Confirm age
1403                 disclaimer_form = {
1404                         'filters': '0',
1405                         'submit': "Continue - I'm over 18",
1406                         }
1407                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1408                 try:
1409                         self.report_age_confirmation()
1410                         disclaimer = urllib2.urlopen(request).read()
1411                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1412                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1413                         return
1414
1415         def _real_extract(self, url):
1416                 # Extract id and simplified title from URL
1417                 mobj = re.match(self._VALID_URL, url)
1418                 if mobj is None:
1419                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1420                         return
1421
1422                 video_id = mobj.group(1)
1423
1424                 # Check if video comes from YouTube
1425                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1426                 if mobj2 is not None:
1427                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1428                         return
1429
1430                 # At this point we have a new video
1431                 self._downloader.increment_downloads()
1432
1433                 simple_title = mobj.group(2).decode('utf-8')
1434
1435                 # Retrieve video webpage to extract further information
1436                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1437                 try:
1438                         self.report_download_webpage(video_id)
1439                         webpage = urllib2.urlopen(request).read()
1440                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1441                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1442                         return
1443
1444                 # Extract URL, uploader and title from webpage
1445                 self.report_extraction(video_id)
1446                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1447                 if mobj is not None:
1448                         mediaURL = urllib.unquote(mobj.group(1))
1449                         video_extension = mediaURL[-3:]
1450
1451                         # Extract gdaKey if available
1452                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1453                         if mobj is None:
1454                                 video_url = mediaURL
1455                         else:
1456                                 gdaKey = mobj.group(1)
1457                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1458                 else:
1459                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1460                         if mobj is None:
1461                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1462                                 return
1463                         vardict = parse_qs(mobj.group(1))
1464                         if 'mediaData' not in vardict:
1465                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1466                                 return
1467                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1468                         if mobj is None:
1469                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1470                                 return
1471                         mediaURL = mobj.group(1).replace('\\/', '/')
1472                         video_extension = mediaURL[-3:]
1473                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1474
1475                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1476                 if mobj is None:
1477                         self._downloader.trouble(u'ERROR: unable to extract title')
1478                         return
1479                 video_title = mobj.group(1).decode('utf-8')
1480                 video_title = sanitize_title(video_title)
1481
1482                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1483                 if mobj is None:
1484                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1485                         return
1486                 video_uploader = mobj.group(1)
1487
1488                 try:
1489                         # Process video information
1490                         self._downloader.process_info({
1491                                 'id':           video_id.decode('utf-8'),
1492                                 'url':          video_url.decode('utf-8'),
1493                                 'uploader':     video_uploader.decode('utf-8'),
1494                                 'upload_date':  u'NA',
1495                                 'title':        video_title,
1496                                 'stitle':       simple_title,
1497                                 'ext':          video_extension.decode('utf-8'),
1498                                 'format':       u'NA',
1499                                 'player_url':   None,
1500                         })
1501                 except UnavailableVideoError:
1502                         self._downloader.trouble(u'\nERROR: unable to download video')
1503
1504
1505 class DailymotionIE(InfoExtractor):
1506         """Information Extractor for Dailymotion"""
1507
1508         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1509         IE_NAME = u'dailymotion'
1510
1511         def __init__(self, downloader=None):
1512                 InfoExtractor.__init__(self, downloader)
1513
1514         def report_download_webpage(self, video_id):
1515                 """Report webpage download."""
1516                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1517
1518         def report_extraction(self, video_id):
1519                 """Report information extraction."""
1520                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1521
1522         def _real_initialize(self):
1523                 return
1524
1525         def _real_extract(self, url):
1526                 # Extract id and simplified title from URL
1527                 mobj = re.match(self._VALID_URL, url)
1528                 if mobj is None:
1529                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1530                         return
1531
1532                 # At this point we have a new video
1533                 self._downloader.increment_downloads()
1534                 video_id = mobj.group(1)
1535
1536                 simple_title = mobj.group(2).decode('utf-8')
1537                 video_extension = 'flv'
1538
1539                 # Retrieve video webpage to extract further information
1540                 request = urllib2.Request(url)
1541                 request.add_header('Cookie', 'family_filter=off')
1542                 try:
1543                         self.report_download_webpage(video_id)
1544                         webpage = urllib2.urlopen(request).read()
1545                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1546                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1547                         return
1548
1549                 # Extract URL, uploader and title from webpage
1550                 self.report_extraction(video_id)
1551                 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1552                 if mobj is None:
1553                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1554                         return
1555                 sequence = urllib.unquote(mobj.group(1))
1556                 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1557                 if mobj is None:
1558                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1559                         return
1560                 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1561
1562                 # if needed add http://www.dailymotion.com/ if relative URL
1563
1564                 video_url = mediaURL
1565
1566                 mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
1567                 if mobj is None:
1568                         self._downloader.trouble(u'ERROR: unable to extract title')
1569                         return
1570                 video_title = mobj.group(1).decode('utf-8')
1571                 video_title = sanitize_title(video_title)
1572
1573                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1574                 if mobj is None:
1575                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1576                         return
1577                 video_uploader = mobj.group(1)
1578
1579                 try:
1580                         # Process video information
1581                         self._downloader.process_info({
1582                                 'id':           video_id.decode('utf-8'),
1583                                 'url':          video_url.decode('utf-8'),
1584                                 'uploader':     video_uploader.decode('utf-8'),
1585                                 'upload_date':  u'NA',
1586                                 'title':        video_title,
1587                                 'stitle':       simple_title,
1588                                 'ext':          video_extension.decode('utf-8'),
1589                                 'format':       u'NA',
1590                                 'player_url':   None,
1591                         })
1592                 except UnavailableVideoError:
1593                         self._downloader.trouble(u'\nERROR: unable to download video')
1594
1595
1596 class GoogleIE(InfoExtractor):
1597         """Information extractor for video.google.com."""
1598
1599         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1600         IE_NAME = u'video.google'
1601
1602         def __init__(self, downloader=None):
1603                 InfoExtractor.__init__(self, downloader)
1604
1605         def report_download_webpage(self, video_id):
1606                 """Report webpage download."""
1607                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1608
1609         def report_extraction(self, video_id):
1610                 """Report information extraction."""
1611                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1612
1613         def _real_initialize(self):
1614                 return
1615
1616         def _real_extract(self, url):
1617                 # Extract id from URL
1618                 mobj = re.match(self._VALID_URL, url)
1619                 if mobj is None:
1620                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1621                         return
1622
1623                 # At this point we have a new video
1624                 self._downloader.increment_downloads()
1625                 video_id = mobj.group(1)
1626
1627                 video_extension = 'mp4'
1628
1629                 # Retrieve video webpage to extract further information
1630                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1631                 try:
1632                         self.report_download_webpage(video_id)
1633                         webpage = urllib2.urlopen(request).read()
1634                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1635                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1636                         return
1637
1638                 # Extract URL, uploader, and title from webpage
1639                 self.report_extraction(video_id)
1640                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1641                 if mobj is None:
1642                         video_extension = 'flv'
1643                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1644                 if mobj is None:
1645                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1646                         return
1647                 mediaURL = urllib.unquote(mobj.group(1))
1648                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1649                 mediaURL = mediaURL.replace('\\x26', '\x26')
1650
1651                 video_url = mediaURL
1652
1653                 mobj = re.search(r'<title>(.*)</title>', webpage)
1654                 if mobj is None:
1655                         self._downloader.trouble(u'ERROR: unable to extract title')
1656                         return
1657                 video_title = mobj.group(1).decode('utf-8')
1658                 video_title = sanitize_title(video_title)
1659                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1660
1661                 # Extract video description
1662                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1663                 if mobj is None:
1664                         self._downloader.trouble(u'ERROR: unable to extract video description')
1665                         return
1666                 video_description = mobj.group(1).decode('utf-8')
1667                 if not video_description:
1668                         video_description = 'No description available.'
1669
1670                 # Extract video thumbnail
1671                 if self._downloader.params.get('forcethumbnail', False):
1672                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1673                         try:
1674                                 webpage = urllib2.urlopen(request).read()
1675                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1676                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1677                                 return
1678                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1679                         if mobj is None:
1680                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1681                                 return
1682                         video_thumbnail = mobj.group(1)
1683                 else:   # we need something to pass to process_info
1684                         video_thumbnail = ''
1685
1686                 try:
1687                         # Process video information
1688                         self._downloader.process_info({
1689                                 'id':           video_id.decode('utf-8'),
1690                                 'url':          video_url.decode('utf-8'),
1691                                 'uploader':     u'NA',
1692                                 'upload_date':  u'NA',
1693                                 'title':        video_title,
1694                                 'stitle':       simple_title,
1695                                 'ext':          video_extension.decode('utf-8'),
1696                                 'format':       u'NA',
1697                                 'player_url':   None,
1698                         })
1699                 except UnavailableVideoError:
1700                         self._downloader.trouble(u'\nERROR: unable to download video')
1701
1702
1703 class PhotobucketIE(InfoExtractor):
1704         """Information extractor for photobucket.com."""
1705
1706         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1707         IE_NAME = u'photobucket'
1708
1709         def __init__(self, downloader=None):
1710                 InfoExtractor.__init__(self, downloader)
1711
1712         def report_download_webpage(self, video_id):
1713                 """Report webpage download."""
1714                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1715
1716         def report_extraction(self, video_id):
1717                 """Report information extraction."""
1718                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1719
1720         def _real_initialize(self):
1721                 return
1722
1723         def _real_extract(self, url):
1724                 # Extract id from URL
1725                 mobj = re.match(self._VALID_URL, url)
1726                 if mobj is None:
1727                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1728                         return
1729
1730                 # At this point we have a new video
1731                 self._downloader.increment_downloads()
1732                 video_id = mobj.group(1)
1733
1734                 video_extension = 'flv'
1735
1736                 # Retrieve video webpage to extract further information
1737                 request = urllib2.Request(url)
1738                 try:
1739                         self.report_download_webpage(video_id)
1740                         webpage = urllib2.urlopen(request).read()
1741                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1742                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1743                         return
1744
1745                 # Extract URL, uploader, and title from webpage
1746                 self.report_extraction(video_id)
1747                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1748                 if mobj is None:
1749                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1750                         return
1751                 mediaURL = urllib.unquote(mobj.group(1))
1752
1753                 video_url = mediaURL
1754
1755                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1756                 if mobj is None:
1757                         self._downloader.trouble(u'ERROR: unable to extract title')
1758                         return
1759                 video_title = mobj.group(1).decode('utf-8')
1760                 video_title = sanitize_title(video_title)
1761                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1762
1763                 video_uploader = mobj.group(2).decode('utf-8')
1764
1765                 try:
1766                         # Process video information
1767                         self._downloader.process_info({
1768                                 'id':           video_id.decode('utf-8'),
1769                                 'url':          video_url.decode('utf-8'),
1770                                 'uploader':     video_uploader,
1771                                 'upload_date':  u'NA',
1772                                 'title':        video_title,
1773                                 'stitle':       simple_title,
1774                                 'ext':          video_extension.decode('utf-8'),
1775                                 'format':       u'NA',
1776                                 'player_url':   None,
1777                         })
1778                 except UnavailableVideoError:
1779                         self._downloader.trouble(u'\nERROR: unable to download video')
1780
1781
1782 class YahooIE(InfoExtractor):
1783         """Information extractor for video.yahoo.com."""
1784
1785         # _VALID_URL matches all Yahoo! Video URLs
1786         # _VPAGE_URL matches only the extractable '/watch/' URLs
1787         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1788         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1789         IE_NAME = u'video.yahoo'
1790
1791         def __init__(self, downloader=None):
1792                 InfoExtractor.__init__(self, downloader)
1793
1794         def report_download_webpage(self, video_id):
1795                 """Report webpage download."""
1796                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1797
1798         def report_extraction(self, video_id):
1799                 """Report information extraction."""
1800                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1801
1802         def _real_initialize(self):
1803                 return
1804
1805         def _real_extract(self, url, new_video=True):
1806                 # Extract ID from URL
1807                 mobj = re.match(self._VALID_URL, url)
1808                 if mobj is None:
1809                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1810                         return
1811
1812                 # At this point we have a new video
1813                 self._downloader.increment_downloads()
1814                 video_id = mobj.group(2)
1815                 video_extension = 'flv'
1816
1817                 # Rewrite valid but non-extractable URLs as
1818                 # extractable English language /watch/ URLs
1819                 if re.match(self._VPAGE_URL, url) is None:
1820                         request = urllib2.Request(url)
1821                         try:
1822                                 webpage = urllib2.urlopen(request).read()
1823                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1824                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1825                                 return
1826
1827                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1828                         if mobj is None:
1829                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1830                                 return
1831                         yahoo_id = mobj.group(1)
1832
1833                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1834                         if mobj is None:
1835                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1836                                 return
1837                         yahoo_vid = mobj.group(1)
1838
1839                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1840                         return self._real_extract(url, new_video=False)
1841
1842                 # Retrieve video webpage to extract further information
1843                 request = urllib2.Request(url)
1844                 try:
1845                         self.report_download_webpage(video_id)
1846                         webpage = urllib2.urlopen(request).read()
1847                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1848                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1849                         return
1850
1851                 # Extract uploader and title from webpage
1852                 self.report_extraction(video_id)
1853                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1854                 if mobj is None:
1855                         self._downloader.trouble(u'ERROR: unable to extract video title')
1856                         return
1857                 video_title = mobj.group(1).decode('utf-8')
1858                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1859
1860                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1861                 if mobj is None:
1862                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1863                         return
1864                 video_uploader = mobj.group(1).decode('utf-8')
1865
1866                 # Extract video thumbnail
1867                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1868                 if mobj is None:
1869                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1870                         return
1871                 video_thumbnail = mobj.group(1).decode('utf-8')
1872
1873                 # Extract video description
1874                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1875                 if mobj is None:
1876                         self._downloader.trouble(u'ERROR: unable to extract video description')
1877                         return
1878                 video_description = mobj.group(1).decode('utf-8')
1879                 if not video_description:
1880                         video_description = 'No description available.'
1881
1882                 # Extract video height and width
1883                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1884                 if mobj is None:
1885                         self._downloader.trouble(u'ERROR: unable to extract video height')
1886                         return
1887                 yv_video_height = mobj.group(1)
1888
1889                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1890                 if mobj is None:
1891                         self._downloader.trouble(u'ERROR: unable to extract video width')
1892                         return
1893                 yv_video_width = mobj.group(1)
1894
1895                 # Retrieve video playlist to extract media URL
1896                 # I'm not completely sure what all these options are, but we
1897                 # seem to need most of them, otherwise the server sends a 401.
1898                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1899                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1900                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1901                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1902                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1903                 try:
1904                         self.report_download_webpage(video_id)
1905                         webpage = urllib2.urlopen(request).read()
1906                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1907                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1908                         return
1909
1910                 # Extract media URL from playlist XML
1911                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1912                 if mobj is None:
1913                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1914                         return
1915                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1916                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1917
1918                 try:
1919                         # Process video information
1920                         self._downloader.process_info({
1921                                 'id':           video_id.decode('utf-8'),
1922                                 'url':          video_url,
1923                                 'uploader':     video_uploader,
1924                                 'upload_date':  u'NA',
1925                                 'title':        video_title,
1926                                 'stitle':       simple_title,
1927                                 'ext':          video_extension.decode('utf-8'),
1928                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1929                                 'description':  video_description,
1930                                 'thumbnail':    video_thumbnail,
1931                                 'player_url':   None,
1932                         })
1933                 except UnavailableVideoError:
1934                         self._downloader.trouble(u'\nERROR: unable to download video')
1935
1936
1937 class VimeoIE(InfoExtractor):
1938         """Information extractor for vimeo.com."""
1939
1940         # _VALID_URL matches Vimeo URLs
1941         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1942         IE_NAME = u'vimeo'
1943
1944         def __init__(self, downloader=None):
1945                 InfoExtractor.__init__(self, downloader)
1946
1947         def report_download_webpage(self, video_id):
1948                 """Report webpage download."""
1949                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1950
1951         def report_extraction(self, video_id):
1952                 """Report information extraction."""
1953                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1954
1955         def _real_initialize(self):
1956                 return
1957
1958         def _real_extract(self, url, new_video=True):
1959                 # Extract ID from URL
1960                 mobj = re.match(self._VALID_URL, url)
1961                 if mobj is None:
1962                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1963                         return
1964
1965                 # At this point we have a new video
1966                 self._downloader.increment_downloads()
1967                 video_id = mobj.group(1)
1968
1969                 # Retrieve video webpage to extract further information
1970                 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1971                 try:
1972                         self.report_download_webpage(video_id)
1973                         webpage = urllib2.urlopen(request).read()
1974                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1975                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1976                         return
1977
1978                 # Now we begin extracting as much information as we can from what we
1979                 # retrieved. First we extract the information common to all extractors,
1980                 # and latter we extract those that are Vimeo specific.
1981                 self.report_extraction(video_id)
1982
1983                 # Extract title
1984                 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
1985                 if mobj is None:
1986                         self._downloader.trouble(u'ERROR: unable to extract video title')
1987                         return
1988                 video_title = mobj.group(1).decode('utf-8')
1989                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1990
1991                 # Extract uploader
1992                 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
1993                 if mobj is None:
1994                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1995                         return
1996                 video_uploader = mobj.group(1).decode('utf-8')
1997
1998                 # Extract video thumbnail
1999                 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2000                 if mobj is None:
2001                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2002                         return
2003                 video_thumbnail = mobj.group(1).decode('utf-8')
2004
2005                 # # Extract video description
2006                 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2007                 # if mobj is None:
2008                 #       self._downloader.trouble(u'ERROR: unable to extract video description')
2009                 #       return
2010                 # video_description = mobj.group(1).decode('utf-8')
2011                 # if not video_description: video_description = 'No description available.'
2012                 video_description = 'Foo.'
2013
2014                 # Vimeo specific: extract request signature
2015                 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2016                 if mobj is None:
2017                         self._downloader.trouble(u'ERROR: unable to extract request signature')
2018                         return
2019                 sig = mobj.group(1).decode('utf-8')
2020
2021                 # Vimeo specific: Extract request signature expiration
2022                 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2023                 if mobj is None:
2024                         self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2025                         return
2026                 sig_exp = mobj.group(1).decode('utf-8')
2027
2028                 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2029
2030                 try:
2031                         # Process video information
2032                         self._downloader.process_info({
2033                                 'id':           video_id.decode('utf-8'),
2034                                 'url':          video_url,
2035                                 'uploader':     video_uploader,
2036                                 'upload_date':  u'NA',
2037                                 'title':        video_title,
2038                                 'stitle':       simple_title,
2039                                 'ext':          u'mp4',
2040                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2041                                 'description':  video_description,
2042                                 'thumbnail':    video_thumbnail,
2043                                 'description':  video_description,
2044                                 'player_url':   None,
2045                         })
2046                 except UnavailableVideoError:
2047                         self._downloader.trouble(u'ERROR: unable to download video')
2048
2049
2050 class GenericIE(InfoExtractor):
2051         """Generic last-resort information extractor."""
2052
2053         _VALID_URL = r'.*'
2054         IE_NAME = u'generic'
2055
2056         def __init__(self, downloader=None):
2057                 InfoExtractor.__init__(self, downloader)
2058
2059         def report_download_webpage(self, video_id):
2060                 """Report webpage download."""
2061                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2062                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2063
2064         def report_extraction(self, video_id):
2065                 """Report information extraction."""
2066                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2067
2068         def _real_initialize(self):
2069                 return
2070
2071         def _real_extract(self, url):
2072                 # At this point we have a new video
2073                 self._downloader.increment_downloads()
2074
2075                 video_id = url.split('/')[-1]
2076                 request = urllib2.Request(url)
2077                 try:
2078                         self.report_download_webpage(video_id)
2079                         webpage = urllib2.urlopen(request).read()
2080                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2081                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2082                         return
2083                 except ValueError, err:
2084                         # since this is the last-resort InfoExtractor, if
2085                         # this error is thrown, it'll be thrown here
2086                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2087                         return
2088
2089                 self.report_extraction(video_id)
2090                 # Start with something easy: JW Player in SWFObject
2091                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2092                 if mobj is None:
2093                         # Broaden the search a little bit
2094                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2095                 if mobj is None:
2096                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2097                         return
2098
2099                 # It's possible that one of the regexes
2100                 # matched, but returned an empty group:
2101                 if mobj.group(1) is None:
2102                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2103                         return
2104
2105                 video_url = urllib.unquote(mobj.group(1))
2106                 video_id = os.path.basename(video_url)
2107
2108                 # here's a fun little line of code for you:
2109                 video_extension = os.path.splitext(video_id)[1][1:]
2110                 video_id = os.path.splitext(video_id)[0]
2111
2112                 # it's tempting to parse this further, but you would
2113                 # have to take into account all the variations like
2114                 #   Video Title - Site Name
2115                 #   Site Name | Video Title
2116                 #   Video Title - Tagline | Site Name
2117                 # and so on and so forth; it's just not practical
2118                 mobj = re.search(r'<title>(.*)</title>', webpage)
2119                 if mobj is None:
2120                         self._downloader.trouble(u'ERROR: unable to extract title')
2121                         return
2122                 video_title = mobj.group(1).decode('utf-8')
2123                 video_title = sanitize_title(video_title)
2124                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2125
2126                 # video uploader is domain name
2127                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2128                 if mobj is None:
2129                         self._downloader.trouble(u'ERROR: unable to extract title')
2130                         return
2131                 video_uploader = mobj.group(1).decode('utf-8')
2132
2133                 try:
2134                         # Process video information
2135                         self._downloader.process_info({
2136                                 'id':           video_id.decode('utf-8'),
2137                                 'url':          video_url.decode('utf-8'),
2138                                 'uploader':     video_uploader,
2139                                 'upload_date':  u'NA',
2140                                 'title':        video_title,
2141                                 'stitle':       simple_title,
2142                                 'ext':          video_extension.decode('utf-8'),
2143                                 'format':       u'NA',
2144                                 'player_url':   None,
2145                         })
2146                 except UnavailableVideoError, err:
2147                         self._downloader.trouble(u'\nERROR: unable to download video')
2148
2149
2150 class YoutubeSearchIE(InfoExtractor):
2151         """Information Extractor for YouTube search queries."""
2152         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2153         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2154         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2155         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2156         _youtube_ie = None
2157         _max_youtube_results = 1000
2158         IE_NAME = u'youtube:search'
2159
2160         def __init__(self, youtube_ie, downloader=None):
2161                 InfoExtractor.__init__(self, downloader)
2162                 self._youtube_ie = youtube_ie
2163
2164         def report_download_page(self, query, pagenum):
2165                 """Report attempt to download playlist page with given number."""
2166                 query = query.decode(preferredencoding())
2167                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2168
2169         def _real_initialize(self):
2170                 self._youtube_ie.initialize()
2171
2172         def _real_extract(self, query):
2173                 mobj = re.match(self._VALID_URL, query)
2174                 if mobj is None:
2175                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2176                         return
2177
2178                 prefix, query = query.split(':')
2179                 prefix = prefix[8:]
2180                 query = query.encode('utf-8')
2181                 if prefix == '':
2182                         self._download_n_results(query, 1)
2183                         return
2184                 elif prefix == 'all':
2185                         self._download_n_results(query, self._max_youtube_results)
2186                         return
2187                 else:
2188                         try:
2189                                 n = long(prefix)
2190                                 if n <= 0:
2191                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2192                                         return
2193                                 elif n > self._max_youtube_results:
2194                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2195                                         n = self._max_youtube_results
2196                                 self._download_n_results(query, n)
2197                                 return
2198                         except ValueError: # parsing prefix as integer fails
2199                                 self._download_n_results(query, 1)
2200                                 return
2201
2202         def _download_n_results(self, query, n):
2203                 """Downloads a specified number of results for a query"""
2204
2205                 video_ids = []
2206                 already_seen = set()
2207                 pagenum = 1
2208
2209                 while True:
2210                         self.report_download_page(query, pagenum)
2211                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2212                         request = urllib2.Request(result_url)
2213                         try:
2214                                 page = urllib2.urlopen(request).read()
2215                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2216                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2217                                 return
2218
2219                         # Extract video identifiers
2220                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2221                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2222                                 if video_id not in already_seen:
2223                                         video_ids.append(video_id)
2224                                         already_seen.add(video_id)
2225                                         if len(video_ids) == n:
2226                                                 # Specified n videos reached
2227                                                 for id in video_ids:
2228                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2229                                                 return
2230
2231                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2232                                 for id in video_ids:
2233                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2234                                 return
2235
2236                         pagenum = pagenum + 1
2237
2238
2239 class GoogleSearchIE(InfoExtractor):
2240         """Information Extractor for Google Video search queries."""
2241         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2242         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2243         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2244         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2245         _google_ie = None
2246         _max_google_results = 1000
2247         IE_NAME = u'video.google:search'
2248
2249         def __init__(self, google_ie, downloader=None):
2250                 InfoExtractor.__init__(self, downloader)
2251                 self._google_ie = google_ie
2252
2253         def report_download_page(self, query, pagenum):
2254                 """Report attempt to download playlist page with given number."""
2255                 query = query.decode(preferredencoding())
2256                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2257
2258         def _real_initialize(self):
2259                 self._google_ie.initialize()
2260
2261         def _real_extract(self, query):
2262                 mobj = re.match(self._VALID_URL, query)
2263                 if mobj is None:
2264                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2265                         return
2266
2267                 prefix, query = query.split(':')
2268                 prefix = prefix[8:]
2269                 query = query.encode('utf-8')
2270                 if prefix == '':
2271                         self._download_n_results(query, 1)
2272                         return
2273                 elif prefix == 'all':
2274                         self._download_n_results(query, self._max_google_results)
2275                         return
2276                 else:
2277                         try:
2278                                 n = long(prefix)
2279                                 if n <= 0:
2280                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2281                                         return
2282                                 elif n > self._max_google_results:
2283                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2284                                         n = self._max_google_results
2285                                 self._download_n_results(query, n)
2286                                 return
2287                         except ValueError: # parsing prefix as integer fails
2288                                 self._download_n_results(query, 1)
2289                                 return
2290
2291         def _download_n_results(self, query, n):
2292                 """Downloads a specified number of results for a query"""
2293
2294                 video_ids = []
2295                 already_seen = set()
2296                 pagenum = 1
2297
2298                 while True:
2299                         self.report_download_page(query, pagenum)
2300                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2301                         request = urllib2.Request(result_url)
2302                         try:
2303                                 page = urllib2.urlopen(request).read()
2304                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2305                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2306                                 return
2307
2308                         # Extract video identifiers
2309                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2310                                 video_id = mobj.group(1)
2311                                 if video_id not in already_seen:
2312                                         video_ids.append(video_id)
2313                                         already_seen.add(video_id)
2314                                         if len(video_ids) == n:
2315                                                 # Specified n videos reached
2316                                                 for id in video_ids:
2317                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2318                                                 return
2319
2320                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2321                                 for id in video_ids:
2322                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2323                                 return
2324
2325                         pagenum = pagenum + 1
2326
2327
2328 class YahooSearchIE(InfoExtractor):
2329         """Information Extractor for Yahoo! Video search queries."""
2330         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2331         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2332         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2333         _MORE_PAGES_INDICATOR = r'\s*Next'
2334         _yahoo_ie = None
2335         _max_yahoo_results = 1000
2336         IE_NAME = u'video.yahoo:search'
2337
2338         def __init__(self, yahoo_ie, downloader=None):
2339                 InfoExtractor.__init__(self, downloader)
2340                 self._yahoo_ie = yahoo_ie
2341
2342         def report_download_page(self, query, pagenum):
2343                 """Report attempt to download playlist page with given number."""
2344                 query = query.decode(preferredencoding())
2345                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2346
2347         def _real_initialize(self):
2348                 self._yahoo_ie.initialize()
2349
2350         def _real_extract(self, query):
2351                 mobj = re.match(self._VALID_URL, query)
2352                 if mobj is None:
2353                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2354                         return
2355
2356                 prefix, query = query.split(':')
2357                 prefix = prefix[8:]
2358                 query = query.encode('utf-8')
2359                 if prefix == '':
2360                         self._download_n_results(query, 1)
2361                         return
2362                 elif prefix == 'all':
2363                         self._download_n_results(query, self._max_yahoo_results)
2364                         return
2365                 else:
2366                         try:
2367                                 n = long(prefix)
2368                                 if n <= 0:
2369                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2370                                         return
2371                                 elif n > self._max_yahoo_results:
2372                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2373                                         n = self._max_yahoo_results
2374                                 self._download_n_results(query, n)
2375                                 return
2376                         except ValueError: # parsing prefix as integer fails
2377                                 self._download_n_results(query, 1)
2378                                 return
2379
2380         def _download_n_results(self, query, n):
2381                 """Downloads a specified number of results for a query"""
2382
2383                 video_ids = []
2384                 already_seen = set()
2385                 pagenum = 1
2386
2387                 while True:
2388                         self.report_download_page(query, pagenum)
2389                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2390                         request = urllib2.Request(result_url)
2391                         try:
2392                                 page = urllib2.urlopen(request).read()
2393                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2394                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2395                                 return
2396
2397                         # Extract video identifiers
2398                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2399                                 video_id = mobj.group(1)
2400                                 if video_id not in already_seen:
2401                                         video_ids.append(video_id)
2402                                         already_seen.add(video_id)
2403                                         if len(video_ids) == n:
2404                                                 # Specified n videos reached
2405                                                 for id in video_ids:
2406                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2407                                                 return
2408
2409                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2410                                 for id in video_ids:
2411                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2412                                 return
2413
2414                         pagenum = pagenum + 1
2415
2416
2417 class YoutubePlaylistIE(InfoExtractor):
2418         """Information Extractor for YouTube playlists."""
2419
2420         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2421         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2422         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2423         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2424         _youtube_ie = None
2425         IE_NAME = u'youtube:playlist'
2426
2427         def __init__(self, youtube_ie, downloader=None):
2428                 InfoExtractor.__init__(self, downloader)
2429                 self._youtube_ie = youtube_ie
2430
2431         def report_download_page(self, playlist_id, pagenum):
2432                 """Report attempt to download playlist page with given number."""
2433                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2434
2435         def _real_initialize(self):
2436                 self._youtube_ie.initialize()
2437
2438         def _real_extract(self, url):
2439                 # Extract playlist id
2440                 mobj = re.match(self._VALID_URL, url)
2441                 if mobj is None:
2442                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2443                         return
2444
2445                 # Single video case
2446                 if mobj.group(3) is not None:
2447                         self._youtube_ie.extract(mobj.group(3))
2448                         return
2449
2450                 # Download playlist pages
2451                 # prefix is 'p' as default for playlists but there are other types that need extra care
2452                 playlist_prefix = mobj.group(1)
2453                 if playlist_prefix == 'a':
2454                         playlist_access = 'artist'
2455                 else:
2456                         playlist_prefix = 'p'
2457                         playlist_access = 'view_play_list'
2458                 playlist_id = mobj.group(2)
2459                 video_ids = []
2460                 pagenum = 1
2461
2462                 while True:
2463                         self.report_download_page(playlist_id, pagenum)
2464                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2465                         try:
2466                                 page = urllib2.urlopen(request).read()
2467                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2468                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2469                                 return
2470
2471                         # Extract video identifiers
2472                         ids_in_page = []
2473                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2474                                 if mobj.group(1) not in ids_in_page:
2475                                         ids_in_page.append(mobj.group(1))
2476                         video_ids.extend(ids_in_page)
2477
2478                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2479                                 break
2480                         pagenum = pagenum + 1
2481
2482                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2483                 playlistend = self._downloader.params.get('playlistend', -1)
2484                 video_ids = video_ids[playliststart:playlistend]
2485
2486                 for id in video_ids:
2487                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2488                 return
2489
2490
2491 class YoutubeUserIE(InfoExtractor):
2492         """Information Extractor for YouTube users."""
2493
2494         _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2495         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2496         _GDATA_PAGE_SIZE = 50
2497         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2498         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2499         _youtube_ie = None
2500         IE_NAME = u'youtube:user'
2501
2502         def __init__(self, youtube_ie, downloader=None):
2503                 InfoExtractor.__init__(self, downloader)
2504                 self._youtube_ie = youtube_ie
2505
2506         def report_download_page(self, username, start_index):
2507                 """Report attempt to download user page."""
2508                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2509                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2510
2511         def _real_initialize(self):
2512                 self._youtube_ie.initialize()
2513
2514         def _real_extract(self, url):
2515                 # Extract username
2516                 mobj = re.match(self._VALID_URL, url)
2517                 if mobj is None:
2518                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2519                         return
2520
2521                 username = mobj.group(1)
2522
2523                 # Download video ids using YouTube Data API. Result size per
2524                 # query is limited (currently to 50 videos) so we need to query
2525                 # page by page until there are no video ids - it means we got
2526                 # all of them.
2527
2528                 video_ids = []
2529                 pagenum = 0
2530
2531                 while True:
2532                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2533                         self.report_download_page(username, start_index)
2534
2535                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2536
2537                         try:
2538                                 page = urllib2.urlopen(request).read()
2539                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2540                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2541                                 return
2542
2543                         # Extract video identifiers
2544                         ids_in_page = []
2545
2546                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2547                                 if mobj.group(1) not in ids_in_page:
2548                                         ids_in_page.append(mobj.group(1))
2549
2550                         video_ids.extend(ids_in_page)
2551
2552                         # A little optimization - if current page is not
2553                         # "full", ie. does not contain PAGE_SIZE video ids then
2554                         # we can assume that this page is the last one - there
2555                         # are no more ids on further pages - no need to query
2556                         # again.
2557
2558                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2559                                 break
2560
2561                         pagenum += 1
2562
2563                 all_ids_count = len(video_ids)
2564                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2565                 playlistend = self._downloader.params.get('playlistend', -1)
2566
2567                 if playlistend == -1:
2568                         video_ids = video_ids[playliststart:]
2569                 else:
2570                         video_ids = video_ids[playliststart:playlistend]
2571
2572                 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2573                                 (username, all_ids_count, len(video_ids)))
2574
2575                 for video_id in video_ids:
2576                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2577
2578
2579 class DepositFilesIE(InfoExtractor):
2580         """Information extractor for depositfiles.com"""
2581
2582         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2583         IE_NAME = u'DepositFiles'
2584
2585         def __init__(self, downloader=None):
2586                 InfoExtractor.__init__(self, downloader)
2587
2588         def report_download_webpage(self, file_id):
2589                 """Report webpage download."""
2590                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2591
2592         def report_extraction(self, file_id):
2593                 """Report information extraction."""
2594                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2595
2596         def _real_initialize(self):
2597                 return
2598
2599         def _real_extract(self, url):
2600                 # At this point we have a new file
2601                 self._downloader.increment_downloads()
2602
2603                 file_id = url.split('/')[-1]
2604                 # Rebuild url in english locale
2605                 url = 'http://depositfiles.com/en/files/' + file_id
2606
2607                 # Retrieve file webpage with 'Free download' button pressed
2608                 free_download_indication = { 'gateway_result' : '1' }
2609                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2610                 try:
2611                         self.report_download_webpage(file_id)
2612                         webpage = urllib2.urlopen(request).read()
2613                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2614                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2615                         return
2616
2617                 # Search for the real file URL
2618                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2619                 if (mobj is None) or (mobj.group(1) is None):
2620                         # Try to figure out reason of the error.
2621                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2622                         if (mobj is not None) and (mobj.group(1) is not None):
2623                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2624                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2625                         else:
2626                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2627                         return
2628
2629                 file_url = mobj.group(1)
2630                 file_extension = os.path.splitext(file_url)[1][1:]
2631
2632                 # Search for file title
2633                 mobj = re.search(r'<b title="(.*?)">', webpage)
2634                 if mobj is None:
2635                         self._downloader.trouble(u'ERROR: unable to extract title')
2636                         return
2637                 file_title = mobj.group(1).decode('utf-8')
2638
2639                 try:
2640                         # Process file information
2641                         self._downloader.process_info({
2642                                 'id':           file_id.decode('utf-8'),
2643                                 'url':          file_url.decode('utf-8'),
2644                                 'uploader':     u'NA',
2645                                 'upload_date':  u'NA',
2646                                 'title':        file_title,
2647                                 'stitle':       file_title,
2648                                 'ext':          file_extension.decode('utf-8'),
2649                                 'format':       u'NA',
2650                                 'player_url':   None,
2651                         })
2652                 except UnavailableVideoError, err:
2653                         self._downloader.trouble(u'ERROR: unable to download file')
2654
2655
2656 class FacebookIE(InfoExtractor):
2657         """Information Extractor for Facebook"""
2658
2659         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2660         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2661         _NETRC_MACHINE = 'facebook'
2662         _available_formats = ['highqual', 'lowqual']
2663         _video_extensions = {
2664                 'highqual': 'mp4',
2665                 'lowqual': 'mp4',
2666         }
2667         IE_NAME = u'facebook'
2668
2669         def __init__(self, downloader=None):
2670                 InfoExtractor.__init__(self, downloader)
2671
2672         def _reporter(self, message):
2673                 """Add header and report message."""
2674                 self._downloader.to_screen(u'[facebook] %s' % message)
2675
2676         def report_login(self):
2677                 """Report attempt to log in."""
2678                 self._reporter(u'Logging in')
2679
2680         def report_video_webpage_download(self, video_id):
2681                 """Report attempt to download video webpage."""
2682                 self._reporter(u'%s: Downloading video webpage' % video_id)
2683
2684         def report_information_extraction(self, video_id):
2685                 """Report attempt to extract video information."""
2686                 self._reporter(u'%s: Extracting video information' % video_id)
2687
2688         def _parse_page(self, video_webpage):
2689                 """Extract video information from page"""
2690                 # General data
2691                 data = {'title': r'class="video_title datawrap">(.*?)</',
2692                         'description': r'<div class="datawrap">(.*?)</div>',
2693                         'owner': r'\("video_owner_name", "(.*?)"\)',
2694                         'upload_date': r'data-date="(.*?)"',
2695                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2696                         }
2697                 video_info = {}
2698                 for piece in data.keys():
2699                         mobj = re.search(data[piece], video_webpage)
2700                         if mobj is not None:
2701                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2702
2703                 # Video urls
2704                 video_urls = {}
2705                 for fmt in self._available_formats:
2706                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2707                         if mobj is not None:
2708                                 # URL is in a Javascript segment inside an escaped Unicode format within
2709                                 # the generally utf-8 page
2710                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2711                 video_info['video_urls'] = video_urls
2712
2713                 return video_info
2714
2715         def _real_initialize(self):
2716                 if self._downloader is None:
2717                         return
2718
2719                 useremail = None
2720                 password = None
2721                 downloader_params = self._downloader.params
2722
2723                 # Attempt to use provided username and password or .netrc data
2724                 if downloader_params.get('username', None) is not None:
2725                         useremail = downloader_params['username']
2726                         password = downloader_params['password']
2727                 elif downloader_params.get('usenetrc', False):
2728                         try:
2729                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2730                                 if info is not None:
2731                                         useremail = info[0]
2732                                         password = info[2]
2733                                 else:
2734                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2735                         except (IOError, netrc.NetrcParseError), err:
2736                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2737                                 return
2738
2739                 if useremail is None:
2740                         return
2741
2742                 # Log in
2743                 login_form = {
2744                         'email': useremail,
2745                         'pass': password,
2746                         'login': 'Log+In'
2747                         }
2748                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2749                 try:
2750                         self.report_login()
2751                         login_results = urllib2.urlopen(request).read()
2752                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2753                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2754                                 return
2755                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2756                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2757                         return
2758
2759         def _real_extract(self, url):
2760                 mobj = re.match(self._VALID_URL, url)
2761                 if mobj is None:
2762                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2763                         return
2764                 video_id = mobj.group('ID')
2765
2766                 # Get video webpage
2767                 self.report_video_webpage_download(video_id)
2768                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2769                 try:
2770                         page = urllib2.urlopen(request)
2771                         video_webpage = page.read()
2772                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2773                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2774                         return
2775
2776                 # Start extracting information
2777                 self.report_information_extraction(video_id)
2778
2779                 # Extract information
2780                 video_info = self._parse_page(video_webpage)
2781
2782                 # uploader
2783                 if 'owner' not in video_info:
2784                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2785                         return
2786                 video_uploader = video_info['owner']
2787
2788                 # title
2789                 if 'title' not in video_info:
2790                         self._downloader.trouble(u'ERROR: unable to extract video title')
2791                         return
2792                 video_title = video_info['title']
2793                 video_title = video_title.decode('utf-8')
2794                 video_title = sanitize_title(video_title)
2795
2796                 # simplified title
2797                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2798                 simple_title = simple_title.strip(ur'_')
2799
2800                 # thumbnail image
2801                 if 'thumbnail' not in video_info:
2802                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2803                         video_thumbnail = ''
2804                 else:
2805                         video_thumbnail = video_info['thumbnail']
2806
2807                 # upload date
2808                 upload_date = u'NA'
2809                 if 'upload_date' in video_info:
2810                         upload_time = video_info['upload_date']
2811                         timetuple = email.utils.parsedate_tz(upload_time)
2812                         if timetuple is not None:
2813                                 try:
2814                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2815                                 except:
2816                                         pass
2817
2818                 # description
2819                 video_description = video_info.get('description', 'No description available.')
2820
2821                 url_map = video_info['video_urls']
2822                 if len(url_map.keys()) > 0:
2823                         # Decide which formats to download
2824                         req_format = self._downloader.params.get('format', None)
2825                         format_limit = self._downloader.params.get('format_limit', None)
2826
2827                         if format_limit is not None and format_limit in self._available_formats:
2828                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2829                         else:
2830                                 format_list = self._available_formats
2831                         existing_formats = [x for x in format_list if x in url_map]
2832                         if len(existing_formats) == 0:
2833                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2834                                 return
2835                         if req_format is None:
2836                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2837                         elif req_format == 'worst':
2838                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2839                         elif req_format == '-1':
2840                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2841                         else:
2842                                 # Specific format
2843                                 if req_format not in url_map:
2844                                         self._downloader.trouble(u'ERROR: requested format not available')
2845                                         return
2846                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2847
2848                 for format_param, video_real_url in video_url_list:
2849
2850                         # At this point we have a new video
2851                         self._downloader.increment_downloads()
2852
2853                         # Extension
2854                         video_extension = self._video_extensions.get(format_param, 'mp4')
2855
2856                         try:
2857                                 # Process video information
2858                                 self._downloader.process_info({
2859                                         'id':           video_id.decode('utf-8'),
2860                                         'url':          video_real_url.decode('utf-8'),
2861                                         'uploader':     video_uploader.decode('utf-8'),
2862                                         'upload_date':  upload_date,
2863                                         'title':        video_title,
2864                                         'stitle':       simple_title,
2865                                         'ext':          video_extension.decode('utf-8'),
2866                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2867                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2868                                         'description':  video_description.decode('utf-8'),
2869                                         'player_url':   None,
2870                                 })
2871                         except UnavailableVideoError, err:
2872                                 self._downloader.trouble(u'\nERROR: unable to download video')
2873
2874 class BlipTVIE(InfoExtractor):
2875         """Information extractor for blip.tv"""
2876
2877         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2878         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2879         IE_NAME = u'blip.tv'
2880
2881         def report_extraction(self, file_id):
2882                 """Report information extraction."""
2883                 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2884
2885         def _simplify_title(self, title):
2886                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2887                 res = res.strip(ur'_')
2888                 return res
2889
2890         def _real_extract(self, url):
2891                 mobj = re.match(self._VALID_URL, url)
2892                 if mobj is None:
2893                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2894                         return
2895
2896                 if '?' in url:
2897                         cchar = '&'
2898                 else:
2899                         cchar = '?'
2900                 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2901                 request = urllib2.Request(json_url)
2902                 self.report_extraction(mobj.group(1))
2903                 try:
2904                         json_code = urllib2.urlopen(request).read()
2905                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2906                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2907                         return
2908                 try:
2909                         json_data = json.loads(json_code)
2910                         if 'Post' in json_data:
2911                                 data = json_data['Post']
2912                         else:
2913                                 data = json_data
2914
2915                         upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2916                         video_url = data['media']['url']
2917                         umobj = re.match(self._URL_EXT, video_url)
2918                         if umobj is None:
2919                                 raise ValueError('Can not determine filename extension')
2920                         ext = umobj.group(1)
2921
2922                         self._downloader.increment_downloads()
2923
2924                         info = {
2925                                 'id': data['item_id'],
2926                                 'url': video_url,
2927                                 'uploader': data['display_name'],
2928                                 'upload_date': upload_date,
2929                                 'title': data['title'],
2930                                 'stitle': self._simplify_title(data['title']),
2931                                 'ext': ext,
2932                                 'format': data['media']['mimeType'],
2933                                 'thumbnail': data['thumbnailUrl'],
2934                                 'description': data['description'],
2935                                 'player_url': data['embedUrl']
2936                         }
2937                 except (ValueError,KeyError), err:
2938                         self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2939                         return
2940
2941                 try:
2942                         self._downloader.process_info(info)
2943                 except UnavailableVideoError, err:
2944                         self._downloader.trouble(u'\nERROR: unable to download video')
2945
2946
2947 class MyVideoIE(InfoExtractor):
2948         """Information Extractor for myvideo.de."""
2949
2950         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2951         IE_NAME = u'myvideo'
2952
2953         def __init__(self, downloader=None):
2954                 InfoExtractor.__init__(self, downloader)
2955         
2956         def report_download_webpage(self, video_id):
2957                 """Report webpage download."""
2958                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2959
2960         def report_extraction(self, video_id):
2961                 """Report information extraction."""
2962                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2963
2964         def _real_initialize(self):
2965                 return
2966
2967         def _real_extract(self,url):
2968                 mobj = re.match(self._VALID_URL, url)
2969                 if mobj is None:
2970                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
2971                         return
2972
2973                 video_id = mobj.group(1)
2974                 simple_title = mobj.group(2).decode('utf-8')
2975                 # should actually not be necessary
2976                 simple_title = sanitize_title(simple_title)
2977                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title)
2978
2979                 # Get video webpage
2980                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2981                 try:
2982                         self.report_download_webpage(video_id)
2983                         webpage = urllib2.urlopen(request).read()
2984                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2985                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2986                         return
2987
2988                 self.report_extraction(video_id)
2989                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2990                                  webpage)
2991                 if mobj is None:
2992                         self._downloader.trouble(u'ERROR: unable to extract media URL')
2993                         return
2994                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2995
2996                 mobj = re.search('<title>([^<]+)</title>', webpage)
2997                 if mobj is None:
2998                         self._downloader.trouble(u'ERROR: unable to extract title')
2999                         return
3000
3001                 video_title = mobj.group(1)
3002                 video_title = sanitize_title(video_title)
3003
3004                 try:
3005                         print(video_url)
3006                         self._downloader.process_info({
3007                                 'id':           video_id,
3008                                 'url':          video_url,
3009                                 'uploader':     u'NA',
3010                                 'upload_date':  u'NA',
3011                                 'title':        video_title,
3012                                 'stitle':       simple_title,
3013                                 'ext':          u'flv',
3014                                 'format':       u'NA',
3015                                 'player_url':   None,
3016                         })
3017                 except UnavailableVideoError:
3018                         self._downloader.trouble(u'\nERROR: Unable to download video')
3019
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Matches either a ":shortname" alias (e.g. ":tds") or a full-episodes
	# URL on thedailyshow.com / colbertnation.com.
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	def report_extraction(self, episode_id):
		"""Report that information extraction for an episode has started."""
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

	def report_config_download(self, episode_id):
		"""Report that a per-video media configuration is being downloaded."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		"""Report that the show's episode index feed is being downloaded."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		"""Report that the Flash player URL is being resolved."""
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _simplify_title(self, title):
		"""Collapse every run of characters outside simple_title_chars into '_'."""
		res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
		res = res.strip(ur'_')
		return res

	def _real_extract(self, url):
		"""Resolve the episode page, then download every media item found in it.

		Flow: (1) map a shortname alias to the show's full-episodes URL,
		(2) follow the redirect to the newest episode when no specific one
		was given, (3) locate the Flash player params, (4) fetch the MRSS
		index listing the episode's media parts, and (5) fetch each part's
		configuration to pick a rendition and hand it to the downloader.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# A shortname alias is rewritten into the show's full-episodes URL.
		if mobj.group('shortname'):
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = 'http://www.thedailyshow.com/full-episodes/'
			else:
				url = 'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# An empty episode part means "download the newest episode": the
		# site redirects the bare full-episodes URL to it.
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			return
		if dlNewest:
			# Re-parse the redirected URL to learn which episode we landed on.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		# The <param name="movie"> value holds both the player URL and the
		# mtvnservices media URI (second capture group).
		mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
			return

		# Follow the raw player URL once to get its final (redirected) form.
		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
			return

		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
			return

		# Each <item> in the MRSS feed is one media part of the episode.
		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
						urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				# NOTE(review): this aborts the remaining parts instead of
				# 'continue'-ing to the next item — confirm intended.
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
				return

			# Collect all (bitrate, url) renditions offered for this part.
			cdoc = xml.etree.ElementTree.fromstring(configXml)
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			# For now, just pick the highest bitrate
			# (assumes the feed lists renditions in ascending bitrate order).
			format,video_url = turls[-1]

			self._downloader.increment_downloads()

			effTitle = showId + '-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'stitle': self._simplify_title(effTitle),
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': playerUrl
			}

			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
				continue
3160
3161 class EscapistIE(InfoExtractor):
3162         """Information extractor for The Escapist """
3163
3164         _VALID_URL = r'^(https?://)?(www\.)escapistmagazine.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?].*$'
3165         IE_NAME = u'escapist'
3166
3167         def report_extraction(self, showName):
3168                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3169
3170         def report_config_download(self, showName):
3171                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3172
3173         def _simplify_title(self, title):
3174                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3175                 res = res.strip(ur'_')
3176                 return res
3177
3178         def _real_extract(self, url):
3179                 htmlParser = HTMLParser.HTMLParser()
3180
3181                 mobj = re.match(self._VALID_URL, url)
3182                 if mobj is None:
3183                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3184                         return
3185                 showName = mobj.group('showname')
3186                 videoId = mobj.group('episode')
3187
3188                 self.report_extraction(showName)
3189                 try:
3190                         webPage = urllib2.urlopen(url).read()
3191                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3192                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3193                         return
3194
3195                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3196                 description = htmlParser.unescape(descMatch.group(1))
3197                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3198                 imgUrl = htmlParser.unescape(imgMatch.group(1))
3199                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3200                 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3201                 configUrlMatch = re.search('config=(.*)$', playerUrl)
3202                 configUrl = urllib2.unquote(configUrlMatch.group(1))
3203
3204                 self.report_config_download(showName)
3205                 try:
3206                         configJSON = urllib2.urlopen(configUrl).read()
3207                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3208                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3209                         return
3210
3211                 # Technically, it's JavaScript, not JSON
3212                 configJSON = configJSON.replace("'", '"')
3213
3214                 try:
3215                         config = json.loads(configJSON)
3216                 except (ValueError,), err:
3217                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3218                         return
3219
3220                 playlist = config['playlist']
3221                 videoUrl = playlist[1]['url']
3222
3223                 self._downloader.increment_downloads()
3224                 info = {
3225                         'id': videoId,
3226                         'url': videoUrl,
3227                         'uploader': showName,
3228                         'upload_date': None,
3229                         'title': showName,
3230                         'stitle': self._simplify_title(showName),
3231                         'ext': 'flv',
3232                         'format': 'flv',
3233                         'thumbnail': imgUrl,
3234                         'description': description,
3235                         'player_url': playerUrl,
3236                 }
3237
3238                 try:
3239                         self._downloader.process_info(info)
3240                 except UnavailableVideoError, err:
3241                         self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3242
3243
3244
class PostProcessor(object):
	"""Base class for postprocessing steps.

	Instances are registered on a downloader via its add_post_processor()
	method. After each successful download the downloader walks its chain
	of PostProcessors, calling run() on each: the first call receives the
	initial info dictionary and every later call receives whatever the
	previous run() returned. A return value of None stops the chain.

	Like InfoExtractor, a PostProcessor and its downloader reference each
	other ("mutual registration").
	"""

	# Downloader this postprocessor is attached to (set via the
	# constructor or set_downloader()).
	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach *downloader* as the owner of this postprocessor."""
		self._downloader = downloader

	def run(self, information):
		"""Execute this postprocessing step.

		*information* is an InfoExtractor-style dictionary with one extra
		key, "filepath", naming the downloaded file. Return None to stop
		the chain, or an (optionally modified) info dictionary to pass to
		the next postprocessor. May raise PostProcessingError, which the
		calling downloader handles.
		"""
		return information # the base class is a no-op
3290
3291
class FFmpegExtractAudioPP(PostProcessor):
	"""Postprocessor that converts a downloaded video to an audio-only file.

	Requires the external 'ffmpeg' and 'ffprobe' programs to be on PATH.
	"""

	def __init__(self, downloader=None, preferredcodec=None):
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		# 'best' keeps/derives the source codec; otherwise 'mp3' or 'aac'.
		self._preferredcodec = preferredcodec

	@staticmethod
	def get_audio_codec(path):
		"""Return the audio codec name of the file at *path* via ffprobe.

		Returns None if ffprobe is unavailable, fails, or reports no
		audio stream.
		"""
		try:
			cmd = ['ffprobe', '-show_streams', '--', path]
			# Close the devnull handle deterministically (was leaked).
			devnull = open(os.path.devnull, 'w')
			try:
				handle = subprocess.Popen(cmd, stderr=devnull, stdout=subprocess.PIPE)
				output = handle.communicate()[0]
			finally:
				devnull.close()
			if handle.wait() != 0:
				return None
		except (IOError, OSError):
			return None
		# ffprobe prints key=value lines per stream; codec_name precedes
		# codec_type within a stream block.
		audio_codec = None
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Transcode *path* to *out_path* with audio codec *codec*.

		Returns True on success, False if ffmpeg is missing or exits
		non-zero.
		"""
		try:
			cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
			# Close the devnull handle deterministically (was leaked).
			devnull = open(os.path.devnull, 'w')
			try:
				ret = subprocess.call(cmd, stdout=devnull, stderr=subprocess.STDOUT)
			finally:
				devnull.close()
			return (ret == 0)
		except (IOError, OSError):
			return False

	def run(self, information):
		"""Extract audio from information['filepath'] and delete the video.

		Returns the updated info dict, or None (stopping the chain) on
		any failure.
		"""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
			if filecodec == 'aac' or filecodec == 'mp3':
				# Lossless if possible: copy the stream instead of re-encoding.
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					# Raw AAC needs an ADTS container to be playable standalone.
					more_opts = ['-f', 'adts']
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = ['-ab', '128k']
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = ['-ab', '128k']
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']

		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		status = self.run_ffmpeg(path, new_path, acodec, more_opts)

		if not status:
			self._downloader.to_stderr(u'WARNING: error running ffmpeg')
			return None

		# Try to update the date time for extracted audio file.
		if information.get('filetime') is not None:
			try:
				os.utime(new_path, (time.time(), information['filetime']))
			except Exception:
				# Best effort only. Narrowed from a bare 'except:' that also
				# swallowed KeyboardInterrupt/SystemExit.
				self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

		try:
			os.remove(path)
		except (IOError, OSError):
			self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
			return None

		information['filepath'] = new_path
		return information
3380
3381
3382 def updateSelf(downloader, filename):
3383         ''' Update the program file with the latest version from the repository '''
3384         # Note: downloader only used for options
3385         if not os.access(filename, os.W_OK):
3386                 sys.exit('ERROR: no write permissions on %s' % filename)
3387
3388         downloader.to_screen('Updating to latest version...')
3389
3390         try:
3391                 try:
3392                         urlh = urllib.urlopen(UPDATE_URL)
3393                         newcontent = urlh.read()
3394                 finally:
3395                         urlh.close()
3396         except (IOError, OSError), err:
3397                 sys.exit('ERROR: unable to download latest version')
3398
3399         try:
3400                 outf = open(filename, 'wb')
3401                 try:
3402                         outf.write(newcontent)
3403                 finally:
3404                         outf.close()
3405         except (IOError, OSError), err:
3406                 sys.exit('ERROR: unable to overwrite current version')
3407
3408         downloader.to_screen('Updated youtube-dl. Restart to use the new version.')
3409
def parseOpts():
	"""Build the option parser, parse sys.argv and return (parser, opts, args)."""
	# Deferred imports (removed the unused 'import getpass').
	import optparse

	def _format_option_string(option):
		''' ('-o', '--option') -> -o, --option METAVAR'''

		opts = []

		if option._short_opts: opts.append(option._short_opts[0])
		if option._long_opts: opts.append(option._long_opts[0])
		if len(opts) > 1: opts.insert(1, ', ')

		if option.takes_value(): opts.append(' %s' % option.metavar)

		return "".join(opts)

	def _find_term_columns():
		"""Best-effort terminal width: $COLUMNS, then 'stty size', else None."""
		columns = os.environ.get('COLUMNS', None)
		if columns:
			return int(columns)

		try:
			sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			out,err = sp.communicate()
			return int(out.split()[1])
		except:
			pass
		return None

	max_width = 80
	max_help_position = 80

	# No need to wrap help messages if we're on a wide console
	columns = _find_term_columns()
	if columns: max_width = columns

	fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
	fmt.format_option_strings = _format_option_string

	kw = {
		'version'   : __version__,
		'formatter' : fmt,
		'usage' : '%prog [options] url [url...]',
		'conflict_handler' : 'resolve',
	}

	parser = optparse.OptionParser(**kw)

	# option groups
	general        = optparse.OptionGroup(parser, 'General Options')
	selection      = optparse.OptionGroup(parser, 'Video Selection')
	authentication = optparse.OptionGroup(parser, 'Authentication Options')
	video_format   = optparse.OptionGroup(parser, 'Video Format Options')
	postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
	filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
	verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

	general.add_option('-h', '--help',
			action='help', help='print this help text and exit')
	general.add_option('-v', '--version',
			action='version', help='print program version and exit')
	general.add_option('-U', '--update',
			action='store_true', dest='update_self', help='update this program to latest version')
	general.add_option('-i', '--ignore-errors',
			action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
	general.add_option('-r', '--rate-limit',
			dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
	general.add_option('-R', '--retries',
			dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
	general.add_option('--dump-user-agent',
			action='store_true', dest='dump_user_agent',
			help='display the current browser identification', default=False)
	general.add_option('--list-extractors',
			action='store_true', dest='list_extractors',
			help='List all supported extractors and the URLs they would handle', default=False)

	selection.add_option('--playlist-start',
			dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
	selection.add_option('--playlist-end',
			dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
	selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
	selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')

	authentication.add_option('-u', '--username',
			dest='username', metavar='USERNAME', help='account username')
	authentication.add_option('-p', '--password',
			dest='password', metavar='PASSWORD', help='account password')
	authentication.add_option('-n', '--netrc',
			action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)


	video_format.add_option('-f', '--format',
			action='store', dest='format', metavar='FORMAT', help='video format code')
	video_format.add_option('--all-formats',
			action='store_const', dest='format', help='download all available video formats', const='-1')
	video_format.add_option('--max-quality',
			action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')


	verbosity.add_option('-q', '--quiet',
			action='store_true', dest='quiet', help='activates quiet mode', default=False)
	verbosity.add_option('-s', '--simulate',
			action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
	verbosity.add_option('--skip-download',
			action='store_true', dest='skip_download', help='do not download the video', default=False)
	verbosity.add_option('-g', '--get-url',
			action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
	verbosity.add_option('-e', '--get-title',
			action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
	verbosity.add_option('--get-thumbnail',
			action='store_true', dest='getthumbnail',
			help='simulate, quiet but print thumbnail URL', default=False)
	verbosity.add_option('--get-description',
			action='store_true', dest='getdescription',
			help='simulate, quiet but print video description', default=False)
	verbosity.add_option('--get-filename',
			action='store_true', dest='getfilename',
			help='simulate, quiet but print output filename', default=False)
	verbosity.add_option('--no-progress',
			action='store_true', dest='noprogress', help='do not print progress bar', default=False)
	verbosity.add_option('--console-title',
			action='store_true', dest='consoletitle',
			help='display progress in console titlebar', default=False)


	filesystem.add_option('-t', '--title',
			action='store_true', dest='usetitle', help='use title in file name', default=False)
	filesystem.add_option('-l', '--literal',
			action='store_true', dest='useliteral', help='use literal title in file name', default=False)
	filesystem.add_option('-A', '--auto-number',
			action='store_true', dest='autonumber',
			help='number downloaded files starting from 00000', default=False)
	filesystem.add_option('-o', '--output',
			dest='outtmpl', metavar='TEMPLATE', help='output filename template')
	filesystem.add_option('-a', '--batch-file',
			dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
	filesystem.add_option('-w', '--no-overwrites',
			action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
	filesystem.add_option('-c', '--continue',
			action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
	filesystem.add_option('--cookies',
			dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
	filesystem.add_option('--no-part',
			action='store_true', dest='nopart', help='do not use .part files', default=False)
	filesystem.add_option('--no-mtime',
			action='store_false', dest='updatetime',
			help='do not use the Last-modified header to set the file modification time', default=True)
	filesystem.add_option('--write-description',
			action='store_true', dest='writedescription',
			help='write video description to a .description file', default=False)
	filesystem.add_option('--write-info-json',
			action='store_true', dest='writeinfojson',
			help='write video metadata to a .info.json file', default=False)


	postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
			help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
	postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
			help='"best", "aac" or "mp3"; best by default')


	parser.add_option_group(general)
	parser.add_option_group(selection)
	parser.add_option_group(filesystem)
	parser.add_option_group(verbosity)
	parser.add_option_group(video_format)
	parser.add_option_group(authentication)
	parser.add_option_group(postproc)

	opts, args = parser.parse_args()

	return parser, opts, args
3584
def gen_extractors():
	"""Build and return one instance of every supported InfoExtractor.

	Ordering is significant: a URL is dispatched to the first extractor
	that claims it, so the catch-all GenericIE must stay last.
	"""
	# A few extractors are shared: the site-specific ones below delegate
	# part of their work to them, so construct them once up front.
	yt = YoutubeIE()
	goog = GoogleIE()
	yah = YahooIE()

	extractors = [
		yt,
		MetacafeIE(yt),
		DailymotionIE(),
		YoutubePlaylistIE(yt),
		YoutubeUserIE(yt),
		YoutubeSearchIE(yt),
		goog,
		GoogleSearchIE(goog),
		PhotobucketIE(),
		yah,
		YahooSearchIE(yah),
		DepositFilesIE(),
		FacebookIE(),
		BlipTVIE(),
		VimeoIE(),
		MyVideoIE(),
		ComedyCentralIE(),
		EscapistIE(),
		# Fallback for anything no dedicated extractor recognized.
		GenericIE(),
	]
	return extractors
3614
3615 def main():
3616         parser, opts, args = parseOpts()
3617
3618         # Open appropriate CookieJar
3619         if opts.cookiefile is None:
3620                 jar = cookielib.CookieJar()
3621         else:
3622                 try:
3623                         jar = cookielib.MozillaCookieJar(opts.cookiefile)
3624                         if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3625                                 jar.load()
3626                 except (IOError, OSError), err:
3627                         sys.exit(u'ERROR: unable to open cookie file')
3628
3629         # Dump user agent
3630         if opts.dump_user_agent:
3631                 print std_headers['User-Agent']
3632                 sys.exit(0)
3633
3634         # Batch file verification
3635         batchurls = []
3636         if opts.batchfile is not None:
3637                 try:
3638                         if opts.batchfile == '-':
3639                                 batchfd = sys.stdin
3640                         else:
3641                                 batchfd = open(opts.batchfile, 'r')
3642                         batchurls = batchfd.readlines()
3643                         batchurls = [x.strip() for x in batchurls]
3644                         batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3645                 except IOError:
3646                         sys.exit(u'ERROR: batch file could not be read')
3647         all_urls = batchurls + args
3648
3649         # General configuration
3650         cookie_processor = urllib2.HTTPCookieProcessor(jar)
3651         opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
3652         urllib2.install_opener(opener)
3653         socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3654
3655         extractors = gen_extractors()
3656
3657         if opts.list_extractors:
3658                 for ie in extractors:
3659                         print(ie.IE_NAME)
3660                         matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
3661                         all_urls = filter(lambda url: url not in matchedUrls, all_urls)
3662                         for mu in matchedUrls:
3663                                 print(u'  ' + mu)
3664                 sys.exit(0)
3665
3666         # Conflicting, missing and erroneous options
3667         if opts.usenetrc and (opts.username is not None or opts.password is not None):
3668                 parser.error(u'using .netrc conflicts with giving username/password')
3669         if opts.password is not None and opts.username is None:
3670                 parser.error(u'account username missing')
3671         if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3672                 parser.error(u'using output template conflicts with using title, literal title or auto number')
3673         if opts.usetitle and opts.useliteral:
3674                 parser.error(u'using title conflicts with using literal title')
3675         if opts.username is not None and opts.password is None:
3676                 opts.password = getpass.getpass(u'Type account password and press return:')
3677         if opts.ratelimit is not None:
3678                 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3679                 if numeric_limit is None:
3680                         parser.error(u'invalid rate limit specified')
3681                 opts.ratelimit = numeric_limit
3682         if opts.retries is not None:
3683                 try:
3684                         opts.retries = long(opts.retries)
3685                 except (TypeError, ValueError), err:
3686                         parser.error(u'invalid retry count specified')
3687         try:
3688                 opts.playliststart = int(opts.playliststart)
3689                 if opts.playliststart <= 0:
3690                         raise ValueError(u'Playlist start must be positive')
3691         except (TypeError, ValueError), err:
3692                 parser.error(u'invalid playlist start number specified')
3693         try:
3694                 opts.playlistend = int(opts.playlistend)
3695                 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3696                         raise ValueError(u'Playlist end must be greater than playlist start')
3697         except (TypeError, ValueError), err:
3698                 parser.error(u'invalid playlist end number specified')
3699         if opts.extractaudio:
3700                 if opts.audioformat not in ['best', 'aac', 'mp3']:
3701                         parser.error(u'invalid audio format specified')
3702
3703         # File downloader
3704         fd = FileDownloader({
3705                 'usenetrc': opts.usenetrc,
3706                 'username': opts.username,
3707                 'password': opts.password,
3708                 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3709                 'forceurl': opts.geturl,
3710                 'forcetitle': opts.gettitle,
3711                 'forcethumbnail': opts.getthumbnail,
3712                 'forcedescription': opts.getdescription,
3713                 'forcefilename': opts.getfilename,
3714                 'simulate': opts.simulate,
3715                 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3716                 'format': opts.format,
3717                 'format_limit': opts.format_limit,
3718                 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3719                         or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3720                         or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3721                         or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3722                         or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3723                         or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3724                         or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3725                         or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3726                         or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3727                         or u'%(id)s.%(ext)s'),
3728                 'ignoreerrors': opts.ignoreerrors,
3729                 'ratelimit': opts.ratelimit,
3730                 'nooverwrites': opts.nooverwrites,
3731                 'retries': opts.retries,
3732                 'continuedl': opts.continue_dl,
3733                 'noprogress': opts.noprogress,
3734                 'playliststart': opts.playliststart,
3735                 'playlistend': opts.playlistend,
3736                 'logtostderr': opts.outtmpl == '-',
3737                 'consoletitle': opts.consoletitle,
3738                 'nopart': opts.nopart,
3739                 'updatetime': opts.updatetime,
3740                 'writedescription': opts.writedescription,
3741                 'writeinfojson': opts.writeinfojson,
3742                 'matchtitle': opts.matchtitle,
3743                 'rejecttitle': opts.rejecttitle,
3744                 })
3745         for extractor in extractors:
3746                 fd.add_info_extractor(extractor)
3747
3748         # PostProcessors
3749         if opts.extractaudio:
3750                 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
3751
3752         # Update version
3753         if opts.update_self:
3754                 updateSelf(fd, sys.argv[0])
3755
3756         # Maybe do nothing
3757         if len(all_urls) < 1:
3758                 if not opts.update_self:
3759                         parser.error(u'you must provide at least one URL')
3760                 else:
3761                         sys.exit()
3762         retcode = fd.download(all_urls)
3763
3764         # Dump cookie jar if requested
3765         if opts.cookiefile is not None:
3766                 try:
3767                         jar.save()
3768                 except (IOError, OSError), err:
3769                         sys.exit(u'ERROR: unable to save cookie jar')
3770
3771         sys.exit(retcode)
3772
3773
if __name__ == '__main__':
	# Top-level error boundary: translate the script's own exception
	# types (presumably defined earlier in this file) into exit codes
	# and short messages instead of tracebacks.
	try:
		main()
	except DownloadError:
		# Details were already reported by the downloader; just signal failure.
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')
3783
3784 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: