If --continue is not enabled, set resume_len to zero.
[youtube-dl.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
# Contributors to youtube-dl, in rough order of first contribution.
__author__  = (
        'Ricardo Garcia Gonzalez',
        'Danny Colligan',
        'Benjamin Johnson',
        'Vasyl\' Vavrychuk',
        'Witold Baryluk',
        'Paweł Paprota',
        'Gergely Imreh',
        'Rogério Brito',
        'Philipp Hagemeister',
        'Sören Schulze',
        )
16
__license__ = 'Public Domain'
# Release date in YYYY.MM.DD form; used by the version/update machinery.
__version__ = '2011.09.16'

# Where the self-update code downloads the latest version of this script from.
UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
21
22 import cookielib
23 import datetime
24 import gzip
25 import htmlentitydefs
26 import HTMLParser
27 import httplib
28 import locale
29 import math
30 import netrc
31 import os
32 import os.path
33 import re
34 import socket
35 import string
36 import subprocess
37 import sys
38 import time
39 import urllib
40 import urllib2
41 import warnings
42 import zlib
43
44 if os.name == 'nt':
45         import ctypes
46
47 try:
48         import email.utils
49 except ImportError: # Python 2.4
50         import email.Utils
51 try:
52         import cStringIO as StringIO
53 except ImportError:
54         import StringIO
55
56 # parse_qs was moved from the cgi module to the urlparse module recently.
57 try:
58         from urlparse import parse_qs
59 except ImportError:
60         from cgi import parse_qs
61
62 try:
63         import lxml.etree
64 except ImportError:
65         pass # Handled below
66
67 try:
68         import xml.etree.ElementTree
69 except ImportError: # Python<2.5: Not officially supported, but let it slip
70         warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
71
# Default HTTP headers added to every request (see YoutubeDLHandler below);
# mimics a desktop Firefox browser so sites serve ordinary content.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-us,en;q=0.5',
}
79
# Characters allowed to remain in "simple" (filesystem-safe) titles.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
81
try:
        import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
        import re
        class json(object):
                @staticmethod
                def loads(s):
                        """Parse a JSON document from the byte string *s*.

                        Minimal recursive-descent parser used only when the stdlib
                        json module is unavailable (Python < 2.6). Each parseX
                        helper takes an index into *s* and returns a tuple
                        (next_index, parsed_value). Raises ValueError on malformed
                        input, mirroring json.loads.
                        """
                        s = s.decode('UTF-8')
                        def raiseError(msg, i):
                                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
                        def skipSpace(i, expectMore=True):
                                # Advance past whitespace; with expectMore, EOF here is an error.
                                while i < len(s) and s[i] in ' \t\r\n':
                                        i += 1
                                if expectMore:
                                        if i >= len(s):
                                                raiseError('Premature end', i)
                                return i
                        def decodeEscape(match):
                                # Translate one backslash escape (incl. \uXXXX and
                                # UTF-16 surrogate pairs) into its character.
                                esc = match.group(1)
                                _STATIC = {
                                        '"': '"',
                                        '\\': '\\',
                                        '/': '/',
                                        'b': unichr(0x8),
                                        'f': unichr(0xc),
                                        'n': '\n',
                                        'r': '\r',
                                        't': '\t',
                                }
                                if esc in _STATIC:
                                        return _STATIC[esc]
                                if esc[0] == 'u':
                                        if len(esc) == 1+4:
                                                return unichr(int(esc[1:5], 16))
                                        if len(esc) == 5+6 and esc[5:7] == '\\u':
                                                # Surrogate pair: combine high and low halves.
                                                hi = int(esc[1:5], 16)
                                                low = int(esc[7:11], 16)
                                                return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                                raise ValueError('Unknown escape ' + str(esc))
                        def parseString(i):
                                i += 1
                                e = i
                                # Find the closing quote, skipping quotes preceded by
                                # an odd number of backslashes (i.e. escaped quotes).
                                while True:
                                        e = s.index('"', e)
                                        bslashes = 0
                                        while s[e-bslashes-1] == '\\':
                                                bslashes += 1
                                        if bslashes % 2 == 1:
                                                e += 1
                                                continue
                                        break
                                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                                stri = rexp.sub(decodeEscape, s[i:e])
                                return (e+1,stri)
                        def parseObj(i):
                                i += 1
                                res = {}
                                i = skipSpace(i)
                                if s[i] == '}': # Empty dictionary
                                        return (i+1,res)
                                while True:
                                        if s[i] != '"':
                                                raiseError('Expected a string object key', i)
                                        i,key = parseString(i)
                                        i = skipSpace(i)
                                        if i >= len(s) or s[i] != ':':
                                                raiseError('Expected a colon', i)
                                        i,val = parse(i+1)
                                        res[key] = val
                                        i = skipSpace(i)
                                        if s[i] == '}':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected comma or closing curly brace', i)
                                        i = skipSpace(i+1)
                        def parseArray(i):
                                res = []
                                i = skipSpace(i+1)
                                if s[i] == ']': # Empty array
                                        return (i+1,res)
                                while True:
                                        i,val = parse(i)
                                        res.append(val)
                                        i = skipSpace(i) # Raise exception if premature end
                                        if s[i] == ']':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected a comma or closing bracket', i)
                                        i = skipSpace(i+1)
                        def parseDiscrete(i):
                                # The three JSON keyword literals.
                                for k,v in {'true': True, 'false': False, 'null': None}.items():
                                        if s.startswith(k, i):
                                                return (i+len(k), v)
                                raiseError('Not a boolean (or null)', i)
                        def parseNumber(i):
                                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                                if mobj is None:
                                        raiseError('Not a number', i)
                                nums = mobj.group(1)
                                # Fractional part or exponent means float; otherwise int.
                                if '.' in nums or 'e' in nums or 'E' in nums:
                                        return (i+len(nums), float(nums))
                                return (i+len(nums), int(nums))
                        # Dispatch on the first significant character; anything else
                        # is assumed to start a number.
                        CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
                        def parse(i):
                                i = skipSpace(i)
                                i,res = CHARMAP.get(s[i], parseNumber)(i)
                                i = skipSpace(i, False)
                                return (i,res)
                        i,res = parse(0)
                        if i < len(s):
                                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
                        return res
194
def preferredencoding():
        """Get preferred encoding.

        Returns the best encoding scheme for the system, based on
        locale.getpreferredencoding() and some further tweaks.
        """
        try:
                pref = locale.getpreferredencoding()
                # Sanity-check that the reported encoding actually works; some
                # systems report bogus or unsupported locale encodings.
                u'TEST'.encode(pref)
        except Exception:
                pref = 'UTF-8'
        return pref
210
211
def htmlentity_transform(matchobj):
        """Transforms an HTML entity to a Unicode character.

        This function receives a match object and is intended to be used with
        the re.sub() function.
        """
        entity = matchobj.group(1)

        # Known non-numeric HTML entity
        if entity in htmlentitydefs.name2codepoint:
                return unichr(htmlentitydefs.name2codepoint[entity])

        # Numeric character reference: decimal (&#65;) or hexadecimal (&#x41;).
        # The hex branch must accept the digits a-f: the previous pattern
        # r'#(x?\d+)' matched only '#x1' out of '#x1F' and decoded the wrong
        # character.
        mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
        if mobj is not None:
                numstr = mobj.group(1)
                if numstr.startswith(u'x'):
                        base = 16
                        numstr = u'0%s' % numstr
                else:
                        base = 10
                return unichr(long(numstr, base))

        # Unknown entity in name, return its literal representation
        return (u'&%s;' % entity)
237
238
def sanitize_title(utitle):
        """Sanitizes a video title so it could be used as part of a filename."""
        # Decode HTML entities, then neutralize the path separator so the
        # title cannot point the output file into another directory.
        utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
        return utitle.replace(unicode(os.sep), u'%')
243
244
def sanitize_open(filename, open_mode):
        """Try to open the given filename, and slightly tweak it if this fails.

        Attempts to open the given filename. If this fails, it tries to change
        the filename slightly, step by step, until it's either able to open it
        or it fails and raises a final exception, like the standard open()
        function.

        It returns the tuple (stream, definitive_file_name).
        """
        try:
                if filename == u'-':
                        if sys.platform == 'win32':
                                import msvcrt
                                # Put stdout in binary mode so video data is not
                                # corrupted by CRLF translation on Windows.
                                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
                        return (sys.stdout, filename)
                stream = open(filename, open_mode)
                return (stream, filename)
        except (IOError, OSError), err:
                # In case of error, try to remove win32 forbidden chars
                filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

                # An exception here should be caught in the caller
                stream = open(filename, open_mode)
                return (stream, filename)
270
271
def timeconvert(timestr):
        """Convert RFC 2822 defined time string into system timestamp"""
        # parsedate_tz returns None for unparseable input; propagate that.
        timetuple = email.utils.parsedate_tz(timestr)
        if timetuple is None:
                return None
        return email.utils.mktime_tz(timetuple)
279
280
class DownloadError(Exception):
        """Download Error exception.

        Raised by FileDownloader objects when they are not configured to
        ignore errors; carries the appropriate error message.
        """
289
290
class SameFileError(Exception):
        """Same File exception.

        Raised by FileDownloader objects when they detect that multiple files
        would have to be downloaded to the same file on disk.
        """
298
299
class PostProcessingError(Exception):
        """Post Processing exception.

        May be raised by a PostProcessor's .run() method to signal an error
        in the postprocessing task.
        """
307
308
class UnavailableVideoError(Exception):
        """Unavailable Format exception.

        Raised when a video is requested in a format that is not available
        for that video.
        """
316
317
class ContentTooShortError(Exception):
        """Content Too Short exception.

        Raised by FileDownloader objects when a downloaded file is smaller
        than the size the server announced, which usually indicates the
        connection was interrupted.
        """
        # Both counts are in bytes.
        downloaded = None
        expected = None

        def __init__(self, downloaded, expected):
                self.downloaded, self.expected = downloaded, expected
332
333
class YoutubeDLHandler(urllib2.HTTPHandler):
        """Handler for HTTP requests and responses.

        This class, when installed with an OpenerDirector, automatically adds
        the standard headers to every HTTP request and handles gzipped and
        deflated responses from web servers. If compression is to be avoided in
        a particular request, the original request in the program code only has
        to include the HTTP header "Youtubedl-No-Compression", which will be
        removed before making the real request.

        Part of this code was copied from:

        http://techknack.net/python-urllib2-handlers/

        Andrew Rowls, the author of that code, agreed to release it to the
        public domain.
        """

        @staticmethod
        def deflate(data):
                # Some servers send raw deflate data, others wrap it in a zlib
                # header; try the raw form first, then fall back.
                try:
                        return zlib.decompress(data, -zlib.MAX_WBITS)
                except zlib.error:
                        return zlib.decompress(data)

        @staticmethod
        def addinfourl_wrapper(stream, headers, url, code):
                # Compatibility shim: older Pythons' addinfourl has no 'code'
                # constructor argument, so set the attribute after the fact.
                if hasattr(urllib2.addinfourl, 'getcode'):
                        return urllib2.addinfourl(stream, headers, url, code)
                ret = urllib2.addinfourl(stream, headers, url)
                ret.code = code
                return ret

        def http_request(self, req):
                # Force our standard headers, replacing any already present.
                for h in std_headers:
                        if h in req.headers:
                                del req.headers[h]
                        req.add_header(h, std_headers[h])
                # The marker header opts this request out of compression; it is
                # internal only and must not reach the server.
                if 'Youtubedl-no-compression' in req.headers:
                        if 'Accept-encoding' in req.headers:
                                del req.headers['Accept-encoding']
                        del req.headers['Youtubedl-no-compression']
                return req

        def http_response(self, req, resp):
                old_resp = resp
                # gzip
                if resp.headers.get('Content-encoding', '') == 'gzip':
                        gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
                        resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
                        resp.msg = old_resp.msg
                # deflate
                if resp.headers.get('Content-encoding', '') == 'deflate':
                        gz = StringIO.StringIO(self.deflate(resp.read()))
                        resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
                        resp.msg = old_resp.msg
                return resp
391
392
393 class FileDownloader(object):
394         """File Downloader class.
395
396         File downloader objects are the ones responsible of downloading the
397         actual video file and writing it to disk if the user has requested
398         it, among some other tasks. In most cases there should be one per
399         program. As, given a video URL, the downloader doesn't know how to
400         extract all the needed information, task that InfoExtractors do, it
401         has to pass the URL to one of them.
402
403         For this, file downloader objects have a method that allows
404         InfoExtractors to be registered in a given order. When it is passed
405         a URL, the file downloader handles it to the first InfoExtractor it
406         finds that reports being able to handle it. The InfoExtractor extracts
407         all the information about the video or videos the URL refers to, and
408         asks the FileDownloader to process the video information, possibly
409         downloading the video.
410
411         File downloaders accept a lot of parameters. In order not to saturate
412         the object constructor with arguments, it receives a dictionary of
413         options instead. These options are available through the params
414         attribute for the InfoExtractors to use. The FileDownloader also
415         registers itself as the downloader in charge for the InfoExtractors
416         that are added to it, so this is a "mutual registration".
417
418         Available options:
419
420         username:         Username for authentication purposes.
421         password:         Password for authentication purposes.
422         usenetrc:         Use netrc for authentication instead.
423         quiet:            Do not print messages to stdout.
424         forceurl:         Force printing final URL.
425         forcetitle:       Force printing title.
426         forcethumbnail:   Force printing thumbnail URL.
427         forcedescription: Force printing description.
428         forcefilename:    Force printing final filename.
429         simulate:         Do not download the video files.
430         format:           Video format code.
431         format_limit:     Highest quality format to try.
432         outtmpl:          Template for output names.
433         ignoreerrors:     Do not stop on download errors.
434         ratelimit:        Download speed limit, in bytes/sec.
435         nooverwrites:     Prevent overwriting files.
436         retries:          Number of times to retry for HTTP error 5xx
437         continuedl:       Try to continue downloads if possible.
438         noprogress:       Do not print the progress bar.
439         playliststart:    Playlist item to start at.
440         playlistend:      Playlist item to end at.
441         matchtitle:       Download only matching titles.
442         rejecttitle:      Reject downloads for matching titles.
443         logtostderr:      Log messages to stderr instead of stdout.
444         consoletitle:     Display progress in console window's titlebar.
445         nopart:           Do not use temporary .part files.
446         updatetime:       Use the Last-modified header to set output file timestamps.
447         writedescription: Write the video description to a .description file
448         writeinfojson:    Write the video description to a .info.json file
449         """
450
451         params = None
452         _ies = []
453         _pps = []
454         _download_retcode = None
455         _num_downloads = None
456         _screen_file = None
457
458         def __init__(self, params):
459                 """Create a FileDownloader object with the given options."""
460                 self._ies = []
461                 self._pps = []
462                 self._download_retcode = 0
463                 self._num_downloads = 0
464                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
465                 self.params = params
466
467         @staticmethod
468         def format_bytes(bytes):
469                 if bytes is None:
470                         return 'N/A'
471                 if type(bytes) is str:
472                         bytes = float(bytes)
473                 if bytes == 0.0:
474                         exponent = 0
475                 else:
476                         exponent = long(math.log(bytes, 1024.0))
477                 suffix = 'bkMGTPEZY'[exponent]
478                 converted = float(bytes) / float(1024 ** exponent)
479                 return '%.2f%s' % (converted, suffix)
480
481         @staticmethod
482         def calc_percent(byte_counter, data_len):
483                 if data_len is None:
484                         return '---.-%'
485                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
486
487         @staticmethod
488         def calc_eta(start, now, total, current):
489                 if total is None:
490                         return '--:--'
491                 dif = now - start
492                 if current == 0 or dif < 0.001: # One millisecond
493                         return '--:--'
494                 rate = float(current) / dif
495                 eta = long((float(total) - float(current)) / rate)
496                 (eta_mins, eta_secs) = divmod(eta, 60)
497                 if eta_mins > 99:
498                         return '--:--'
499                 return '%02d:%02d' % (eta_mins, eta_secs)
500
501         @staticmethod
502         def calc_speed(start, now, bytes):
503                 dif = now - start
504                 if bytes == 0 or dif < 0.001: # One millisecond
505                         return '%10s' % '---b/s'
506                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
507
        @staticmethod
        def best_block_size(elapsed_time, bytes):
                # Adaptively pick the next read size: aim for the measured
                # transfer rate, clamped to [half the last block, twice the
                # last block], and never above 4 MB.
                new_min = max(bytes / 2.0, 1.0)
                new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
                if elapsed_time < 0.001:
                        return long(new_max)
                rate = bytes / elapsed_time
                if rate > new_max:
                        return long(new_max)
                if rate < new_min:
                        return long(new_min)
                return long(rate)
520
521         @staticmethod
522         def parse_bytes(bytestr):
523                 """Parse a string indicating a byte quantity into a long integer."""
524                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
525                 if matchobj is None:
526                         return None
527                 number = float(matchobj.group(1))
528                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
529                 return long(round(number * multiplier))
530
531         def add_info_extractor(self, ie):
532                 """Add an InfoExtractor object to the end of the list."""
533                 self._ies.append(ie)
534                 ie.set_downloader(self)
535
536         def add_post_processor(self, pp):
537                 """Add a PostProcessor object to the end of the chain."""
538                 self._pps.append(pp)
539                 pp.set_downloader(self)
540
        def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
                """Print message to stdout if not in quiet mode."""
                try:
                        if not self.params.get('quiet', False):
                                # skip_eol leaves the cursor on the same line so the
                                # progress display can overwrite itself via '\r'.
                                terminator = [u'\n', u''][skip_eol]
                                print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
                        self._screen_file.flush()
                except (UnicodeEncodeError), err:
                        if not ignore_encoding_errors:
                                raise
551
        def to_stderr(self, message):
                """Print message to stderr."""
                # Encode explicitly: stderr may reject non-ASCII unicode text.
                print >>sys.stderr, message.encode(preferredencoding())
555
        def to_cons_title(self, message):
                """Set console/terminal window title to message."""
                if not self.params.get('consoletitle', False):
                        return
                if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
                        # c_wchar_p() might not be necessary if `message` is
                        # already of type unicode()
                        ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
                elif 'TERM' in os.environ:
                        # xterm-compatible title escape: OSC 0 ; <title> BEL.
                        sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
566
        def fixed_template(self):
                """Checks if the output template is fixed."""
                # "Fixed" means no %(field)s placeholders at all, so every
                # download would overwrite the same output file.
                return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
570
571         def trouble(self, message=None):
572                 """Determine action to take when a download problem appears.
573
574                 Depending on if the downloader has been configured to ignore
575                 download errors or not, this method may throw an exception or
576                 not when errors are found, after printing the message.
577                 """
578                 if message is not None:
579                         self.to_stderr(message)
580                 if not self.params.get('ignoreerrors', False):
581                         raise DownloadError(message)
582                 self._download_retcode = 1
583
584         def slow_down(self, start_time, byte_counter):
585                 """Sleep if the download speed is over the rate limit."""
586                 rate_limit = self.params.get('ratelimit', None)
587                 if rate_limit is None or byte_counter == 0:
588                         return
589                 now = time.time()
590                 elapsed = now - start_time
591                 if elapsed <= 0.0:
592                         return
593                 speed = float(byte_counter) / elapsed
594                 if speed > rate_limit:
595                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
596
597         def temp_name(self, filename):
598                 """Returns a temporary filename for the given filename."""
599                 if self.params.get('nopart', False) or filename == u'-' or \
600                                 (os.path.exists(filename) and not os.path.isfile(filename)):
601                         return filename
602                 return filename + u'.part'
603
604         def undo_temp_name(self, filename):
605                 if filename.endswith(u'.part'):
606                         return filename[:-len(u'.part')]
607                 return filename
608
        def try_rename(self, old_filename, new_filename):
                """Rename the temporary file to its final name, reporting failure
                through self.trouble() instead of letting the exception escape."""
                try:
                        if old_filename == new_filename:
                                return
                        os.rename(old_filename, new_filename)
                except (IOError, OSError), err:
                        self.trouble(u'ERROR: unable to rename file')
616
617         def try_utime(self, filename, last_modified_hdr):
618                 """Try to set the last-modified time of the given file."""
619                 if last_modified_hdr is None:
620                         return
621                 if not os.path.isfile(filename):
622                         return
623                 timestr = last_modified_hdr
624                 if timestr is None:
625                         return
626                 filetime = timeconvert(timestr)
627                 if filetime is None:
628                         return filetime
629                 try:
630                         os.utime(filename, (time.time(), filetime))
631                 except:
632                         pass
633                 return filetime
634
635         def report_writedescription(self, descfn):
636                 """ Report that the description file is being written """
637                 self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
638
639         def report_writeinfojson(self, infofn):
640                 """ Report that the metadata file has been written """
641                 self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
642
643         def report_destination(self, filename):
644                 """Report destination filename."""
645                 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
646
647         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
648                 """Report download progress."""
649                 if self.params.get('noprogress', False):
650                         return
651                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
652                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
653                 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
654                                 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
655
656         def report_resuming_byte(self, resume_len):
657                 """Report attempt to resume at given byte."""
658                 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
659
660         def report_retry(self, count, retries):
661                 """Report retry in case of HTTP error 5xx"""
662                 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
663
        def report_file_already_downloaded(self, file_name):
                """Report file has already been fully downloaded."""
                try:
                        self.to_screen(u'[download] %s has already been downloaded' % file_name)
                except (UnicodeEncodeError), err:
                        # The file name may not be representable in the terminal
                        # encoding; fall back to a generic message.
                        self.to_screen(u'[download] The file has already been downloaded')
670
671         def report_unable_to_resume(self):
672                 """Report it was impossible to resume download."""
673                 self.to_screen(u'[download] Unable to resume')
674
675         def report_finish(self):
676                 """Report download finished."""
677                 if self.params.get('noprogress', False):
678                         self.to_screen(u'[download] Download completed')
679                 else:
680                         self.to_screen(u'')
681
682         def increment_downloads(self):
683                 """Increment the ordinal that assigns a number to each file."""
684                 self._num_downloads += 1
685
	def prepare_filename(self, info_dict):
		"""Generate the output filename from the output template.

		Returns the filled-in filename, or None (after reporting the
		error) when the template cannot be expanded.
		"""
		try:
			template_dict = dict(info_dict)
			# Extra template fields beyond what the extractor provides.
			template_dict['epoch'] = unicode(long(time.time()))
			template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
			filename = self.params['outtmpl'] % template_dict
			return filename
		except (ValueError, KeyError), err:
			self.trouble(u'ERROR: invalid system charset or erroneous output template')
			return None
697
	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor.

		Depending on self.params this prints requested metadata, applies
		the title match/reject filters, writes the description/info.json
		sidecar files, downloads the video data and runs postprocessors.
		"""
		# May be None when the output template is erroneous; the forced
		# printings below still run in that case.
		filename = self.prepare_filename(info_dict)

		# Forced printings
		if self.params.get('forcetitle', False):
			print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forceurl', False):
			print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
			print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcedescription', False) and 'description' in info_dict:
			print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcefilename', False) and filename is not None:
			print filename.encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forceformat', False):
			print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')

		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			return

		if filename is None:
			return

		# Skip videos whose title fails the --match-title / --reject-title filters.
		matchtitle=self.params.get('matchtitle',False)
		rejecttitle=self.params.get('rejecttitle',False)
		title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
		if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
			self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
			return
		if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
			self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
			return

		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists and will be skipped')
			return

		# Create the containing directory if it does not exist yet.
		try:
			dn = os.path.dirname(filename)
			if dn != '' and not os.path.exists(dn):
				os.makedirs(dn)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to create directory ' + unicode(err))
			return

		if self.params.get('writedescription', False):
			try:
				descfn = filename + '.description'
				self.report_writedescription(descfn)
				descfile = open(descfn, 'wb')
				try:
					descfile.write(info_dict['description'].encode('utf-8'))
				finally:
					descfile.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write description file ' + descfn)
				return

		if self.params.get('writeinfojson', False):
			infofn = filename + '.info.json'
			self.report_writeinfojson(infofn)
			# Probe for a usable json module before creating the file.
			try:
				json.dump
			except (NameError,AttributeError):
				self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
				return
			try:
				infof = open(infofn, 'wb')
				try:
					json.dump(info_dict, infof)
				finally:
					infof.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
				return

		if not self.params.get('skip_download', False):
			try:
				# _do_download returns (success, extra_info); extra_info is
				# merged back into info_dict (e.g. 'filetime').
				success,add_data = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
				info_dict.update(add_data)
			except (OSError, IOError), err:
				raise UnavailableVideoError
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self.trouble(u'ERROR: unable to download video data: %s' % str(err))
				return
			except (ContentTooShortError, ), err:
				self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
				return

			if success:
				try:
					self.post_process(filename, info_dict)
				except (PostProcessingError), err:
					self.trouble(u'ERROR: postprocessing: %s' % str(err))
					return
795
796         def download(self, url_list):
797                 """Download a given list of URLs."""
798                 if len(url_list) > 1 and self.fixed_template():
799                         raise SameFileError(self.params['outtmpl'])
800
801                 for url in url_list:
802                         suitable_found = False
803                         for ie in self._ies:
804                                 # Go to next InfoExtractor if not suitable
805                                 if not ie.suitable(url):
806                                         continue
807
808                                 # Suitable InfoExtractor found
809                                 suitable_found = True
810
811                                 # Extract information from URL and process it
812                                 ie.extract(url)
813
814                                 # Suitable InfoExtractor had been found; go to next URL
815                                 break
816
817                         if not suitable_found:
818                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
819
820                 return self._download_retcode
821
822         def post_process(self, filename, ie_info):
823                 """Run the postprocessing chain on the given file."""
824                 info = dict(ie_info)
825                 info['filepath'] = filename
826                 for pp in self._pps:
827                         info = pp.run(info)
828                         if info is None:
829                                 break
830
	def _download_with_rtmpdump(self, filename, url, player_url):
		"""Download an rtmp:// URL by driving the external rtmpdump tool.

		Returns True on success, False on failure.  The stream is written
		to a temporary file which is renamed to filename when complete.
		"""
		self.report_destination(filename)
		tmpfilename = self.temp_name(filename)

		# Check for rtmpdump first
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrupted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		# The [[], [...]][cond] construct below appends the extra
		# arguments only when the condition holds.
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(tmpfilename)
			self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(tmpfilename)
			if prevsize == cursize and retval == 1:
				break
			# Some rtmp streams seem to abort after ~ 99.8%. Don't complain for those
			if prevsize == cursize and retval == 2 and cursize > 1024:
				self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
				retval = 0
				break
		if retval == 0:
			self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
			self.try_rename(tmpfilename, filename)
			return True
		else:
			self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
			return False
867
868         def _do_download(self, filename, url, player_url):
869                 # Check file already present
870                 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
871                         self.report_file_already_downloaded(filename)
872                         return True
873
874                 # Attempt to download using rtmpdump
875                 if url.startswith('rtmp'):
876                         return self._download_with_rtmpdump(filename, url, player_url)
877
878                 tmpfilename = self.temp_name(filename)
879                 stream = None
880
881                 # Do not include the Accept-Encoding header
882                 headers = {'Youtubedl-no-compression': 'True'}
883                 basic_request = urllib2.Request(url, None, headers)
884                 request = urllib2.Request(url, None, headers)
885
886                 # Establish possible resume length
887                 if os.path.isfile(tmpfilename):
888                         resume_len = os.path.getsize(tmpfilename)
889                 else:
890                         resume_len = 0
891
892                 open_mode = 'wb'
893                 if resume_len != 0:
894                         if self.params.get('continuedl', False):
895                                 self.report_resuming_byte(resume_len)
896                                 request.add_header('Range','bytes=%d-' % resume_len)
897                                 open_mode = 'ab'
898                         else:
899                                 resume_len = 0
900
901                 count = 0
902                 retries = self.params.get('retries', 0)
903                 while count <= retries:
904                         # Establish connection
905                         try:
906                                 data = urllib2.urlopen(request)
907                                 break
908                         except (urllib2.HTTPError, ), err:
909                                 if (err.code < 500 or err.code >= 600) and err.code != 416:
910                                         # Unexpected HTTP error
911                                         raise
912                                 elif err.code == 416:
913                                         # Unable to resume (requested range not satisfiable)
914                                         try:
915                                                 # Open the connection again without the range header
916                                                 data = urllib2.urlopen(basic_request)
917                                                 content_length = data.info()['Content-Length']
918                                         except (urllib2.HTTPError, ), err:
919                                                 if err.code < 500 or err.code >= 600:
920                                                         raise
921                                         else:
922                                                 # Examine the reported length
923                                                 if (content_length is not None and
924                                                                 (resume_len - 100 < long(content_length) < resume_len + 100)):
925                                                         # The file had already been fully downloaded.
926                                                         # Explanation to the above condition: in issue #175 it was revealed that
927                                                         # YouTube sometimes adds or removes a few bytes from the end of the file,
928                                                         # changing the file size slightly and causing problems for some users. So
929                                                         # I decided to implement a suggested change and consider the file
930                                                         # completely downloaded if the file size differs less than 100 bytes from
931                                                         # the one in the hard drive.
932                                                         self.report_file_already_downloaded(filename)
933                                                         self.try_rename(tmpfilename, filename)
934                                                         return True
935                                                 else:
936                                                         # The length does not match, we start the download over
937                                                         self.report_unable_to_resume()
938                                                         open_mode = 'wb'
939                                                         break
940                         # Retry
941                         count += 1
942                         if count <= retries:
943                                 self.report_retry(count, retries)
944
945                 if count > retries:
946                         self.trouble(u'ERROR: giving up after %s retries' % retries)
947                         return False
948
949                 data_len = data.info().get('Content-length', None)
950                 if data_len is not None:
951                         data_len = long(data_len) + resume_len
952                 data_len_str = self.format_bytes(data_len)
953                 byte_counter = 0 + resume_len
954                 block_size = 1024
955                 start = time.time()
956                 while True:
957                         # Download and write
958                         before = time.time()
959                         data_block = data.read(block_size)
960                         after = time.time()
961                         if len(data_block) == 0:
962                                 break
963                         byte_counter += len(data_block)
964
965                         # Open file just in time
966                         if stream is None:
967                                 try:
968                                         (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
969                                         assert stream is not None
970                                         filename = self.undo_temp_name(tmpfilename)
971                                         self.report_destination(filename)
972                                 except (OSError, IOError), err:
973                                         self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
974                                         return False
975                         try:
976                                 stream.write(data_block)
977                         except (IOError, OSError), err:
978                                 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
979                                 return False
980                         block_size = self.best_block_size(after - before, len(data_block))
981
982                         # Progress message
983                         percent_str = self.calc_percent(byte_counter, data_len)
984                         eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
985                         speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
986                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
987
988                         # Apply rate limit
989                         self.slow_down(start, byte_counter - resume_len)
990
991                 if stream is None:
992                         self.trouble(u'\nERROR: Did not get any data blocks')
993                         return False
994                 stream.close()
995                 self.report_finish()
996                 if data_len is not None and byte_counter != data_len:
997                         raise ContentTooShortError(byte_counter, long(data_len))
998                 self.try_rename(tmpfilename, filename)
999
1000                 # Update file modification time
1001                 filetime = None
1002                 if self.params.get('updatetime', True):
1003                         filetime = self.try_utime(filename, data.info().get('last-modified', None))
1004
1005                 return True, {'filetime': filetime}
1006
1007
class InfoExtractor(object):
	"""Base class for all information extractors.

	An information extractor receives a URL and distils the video (or
	videos) behind it into a dictionary which is handed to the
	FileDownloader, which in turn may download the video to the file
	system, among other possible outcomes.  The dictionaries must
	include the following fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	The following fields are optional.  Their primary purpose is to
	allow youtube-dl to serve as the backend for a video search
	function, such as the one in youtube2mp3; they are only used when
	their respective forced printing functions are called:

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Concrete extractors redefine _real_initialize() and _real_extract()
	and define a _VALID_URL regexp; they should usually also be added
	to the list of extractors.
	"""

	# Whether _real_initialize() has already run for this instance.
	_ready = False
	# The FileDownloader this extractor reports results to.
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Return True when this extractor can handle the given URL."""
		return re.match(self._VALID_URL, url) is not None

	def initialize(self):
		"""Perform one-time initialization (authentication, etc)."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Initialize if needed, then extract and return the URL info."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Attach the downloader this extractor reports to."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
1076
1077
1078 class YoutubeIE(InfoExtractor):
1079         """Information extractor for youtube.com."""
1080
1081         _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1082         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1083         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1084         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1085         _NETRC_MACHINE = 'youtube'
1086         # Listed in order of quality
1087         _available_formats = ['38', '37', '45', '22', '43', '35', '34', '18', '6', '5', '17', '13']
1088         _video_extensions = {
1089                 '13': '3gp',
1090                 '17': 'mp4',
1091                 '18': 'mp4',
1092                 '22': 'mp4',
1093                 '37': 'mp4',
1094                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1095                 '43': 'webm',
1096                 '45': 'webm',
1097         }
1098         IE_NAME = u'youtube'
1099
1100         def report_lang(self):
1101                 """Report attempt to set language."""
1102                 self._downloader.to_screen(u'[youtube] Setting language')
1103
1104         def report_login(self):
1105                 """Report attempt to log in."""
1106                 self._downloader.to_screen(u'[youtube] Logging in')
1107
1108         def report_age_confirmation(self):
1109                 """Report attempt to confirm age."""
1110                 self._downloader.to_screen(u'[youtube] Confirming age')
1111
1112         def report_video_webpage_download(self, video_id):
1113                 """Report attempt to download video webpage."""
1114                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1115
1116         def report_video_info_webpage_download(self, video_id):
1117                 """Report attempt to download video info webpage."""
1118                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1119
1120         def report_information_extraction(self, video_id):
1121                 """Report attempt to extract video information."""
1122                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1123
1124         def report_unavailable_format(self, video_id, format):
1125                 """Report extracted video URL."""
1126                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1127
1128         def report_rtmp_download(self):
1129                 """Indicate the download will use the RTMP protocol."""
1130                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1131
	def _real_initialize(self):
		"""Set the site language and, when credentials are available, log
		in and confirm the age gate.

		All failures here are soft: a warning (or error) is reported and
		the method returns without raising.
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':         '/',
				'action_login': 'Log In',
				'username':     username,
				'password':     password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the response still contains the login form, the
			# credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':             '/',
				'action_confirm':       'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return
1200
1201         def _real_extract(self, url):
1202                 # Extract video id from URL
1203                 mobj = re.match(self._VALID_URL, url)
1204                 if mobj is None:
1205                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1206                         return
1207                 video_id = mobj.group(2)
1208
1209                 # Get video webpage
1210                 self.report_video_webpage_download(video_id)
1211                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id)
1212                 try:
1213                         video_webpage = urllib2.urlopen(request).read()
1214                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1215                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1216                         return
1217
1218                 # Attempt to extract SWF player URL
1219                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1220                 if mobj is not None:
1221                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1222                 else:
1223                         player_url = None
1224
1225                 # Get video info
1226                 self.report_video_info_webpage_download(video_id)
1227                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1228                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1229                                         % (video_id, el_type))
1230                         request = urllib2.Request(video_info_url)
1231                         try:
1232                                 video_info_webpage = urllib2.urlopen(request).read()
1233                                 video_info = parse_qs(video_info_webpage)
1234                                 if 'token' in video_info:
1235                                         break
1236                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1237                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1238                                 return
1239                 if 'token' not in video_info:
1240                         if 'reason' in video_info:
1241                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1242                         else:
1243                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1244                         return
1245
1246                 # Start extracting information
1247                 self.report_information_extraction(video_id)
1248
1249                 # uploader
1250                 if 'author' not in video_info:
1251                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1252                         return
1253                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1254
1255                 # title
1256                 if 'title' not in video_info:
1257                         self._downloader.trouble(u'ERROR: unable to extract video title')
1258                         return
1259                 video_title = urllib.unquote_plus(video_info['title'][0])
1260                 video_title = video_title.decode('utf-8')
1261                 video_title = sanitize_title(video_title)
1262
1263                 # simplified title
1264                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1265                 simple_title = simple_title.strip(ur'_')
1266
1267                 # thumbnail image
1268                 if 'thumbnail_url' not in video_info:
1269                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1270                         video_thumbnail = ''
1271                 else:   # don't panic if we can't find it
1272                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1273
1274                 # upload date
1275                 upload_date = u'NA'
1276                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1277                 if mobj is not None:
1278                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1279                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1280                         for expression in format_expressions:
1281                                 try:
1282                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1283                                 except:
1284                                         pass
1285
1286                 # description
1287                 try:
1288                         lxml.etree
1289                 except NameError:
1290                         video_description = u'No description available.'
1291                         if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1292                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1293                                 if mobj is not None:
1294                                         video_description = mobj.group(1).decode('utf-8')
1295                 else:
1296                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1297                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1298                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1299                         # TODO use another parser
1300
1301                 # token
1302                 video_token = urllib.unquote_plus(video_info['token'][0])
1303
1304                 # Decide which formats to download
1305                 req_format = self._downloader.params.get('format', None)
1306
1307                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1308                         self.report_rtmp_download()
1309                         video_url_list = [(None, video_info['conn'][0])]
1310                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1311                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1312                         url_data = [parse_qs(uds) for uds in url_data_strs]
1313                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1314                         url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1315
1316                         format_limit = self._downloader.params.get('format_limit', None)
1317                         if format_limit is not None and format_limit in self._available_formats:
1318                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1319                         else:
1320                                 format_list = self._available_formats
1321                         existing_formats = [x for x in format_list if x in url_map]
1322                         if len(existing_formats) == 0:
1323                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1324                                 return
1325                         if req_format is None or req_format == 'best':
1326                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1327                         elif req_format == 'worst':
1328                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1329                         elif req_format in ('-1', 'all'):
1330                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1331                         else:
1332                                 # Specific formats. We pick the first in a slash-delimeted sequence.
1333                                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1334                                 req_formats = req_format.split('/')
1335                                 video_url_list = None
1336                                 for rf in req_formats:
1337                                         if rf in url_map:
1338                                                 video_url_list = [(rf, url_map[rf])]
1339                                                 break
1340                                 if video_url_list is None:
1341                                         self._downloader.trouble(u'ERROR: requested format not available')
1342                                         return
1343                 else:
1344                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1345                         return
1346
1347                 for format_param, video_real_url in video_url_list:
1348                         # At this point we have a new video
1349                         self._downloader.increment_downloads()
1350
1351                         # Extension
1352                         video_extension = self._video_extensions.get(format_param, 'flv')
1353
1354                         try:
1355                                 # Process video information
1356                                 self._downloader.process_info({
1357                                         'id':           video_id.decode('utf-8'),
1358                                         'url':          video_real_url.decode('utf-8'),
1359                                         'uploader':     video_uploader.decode('utf-8'),
1360                                         'upload_date':  upload_date,
1361                                         'title':        video_title,
1362                                         'stitle':       simple_title,
1363                                         'ext':          video_extension.decode('utf-8'),
1364                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1365                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1366                                         'description':  video_description,
1367                                         'player_url':   player_url,
1368                                 })
1369                         except UnavailableVideoError, err:
1370                                 self._downloader.trouble(u'\nERROR: unable to download video')
1371
1372
1373 class MetacafeIE(InfoExtractor):
1374         """Information Extractor for metacafe.com."""
1375
1376         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1377         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1378         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1379         _youtube_ie = None
1380         IE_NAME = u'metacafe'
1381
1382         def __init__(self, youtube_ie, downloader=None):
1383                 InfoExtractor.__init__(self, downloader)
1384                 self._youtube_ie = youtube_ie
1385
1386         def report_disclaimer(self):
1387                 """Report disclaimer retrieval."""
1388                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1389
1390         def report_age_confirmation(self):
1391                 """Report attempt to confirm age."""
1392                 self._downloader.to_screen(u'[metacafe] Confirming age')
1393
1394         def report_download_webpage(self, video_id):
1395                 """Report webpage download."""
1396                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1397
1398         def report_extraction(self, video_id):
1399                 """Report information extraction."""
1400                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1401
1402         def _real_initialize(self):
1403                 # Retrieve disclaimer
1404                 request = urllib2.Request(self._DISCLAIMER)
1405                 try:
1406                         self.report_disclaimer()
1407                         disclaimer = urllib2.urlopen(request).read()
1408                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1409                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1410                         return
1411
1412                 # Confirm age
1413                 disclaimer_form = {
1414                         'filters': '0',
1415                         'submit': "Continue - I'm over 18",
1416                         }
1417                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1418                 try:
1419                         self.report_age_confirmation()
1420                         disclaimer = urllib2.urlopen(request).read()
1421                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1422                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1423                         return
1424
1425         def _real_extract(self, url):
1426                 # Extract id and simplified title from URL
1427                 mobj = re.match(self._VALID_URL, url)
1428                 if mobj is None:
1429                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1430                         return
1431
1432                 video_id = mobj.group(1)
1433
1434                 # Check if video comes from YouTube
1435                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1436                 if mobj2 is not None:
1437                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1438                         return
1439
1440                 # At this point we have a new video
1441                 self._downloader.increment_downloads()
1442
1443                 simple_title = mobj.group(2).decode('utf-8')
1444
1445                 # Retrieve video webpage to extract further information
1446                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1447                 try:
1448                         self.report_download_webpage(video_id)
1449                         webpage = urllib2.urlopen(request).read()
1450                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1451                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1452                         return
1453
1454                 # Extract URL, uploader and title from webpage
1455                 self.report_extraction(video_id)
1456                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1457                 if mobj is not None:
1458                         mediaURL = urllib.unquote(mobj.group(1))
1459                         video_extension = mediaURL[-3:]
1460
1461                         # Extract gdaKey if available
1462                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1463                         if mobj is None:
1464                                 video_url = mediaURL
1465                         else:
1466                                 gdaKey = mobj.group(1)
1467                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1468                 else:
1469                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1470                         if mobj is None:
1471                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1472                                 return
1473                         vardict = parse_qs(mobj.group(1))
1474                         if 'mediaData' not in vardict:
1475                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1476                                 return
1477                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1478                         if mobj is None:
1479                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1480                                 return
1481                         mediaURL = mobj.group(1).replace('\\/', '/')
1482                         video_extension = mediaURL[-3:]
1483                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1484
1485                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1486                 if mobj is None:
1487                         self._downloader.trouble(u'ERROR: unable to extract title')
1488                         return
1489                 video_title = mobj.group(1).decode('utf-8')
1490                 video_title = sanitize_title(video_title)
1491
1492                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1493                 if mobj is None:
1494                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1495                         return
1496                 video_uploader = mobj.group(1)
1497
1498                 try:
1499                         # Process video information
1500                         self._downloader.process_info({
1501                                 'id':           video_id.decode('utf-8'),
1502                                 'url':          video_url.decode('utf-8'),
1503                                 'uploader':     video_uploader.decode('utf-8'),
1504                                 'upload_date':  u'NA',
1505                                 'title':        video_title,
1506                                 'stitle':       simple_title,
1507                                 'ext':          video_extension.decode('utf-8'),
1508                                 'format':       u'NA',
1509                                 'player_url':   None,
1510                         })
1511                 except UnavailableVideoError:
1512                         self._downloader.trouble(u'\nERROR: unable to download video')
1513
1514
1515 class DailymotionIE(InfoExtractor):
1516         """Information Extractor for Dailymotion"""
1517
1518         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1519         IE_NAME = u'dailymotion'
1520
1521         def __init__(self, downloader=None):
1522                 InfoExtractor.__init__(self, downloader)
1523
1524         def report_download_webpage(self, video_id):
1525                 """Report webpage download."""
1526                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1527
1528         def report_extraction(self, video_id):
1529                 """Report information extraction."""
1530                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1531
1532         def _real_initialize(self):
1533                 return
1534
1535         def _real_extract(self, url):
1536                 # Extract id and simplified title from URL
1537                 mobj = re.match(self._VALID_URL, url)
1538                 if mobj is None:
1539                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1540                         return
1541
1542                 # At this point we have a new video
1543                 self._downloader.increment_downloads()
1544                 video_id = mobj.group(1)
1545
1546                 simple_title = mobj.group(2).decode('utf-8')
1547                 video_extension = 'flv'
1548
1549                 # Retrieve video webpage to extract further information
1550                 request = urllib2.Request(url)
1551                 request.add_header('Cookie', 'family_filter=off')
1552                 try:
1553                         self.report_download_webpage(video_id)
1554                         webpage = urllib2.urlopen(request).read()
1555                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1556                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1557                         return
1558
1559                 # Extract URL, uploader and title from webpage
1560                 self.report_extraction(video_id)
1561                 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1562                 if mobj is None:
1563                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1564                         return
1565                 sequence = urllib.unquote(mobj.group(1))
1566                 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1567                 if mobj is None:
1568                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1569                         return
1570                 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1571
1572                 # if needed add http://www.dailymotion.com/ if relative URL
1573
1574                 video_url = mediaURL
1575
1576                 mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
1577                 if mobj is None:
1578                         self._downloader.trouble(u'ERROR: unable to extract title')
1579                         return
1580                 video_title = mobj.group(1).decode('utf-8')
1581                 video_title = sanitize_title(video_title)
1582
1583                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1584                 if mobj is None:
1585                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1586                         return
1587                 video_uploader = mobj.group(1)
1588
1589                 try:
1590                         # Process video information
1591                         self._downloader.process_info({
1592                                 'id':           video_id.decode('utf-8'),
1593                                 'url':          video_url.decode('utf-8'),
1594                                 'uploader':     video_uploader.decode('utf-8'),
1595                                 'upload_date':  u'NA',
1596                                 'title':        video_title,
1597                                 'stitle':       simple_title,
1598                                 'ext':          video_extension.decode('utf-8'),
1599                                 'format':       u'NA',
1600                                 'player_url':   None,
1601                         })
1602                 except UnavailableVideoError:
1603                         self._downloader.trouble(u'\nERROR: unable to download video')
1604
1605
1606 class GoogleIE(InfoExtractor):
1607         """Information extractor for video.google.com."""
1608
1609         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1610         IE_NAME = u'video.google'
1611
1612         def __init__(self, downloader=None):
1613                 InfoExtractor.__init__(self, downloader)
1614
1615         def report_download_webpage(self, video_id):
1616                 """Report webpage download."""
1617                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1618
1619         def report_extraction(self, video_id):
1620                 """Report information extraction."""
1621                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1622
1623         def _real_initialize(self):
1624                 return
1625
1626         def _real_extract(self, url):
1627                 # Extract id from URL
1628                 mobj = re.match(self._VALID_URL, url)
1629                 if mobj is None:
1630                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1631                         return
1632
1633                 # At this point we have a new video
1634                 self._downloader.increment_downloads()
1635                 video_id = mobj.group(1)
1636
1637                 video_extension = 'mp4'
1638
1639                 # Retrieve video webpage to extract further information
1640                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1641                 try:
1642                         self.report_download_webpage(video_id)
1643                         webpage = urllib2.urlopen(request).read()
1644                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1645                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1646                         return
1647
1648                 # Extract URL, uploader, and title from webpage
1649                 self.report_extraction(video_id)
1650                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1651                 if mobj is None:
1652                         video_extension = 'flv'
1653                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1654                 if mobj is None:
1655                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1656                         return
1657                 mediaURL = urllib.unquote(mobj.group(1))
1658                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1659                 mediaURL = mediaURL.replace('\\x26', '\x26')
1660
1661                 video_url = mediaURL
1662
1663                 mobj = re.search(r'<title>(.*)</title>', webpage)
1664                 if mobj is None:
1665                         self._downloader.trouble(u'ERROR: unable to extract title')
1666                         return
1667                 video_title = mobj.group(1).decode('utf-8')
1668                 video_title = sanitize_title(video_title)
1669                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1670
1671                 # Extract video description
1672                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1673                 if mobj is None:
1674                         self._downloader.trouble(u'ERROR: unable to extract video description')
1675                         return
1676                 video_description = mobj.group(1).decode('utf-8')
1677                 if not video_description:
1678                         video_description = 'No description available.'
1679
1680                 # Extract video thumbnail
1681                 if self._downloader.params.get('forcethumbnail', False):
1682                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1683                         try:
1684                                 webpage = urllib2.urlopen(request).read()
1685                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1686                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1687                                 return
1688                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1689                         if mobj is None:
1690                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1691                                 return
1692                         video_thumbnail = mobj.group(1)
1693                 else:   # we need something to pass to process_info
1694                         video_thumbnail = ''
1695
1696                 try:
1697                         # Process video information
1698                         self._downloader.process_info({
1699                                 'id':           video_id.decode('utf-8'),
1700                                 'url':          video_url.decode('utf-8'),
1701                                 'uploader':     u'NA',
1702                                 'upload_date':  u'NA',
1703                                 'title':        video_title,
1704                                 'stitle':       simple_title,
1705                                 'ext':          video_extension.decode('utf-8'),
1706                                 'format':       u'NA',
1707                                 'player_url':   None,
1708                         })
1709                 except UnavailableVideoError:
1710                         self._downloader.trouble(u'\nERROR: unable to download video')
1711
1712
class PhotobucketIE(InfoExtractor):
	"""Information extractor for photobucket.com."""

	# Only URLs whose "current" query parameter names a .flv file are accepted.
	_VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
	IE_NAME = u'photobucket'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

	def _real_initialize(self):
		# Photobucket needs no login or session set-up.
		return

	def _real_extract(self, url):
		"""Extract the media URL, title and uploader from a Photobucket page.

		The .flv filename captured from the URL's "current" parameter serves
		as the video id. Errors are reported via trouble() and abort the
		extraction without raising.
		"""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		video_extension = 'flv'

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		# The direct media URL lives in the "file" parameter of the
		# <link rel="video_src"> element.
		mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1))

		video_url = mediaURL

		# Title and uploader come from the same <title> pattern
		# ("<title> video by <uploader> - Photobucket").
		mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		# Collapse runs of characters outside simple_title_chars into '_'.
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

		video_uploader = mobj.group(2).decode('utf-8')

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader,
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1790
1791
1792 class YahooIE(InfoExtractor):
1793         """Information extractor for video.yahoo.com."""
1794
1795         # _VALID_URL matches all Yahoo! Video URLs
1796         # _VPAGE_URL matches only the extractable '/watch/' URLs
1797         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1798         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1799         IE_NAME = u'video.yahoo'
1800
	def __init__(self, downloader=None):
		# Plain delegation; YahooIE keeps no extra state of its own.
		InfoExtractor.__init__(self, downloader)
1803
1804         def report_download_webpage(self, video_id):
1805                 """Report webpage download."""
1806                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1807
1808         def report_extraction(self, video_id):
1809                 """Report information extraction."""
1810                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1811
1812         def _real_initialize(self):
1813                 return
1814
1815         def _real_extract(self, url, new_video=True):
1816                 # Extract ID from URL
1817                 mobj = re.match(self._VALID_URL, url)
1818                 if mobj is None:
1819                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1820                         return
1821
1822                 # At this point we have a new video
1823                 self._downloader.increment_downloads()
1824                 video_id = mobj.group(2)
1825                 video_extension = 'flv'
1826
1827                 # Rewrite valid but non-extractable URLs as
1828                 # extractable English language /watch/ URLs
1829                 if re.match(self._VPAGE_URL, url) is None:
1830                         request = urllib2.Request(url)
1831                         try:
1832                                 webpage = urllib2.urlopen(request).read()
1833                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1834                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1835                                 return
1836
1837                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1838                         if mobj is None:
1839                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1840                                 return
1841                         yahoo_id = mobj.group(1)
1842
1843                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1844                         if mobj is None:
1845                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1846                                 return
1847                         yahoo_vid = mobj.group(1)
1848
1849                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1850                         return self._real_extract(url, new_video=False)
1851
1852                 # Retrieve video webpage to extract further information
1853                 request = urllib2.Request(url)
1854                 try:
1855                         self.report_download_webpage(video_id)
1856                         webpage = urllib2.urlopen(request).read()
1857                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1858                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1859                         return
1860
1861                 # Extract uploader and title from webpage
1862                 self.report_extraction(video_id)
1863                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1864                 if mobj is None:
1865                         self._downloader.trouble(u'ERROR: unable to extract video title')
1866                         return
1867                 video_title = mobj.group(1).decode('utf-8')
1868                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1869
1870                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1871                 if mobj is None:
1872                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1873                         return
1874                 video_uploader = mobj.group(1).decode('utf-8')
1875
1876                 # Extract video thumbnail
1877                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1878                 if mobj is None:
1879                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1880                         return
1881                 video_thumbnail = mobj.group(1).decode('utf-8')
1882
1883                 # Extract video description
1884                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1885                 if mobj is None:
1886                         self._downloader.trouble(u'ERROR: unable to extract video description')
1887                         return
1888                 video_description = mobj.group(1).decode('utf-8')
1889                 if not video_description:
1890                         video_description = 'No description available.'
1891
1892                 # Extract video height and width
1893                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1894                 if mobj is None:
1895                         self._downloader.trouble(u'ERROR: unable to extract video height')
1896                         return
1897                 yv_video_height = mobj.group(1)
1898
1899                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1900                 if mobj is None:
1901                         self._downloader.trouble(u'ERROR: unable to extract video width')
1902                         return
1903                 yv_video_width = mobj.group(1)
1904
1905                 # Retrieve video playlist to extract media URL
1906                 # I'm not completely sure what all these options are, but we
1907                 # seem to need most of them, otherwise the server sends a 401.
1908                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1909                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1910                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1911                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1912                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1913                 try:
1914                         self.report_download_webpage(video_id)
1915                         webpage = urllib2.urlopen(request).read()
1916                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1917                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1918                         return
1919
1920                 # Extract media URL from playlist XML
1921                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1922                 if mobj is None:
1923                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1924                         return
1925                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1926                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1927
1928                 try:
1929                         # Process video information
1930                         self._downloader.process_info({
1931                                 'id':           video_id.decode('utf-8'),
1932                                 'url':          video_url,
1933                                 'uploader':     video_uploader,
1934                                 'upload_date':  u'NA',
1935                                 'title':        video_title,
1936                                 'stitle':       simple_title,
1937                                 'ext':          video_extension.decode('utf-8'),
1938                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1939                                 'description':  video_description,
1940                                 'thumbnail':    video_thumbnail,
1941                                 'player_url':   None,
1942                         })
1943                 except UnavailableVideoError:
1944                         self._downloader.trouble(u'\nERROR: unable to download video')
1945
1946
1947 class VimeoIE(InfoExtractor):
1948         """Information extractor for vimeo.com."""
1949
1950         # _VALID_URL matches Vimeo URLs
1951         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1952         IE_NAME = u'vimeo'
1953
1954         def __init__(self, downloader=None):
1955                 InfoExtractor.__init__(self, downloader)
1956
1957         def report_download_webpage(self, video_id):
1958                 """Report webpage download."""
1959                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1960
1961         def report_extraction(self, video_id):
1962                 """Report information extraction."""
1963                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1964
1965         def _real_initialize(self):
1966                 return
1967
1968         def _real_extract(self, url, new_video=True):
1969                 # Extract ID from URL
1970                 mobj = re.match(self._VALID_URL, url)
1971                 if mobj is None:
1972                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1973                         return
1974
1975                 # At this point we have a new video
1976                 self._downloader.increment_downloads()
1977                 video_id = mobj.group(1)
1978
1979                 # Retrieve video webpage to extract further information
1980                 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1981                 try:
1982                         self.report_download_webpage(video_id)
1983                         webpage = urllib2.urlopen(request).read()
1984                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1985                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1986                         return
1987
1988                 # Now we begin extracting as much information as we can from what we
1989                 # retrieved. First we extract the information common to all extractors,
1990                 # and latter we extract those that are Vimeo specific.
1991                 self.report_extraction(video_id)
1992
1993                 # Extract title
1994                 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
1995                 if mobj is None:
1996                         self._downloader.trouble(u'ERROR: unable to extract video title')
1997                         return
1998                 video_title = mobj.group(1).decode('utf-8')
1999                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2000
2001                 # Extract uploader
2002                 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2003                 if mobj is None:
2004                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
2005                         return
2006                 video_uploader = mobj.group(1).decode('utf-8')
2007
2008                 # Extract video thumbnail
2009                 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2010                 if mobj is None:
2011                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2012                         return
2013                 video_thumbnail = mobj.group(1).decode('utf-8')
2014
2015                 # # Extract video description
2016                 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2017                 # if mobj is None:
2018                 #       self._downloader.trouble(u'ERROR: unable to extract video description')
2019                 #       return
2020                 # video_description = mobj.group(1).decode('utf-8')
2021                 # if not video_description: video_description = 'No description available.'
2022                 video_description = 'Foo.'
2023
2024                 # Vimeo specific: extract request signature
2025                 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2026                 if mobj is None:
2027                         self._downloader.trouble(u'ERROR: unable to extract request signature')
2028                         return
2029                 sig = mobj.group(1).decode('utf-8')
2030
2031                 # Vimeo specific: Extract request signature expiration
2032                 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2033                 if mobj is None:
2034                         self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2035                         return
2036                 sig_exp = mobj.group(1).decode('utf-8')
2037
2038                 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2039
2040                 try:
2041                         # Process video information
2042                         self._downloader.process_info({
2043                                 'id':           video_id.decode('utf-8'),
2044                                 'url':          video_url,
2045                                 'uploader':     video_uploader,
2046                                 'upload_date':  u'NA',
2047                                 'title':        video_title,
2048                                 'stitle':       simple_title,
2049                                 'ext':          u'mp4',
2050                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2051                                 'description':  video_description,
2052                                 'thumbnail':    video_thumbnail,
2053                                 'description':  video_description,
2054                                 'player_url':   None,
2055                         })
2056                 except UnavailableVideoError:
2057                         self._downloader.trouble(u'ERROR: unable to download video')
2058
2059
2060 class GenericIE(InfoExtractor):
2061         """Generic last-resort information extractor."""
2062
2063         _VALID_URL = r'.*'
2064         IE_NAME = u'generic'
2065
2066         def __init__(self, downloader=None):
2067                 InfoExtractor.__init__(self, downloader)
2068
2069         def report_download_webpage(self, video_id):
2070                 """Report webpage download."""
2071                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2072                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2073
2074         def report_extraction(self, video_id):
2075                 """Report information extraction."""
2076                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2077
2078         def _real_initialize(self):
2079                 return
2080
2081         def _real_extract(self, url):
2082                 # At this point we have a new video
2083                 self._downloader.increment_downloads()
2084
2085                 video_id = url.split('/')[-1]
2086                 request = urllib2.Request(url)
2087                 try:
2088                         self.report_download_webpage(video_id)
2089                         webpage = urllib2.urlopen(request).read()
2090                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2091                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2092                         return
2093                 except ValueError, err:
2094                         # since this is the last-resort InfoExtractor, if
2095                         # this error is thrown, it'll be thrown here
2096                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2097                         return
2098
2099                 self.report_extraction(video_id)
2100                 # Start with something easy: JW Player in SWFObject
2101                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2102                 if mobj is None:
2103                         # Broaden the search a little bit
2104                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2105                 if mobj is None:
2106                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2107                         return
2108
2109                 # It's possible that one of the regexes
2110                 # matched, but returned an empty group:
2111                 if mobj.group(1) is None:
2112                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2113                         return
2114
2115                 video_url = urllib.unquote(mobj.group(1))
2116                 video_id = os.path.basename(video_url)
2117
2118                 # here's a fun little line of code for you:
2119                 video_extension = os.path.splitext(video_id)[1][1:]
2120                 video_id = os.path.splitext(video_id)[0]
2121
2122                 # it's tempting to parse this further, but you would
2123                 # have to take into account all the variations like
2124                 #   Video Title - Site Name
2125                 #   Site Name | Video Title
2126                 #   Video Title - Tagline | Site Name
2127                 # and so on and so forth; it's just not practical
2128                 mobj = re.search(r'<title>(.*)</title>', webpage)
2129                 if mobj is None:
2130                         self._downloader.trouble(u'ERROR: unable to extract title')
2131                         return
2132                 video_title = mobj.group(1).decode('utf-8')
2133                 video_title = sanitize_title(video_title)
2134                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2135
2136                 # video uploader is domain name
2137                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2138                 if mobj is None:
2139                         self._downloader.trouble(u'ERROR: unable to extract title')
2140                         return
2141                 video_uploader = mobj.group(1).decode('utf-8')
2142
2143                 try:
2144                         # Process video information
2145                         self._downloader.process_info({
2146                                 'id':           video_id.decode('utf-8'),
2147                                 'url':          video_url.decode('utf-8'),
2148                                 'uploader':     video_uploader,
2149                                 'upload_date':  u'NA',
2150                                 'title':        video_title,
2151                                 'stitle':       simple_title,
2152                                 'ext':          video_extension.decode('utf-8'),
2153                                 'format':       u'NA',
2154                                 'player_url':   None,
2155                         })
2156                 except UnavailableVideoError, err:
2157                         self._downloader.trouble(u'\nERROR: unable to download video')
2158
2159
2160 class YoutubeSearchIE(InfoExtractor):
2161         """Information Extractor for YouTube search queries."""
2162         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2163         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2164         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2165         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2166         _youtube_ie = None
2167         _max_youtube_results = 1000
2168         IE_NAME = u'youtube:search'
2169
2170         def __init__(self, youtube_ie, downloader=None):
2171                 InfoExtractor.__init__(self, downloader)
2172                 self._youtube_ie = youtube_ie
2173
2174         def report_download_page(self, query, pagenum):
2175                 """Report attempt to download playlist page with given number."""
2176                 query = query.decode(preferredencoding())
2177                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2178
2179         def _real_initialize(self):
2180                 self._youtube_ie.initialize()
2181
2182         def _real_extract(self, query):
2183                 mobj = re.match(self._VALID_URL, query)
2184                 if mobj is None:
2185                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2186                         return
2187
2188                 prefix, query = query.split(':')
2189                 prefix = prefix[8:]
2190                 query = query.encode('utf-8')
2191                 if prefix == '':
2192                         self._download_n_results(query, 1)
2193                         return
2194                 elif prefix == 'all':
2195                         self._download_n_results(query, self._max_youtube_results)
2196                         return
2197                 else:
2198                         try:
2199                                 n = long(prefix)
2200                                 if n <= 0:
2201                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2202                                         return
2203                                 elif n > self._max_youtube_results:
2204                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2205                                         n = self._max_youtube_results
2206                                 self._download_n_results(query, n)
2207                                 return
2208                         except ValueError: # parsing prefix as integer fails
2209                                 self._download_n_results(query, 1)
2210                                 return
2211
2212         def _download_n_results(self, query, n):
2213                 """Downloads a specified number of results for a query"""
2214
2215                 video_ids = []
2216                 already_seen = set()
2217                 pagenum = 1
2218
2219                 while True:
2220                         self.report_download_page(query, pagenum)
2221                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2222                         request = urllib2.Request(result_url)
2223                         try:
2224                                 page = urllib2.urlopen(request).read()
2225                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2226                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2227                                 return
2228
2229                         # Extract video identifiers
2230                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2231                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2232                                 if video_id not in already_seen:
2233                                         video_ids.append(video_id)
2234                                         already_seen.add(video_id)
2235                                         if len(video_ids) == n:
2236                                                 # Specified n videos reached
2237                                                 for id in video_ids:
2238                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2239                                                 return
2240
2241                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2242                                 for id in video_ids:
2243                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2244                                 return
2245
2246                         pagenum = pagenum + 1
2247
2248
2249 class GoogleSearchIE(InfoExtractor):
2250         """Information Extractor for Google Video search queries."""
2251         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2252         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2253         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2254         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2255         _google_ie = None
2256         _max_google_results = 1000
2257         IE_NAME = u'video.google:search'
2258
2259         def __init__(self, google_ie, downloader=None):
2260                 InfoExtractor.__init__(self, downloader)
2261                 self._google_ie = google_ie
2262
2263         def report_download_page(self, query, pagenum):
2264                 """Report attempt to download playlist page with given number."""
2265                 query = query.decode(preferredencoding())
2266                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2267
2268         def _real_initialize(self):
2269                 self._google_ie.initialize()
2270
2271         def _real_extract(self, query):
2272                 mobj = re.match(self._VALID_URL, query)
2273                 if mobj is None:
2274                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2275                         return
2276
2277                 prefix, query = query.split(':')
2278                 prefix = prefix[8:]
2279                 query = query.encode('utf-8')
2280                 if prefix == '':
2281                         self._download_n_results(query, 1)
2282                         return
2283                 elif prefix == 'all':
2284                         self._download_n_results(query, self._max_google_results)
2285                         return
2286                 else:
2287                         try:
2288                                 n = long(prefix)
2289                                 if n <= 0:
2290                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2291                                         return
2292                                 elif n > self._max_google_results:
2293                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2294                                         n = self._max_google_results
2295                                 self._download_n_results(query, n)
2296                                 return
2297                         except ValueError: # parsing prefix as integer fails
2298                                 self._download_n_results(query, 1)
2299                                 return
2300
2301         def _download_n_results(self, query, n):
2302                 """Downloads a specified number of results for a query"""
2303
2304                 video_ids = []
2305                 already_seen = set()
2306                 pagenum = 1
2307
2308                 while True:
2309                         self.report_download_page(query, pagenum)
2310                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2311                         request = urllib2.Request(result_url)
2312                         try:
2313                                 page = urllib2.urlopen(request).read()
2314                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2315                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2316                                 return
2317
2318                         # Extract video identifiers
2319                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2320                                 video_id = mobj.group(1)
2321                                 if video_id not in already_seen:
2322                                         video_ids.append(video_id)
2323                                         already_seen.add(video_id)
2324                                         if len(video_ids) == n:
2325                                                 # Specified n videos reached
2326                                                 for id in video_ids:
2327                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2328                                                 return
2329
2330                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2331                                 for id in video_ids:
2332                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2333                                 return
2334
2335                         pagenum = pagenum + 1
2336
2337
2338 class YahooSearchIE(InfoExtractor):
2339         """Information Extractor for Yahoo! Video search queries."""
2340         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2341         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2342         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2343         _MORE_PAGES_INDICATOR = r'\s*Next'
2344         _yahoo_ie = None
2345         _max_yahoo_results = 1000
2346         IE_NAME = u'video.yahoo:search'
2347
        def __init__(self, yahoo_ie, downloader=None):
                # Store the wrapped YahooIE used to extract each search result.
                InfoExtractor.__init__(self, downloader)
                self._yahoo_ie = yahoo_ie
2351
2352         def report_download_page(self, query, pagenum):
2353                 """Report attempt to download playlist page with given number."""
2354                 query = query.decode(preferredencoding())
2355                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2356
        def _real_initialize(self):
                # Delegate initialization to the wrapped Yahoo! extractor.
                self._yahoo_ie.initialize()
2359
2360         def _real_extract(self, query):
2361                 mobj = re.match(self._VALID_URL, query)
2362                 if mobj is None:
2363                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2364                         return
2365
2366                 prefix, query = query.split(':')
2367                 prefix = prefix[8:]
2368                 query = query.encode('utf-8')
2369                 if prefix == '':
2370                         self._download_n_results(query, 1)
2371                         return
2372                 elif prefix == 'all':
2373                         self._download_n_results(query, self._max_yahoo_results)
2374                         return
2375                 else:
2376                         try:
2377                                 n = long(prefix)
2378                                 if n <= 0:
2379                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2380                                         return
2381                                 elif n > self._max_yahoo_results:
2382                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2383                                         n = self._max_yahoo_results
2384                                 self._download_n_results(query, n)
2385                                 return
2386                         except ValueError: # parsing prefix as integer fails
2387                                 self._download_n_results(query, 1)
2388                                 return
2389
2390         def _download_n_results(self, query, n):
2391                 """Downloads a specified number of results for a query"""
2392
2393                 video_ids = []
2394                 already_seen = set()
2395                 pagenum = 1
2396
2397                 while True:
2398                         self.report_download_page(query, pagenum)
2399                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2400                         request = urllib2.Request(result_url)
2401                         try:
2402                                 page = urllib2.urlopen(request).read()
2403                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2404                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2405                                 return
2406
2407                         # Extract video identifiers
2408                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2409                                 video_id = mobj.group(1)
2410                                 if video_id not in already_seen:
2411                                         video_ids.append(video_id)
2412                                         already_seen.add(video_id)
2413                                         if len(video_ids) == n:
2414                                                 # Specified n videos reached
2415                                                 for id in video_ids:
2416                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2417                                                 return
2418
2419                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2420                                 for id in video_ids:
2421                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2422                                 return
2423
2424                         pagenum = pagenum + 1
2425
2426
2427 class YoutubePlaylistIE(InfoExtractor):
2428         """Information Extractor for YouTube playlists."""
2429
2430         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2431         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2432         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2433         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2434         _youtube_ie = None
2435         IE_NAME = u'youtube:playlist'
2436
2437         def __init__(self, youtube_ie, downloader=None):
2438                 InfoExtractor.__init__(self, downloader)
2439                 self._youtube_ie = youtube_ie
2440
2441         def report_download_page(self, playlist_id, pagenum):
2442                 """Report attempt to download playlist page with given number."""
2443                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2444
2445         def _real_initialize(self):
2446                 self._youtube_ie.initialize()
2447
2448         def _real_extract(self, url):
2449                 # Extract playlist id
2450                 mobj = re.match(self._VALID_URL, url)
2451                 if mobj is None:
2452                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2453                         return
2454
2455                 # Single video case
2456                 if mobj.group(3) is not None:
2457                         self._youtube_ie.extract(mobj.group(3))
2458                         return
2459
2460                 # Download playlist pages
2461                 # prefix is 'p' as default for playlists but there are other types that need extra care
2462                 playlist_prefix = mobj.group(1)
2463                 if playlist_prefix == 'a':
2464                         playlist_access = 'artist'
2465                 else:
2466                         playlist_prefix = 'p'
2467                         playlist_access = 'view_play_list'
2468                 playlist_id = mobj.group(2)
2469                 video_ids = []
2470                 pagenum = 1
2471
2472                 while True:
2473                         self.report_download_page(playlist_id, pagenum)
2474                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2475                         try:
2476                                 page = urllib2.urlopen(request).read()
2477                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2478                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2479                                 return
2480
2481                         # Extract video identifiers
2482                         ids_in_page = []
2483                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2484                                 if mobj.group(1) not in ids_in_page:
2485                                         ids_in_page.append(mobj.group(1))
2486                         video_ids.extend(ids_in_page)
2487
2488                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2489                                 break
2490                         pagenum = pagenum + 1
2491
2492                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2493                 playlistend = self._downloader.params.get('playlistend', -1)
2494                 video_ids = video_ids[playliststart:playlistend]
2495
2496                 for id in video_ids:
2497                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2498                 return
2499
2500
2501 class YoutubeUserIE(InfoExtractor):
2502         """Information Extractor for YouTube users."""
2503
2504         _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2505         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2506         _GDATA_PAGE_SIZE = 50
2507         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2508         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2509         _youtube_ie = None
2510         IE_NAME = u'youtube:user'
2511
2512         def __init__(self, youtube_ie, downloader=None):
2513                 InfoExtractor.__init__(self, downloader)
2514                 self._youtube_ie = youtube_ie
2515
2516         def report_download_page(self, username, start_index):
2517                 """Report attempt to download user page."""
2518                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2519                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2520
2521         def _real_initialize(self):
2522                 self._youtube_ie.initialize()
2523
2524         def _real_extract(self, url):
2525                 # Extract username
2526                 mobj = re.match(self._VALID_URL, url)
2527                 if mobj is None:
2528                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2529                         return
2530
2531                 username = mobj.group(1)
2532
2533                 # Download video ids using YouTube Data API. Result size per
2534                 # query is limited (currently to 50 videos) so we need to query
2535                 # page by page until there are no video ids - it means we got
2536                 # all of them.
2537
2538                 video_ids = []
2539                 pagenum = 0
2540
2541                 while True:
2542                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2543                         self.report_download_page(username, start_index)
2544
2545                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2546
2547                         try:
2548                                 page = urllib2.urlopen(request).read()
2549                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2550                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2551                                 return
2552
2553                         # Extract video identifiers
2554                         ids_in_page = []
2555
2556                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2557                                 if mobj.group(1) not in ids_in_page:
2558                                         ids_in_page.append(mobj.group(1))
2559
2560                         video_ids.extend(ids_in_page)
2561
2562                         # A little optimization - if current page is not
2563                         # "full", ie. does not contain PAGE_SIZE video ids then
2564                         # we can assume that this page is the last one - there
2565                         # are no more ids on further pages - no need to query
2566                         # again.
2567
2568                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2569                                 break
2570
2571                         pagenum += 1
2572
2573                 all_ids_count = len(video_ids)
2574                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2575                 playlistend = self._downloader.params.get('playlistend', -1)
2576
2577                 if playlistend == -1:
2578                         video_ids = video_ids[playliststart:]
2579                 else:
2580                         video_ids = video_ids[playliststart:playlistend]
2581
2582                 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2583                                 (username, all_ids_count, len(video_ids)))
2584
2585                 for video_id in video_ids:
2586                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2587
2588
2589 class DepositFilesIE(InfoExtractor):
2590         """Information extractor for depositfiles.com"""
2591
2592         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2593         IE_NAME = u'DepositFiles'
2594
2595         def __init__(self, downloader=None):
2596                 InfoExtractor.__init__(self, downloader)
2597
2598         def report_download_webpage(self, file_id):
2599                 """Report webpage download."""
2600                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2601
2602         def report_extraction(self, file_id):
2603                 """Report information extraction."""
2604                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2605
2606         def _real_initialize(self):
2607                 return
2608
2609         def _real_extract(self, url):
2610                 # At this point we have a new file
2611                 self._downloader.increment_downloads()
2612
2613                 file_id = url.split('/')[-1]
2614                 # Rebuild url in english locale
2615                 url = 'http://depositfiles.com/en/files/' + file_id
2616
2617                 # Retrieve file webpage with 'Free download' button pressed
2618                 free_download_indication = { 'gateway_result' : '1' }
2619                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2620                 try:
2621                         self.report_download_webpage(file_id)
2622                         webpage = urllib2.urlopen(request).read()
2623                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2624                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2625                         return
2626
2627                 # Search for the real file URL
2628                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2629                 if (mobj is None) or (mobj.group(1) is None):
2630                         # Try to figure out reason of the error.
2631                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2632                         if (mobj is not None) and (mobj.group(1) is not None):
2633                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2634                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2635                         else:
2636                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2637                         return
2638
2639                 file_url = mobj.group(1)
2640                 file_extension = os.path.splitext(file_url)[1][1:]
2641
2642                 # Search for file title
2643                 mobj = re.search(r'<b title="(.*?)">', webpage)
2644                 if mobj is None:
2645                         self._downloader.trouble(u'ERROR: unable to extract title')
2646                         return
2647                 file_title = mobj.group(1).decode('utf-8')
2648
2649                 try:
2650                         # Process file information
2651                         self._downloader.process_info({
2652                                 'id':           file_id.decode('utf-8'),
2653                                 'url':          file_url.decode('utf-8'),
2654                                 'uploader':     u'NA',
2655                                 'upload_date':  u'NA',
2656                                 'title':        file_title,
2657                                 'stitle':       file_title,
2658                                 'ext':          file_extension.decode('utf-8'),
2659                                 'format':       u'NA',
2660                                 'player_url':   None,
2661                         })
2662                 except UnavailableVideoError, err:
2663                         self._downloader.trouble(u'ERROR: unable to download file')
2664
2665
2666 class FacebookIE(InfoExtractor):
2667         """Information Extractor for Facebook"""
2668
2669         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2670         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2671         _NETRC_MACHINE = 'facebook'
2672         _available_formats = ['highqual', 'lowqual']
2673         _video_extensions = {
2674                 'highqual': 'mp4',
2675                 'lowqual': 'mp4',
2676         }
2677         IE_NAME = u'facebook'
2678
2679         def __init__(self, downloader=None):
2680                 InfoExtractor.__init__(self, downloader)
2681
2682         def _reporter(self, message):
2683                 """Add header and report message."""
2684                 self._downloader.to_screen(u'[facebook] %s' % message)
2685
2686         def report_login(self):
2687                 """Report attempt to log in."""
2688                 self._reporter(u'Logging in')
2689
2690         def report_video_webpage_download(self, video_id):
2691                 """Report attempt to download video webpage."""
2692                 self._reporter(u'%s: Downloading video webpage' % video_id)
2693
2694         def report_information_extraction(self, video_id):
2695                 """Report attempt to extract video information."""
2696                 self._reporter(u'%s: Extracting video information' % video_id)
2697
2698         def _parse_page(self, video_webpage):
2699                 """Extract video information from page"""
2700                 # General data
2701                 data = {'title': r'class="video_title datawrap">(.*?)</',
2702                         'description': r'<div class="datawrap">(.*?)</div>',
2703                         'owner': r'\("video_owner_name", "(.*?)"\)',
2704                         'upload_date': r'data-date="(.*?)"',
2705                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2706                         }
2707                 video_info = {}
2708                 for piece in data.keys():
2709                         mobj = re.search(data[piece], video_webpage)
2710                         if mobj is not None:
2711                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2712
2713                 # Video urls
2714                 video_urls = {}
2715                 for fmt in self._available_formats:
2716                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2717                         if mobj is not None:
2718                                 # URL is in a Javascript segment inside an escaped Unicode format within
2719                                 # the generally utf-8 page
2720                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2721                 video_info['video_urls'] = video_urls
2722
2723                 return video_info
2724
2725         def _real_initialize(self):
2726                 if self._downloader is None:
2727                         return
2728
2729                 useremail = None
2730                 password = None
2731                 downloader_params = self._downloader.params
2732
2733                 # Attempt to use provided username and password or .netrc data
2734                 if downloader_params.get('username', None) is not None:
2735                         useremail = downloader_params['username']
2736                         password = downloader_params['password']
2737                 elif downloader_params.get('usenetrc', False):
2738                         try:
2739                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2740                                 if info is not None:
2741                                         useremail = info[0]
2742                                         password = info[2]
2743                                 else:
2744                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2745                         except (IOError, netrc.NetrcParseError), err:
2746                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2747                                 return
2748
2749                 if useremail is None:
2750                         return
2751
2752                 # Log in
2753                 login_form = {
2754                         'email': useremail,
2755                         'pass': password,
2756                         'login': 'Log+In'
2757                         }
2758                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2759                 try:
2760                         self.report_login()
2761                         login_results = urllib2.urlopen(request).read()
2762                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2763                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2764                                 return
2765                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2766                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2767                         return
2768
2769         def _real_extract(self, url):
2770                 mobj = re.match(self._VALID_URL, url)
2771                 if mobj is None:
2772                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2773                         return
2774                 video_id = mobj.group('ID')
2775
2776                 # Get video webpage
2777                 self.report_video_webpage_download(video_id)
2778                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2779                 try:
2780                         page = urllib2.urlopen(request)
2781                         video_webpage = page.read()
2782                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2783                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2784                         return
2785
2786                 # Start extracting information
2787                 self.report_information_extraction(video_id)
2788
2789                 # Extract information
2790                 video_info = self._parse_page(video_webpage)
2791
2792                 # uploader
2793                 if 'owner' not in video_info:
2794                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2795                         return
2796                 video_uploader = video_info['owner']
2797
2798                 # title
2799                 if 'title' not in video_info:
2800                         self._downloader.trouble(u'ERROR: unable to extract video title')
2801                         return
2802                 video_title = video_info['title']
2803                 video_title = video_title.decode('utf-8')
2804                 video_title = sanitize_title(video_title)
2805
2806                 # simplified title
2807                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2808                 simple_title = simple_title.strip(ur'_')
2809
2810                 # thumbnail image
2811                 if 'thumbnail' not in video_info:
2812                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2813                         video_thumbnail = ''
2814                 else:
2815                         video_thumbnail = video_info['thumbnail']
2816
2817                 # upload date
2818                 upload_date = u'NA'
2819                 if 'upload_date' in video_info:
2820                         upload_time = video_info['upload_date']
2821                         timetuple = email.utils.parsedate_tz(upload_time)
2822                         if timetuple is not None:
2823                                 try:
2824                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2825                                 except:
2826                                         pass
2827
2828                 # description
2829                 video_description = video_info.get('description', 'No description available.')
2830
2831                 url_map = video_info['video_urls']
2832                 if len(url_map.keys()) > 0:
2833                         # Decide which formats to download
2834                         req_format = self._downloader.params.get('format', None)
2835                         format_limit = self._downloader.params.get('format_limit', None)
2836
2837                         if format_limit is not None and format_limit in self._available_formats:
2838                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2839                         else:
2840                                 format_list = self._available_formats
2841                         existing_formats = [x for x in format_list if x in url_map]
2842                         if len(existing_formats) == 0:
2843                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2844                                 return
2845                         if req_format is None:
2846                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2847                         elif req_format == 'worst':
2848                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2849                         elif req_format == '-1':
2850                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2851                         else:
2852                                 # Specific format
2853                                 if req_format not in url_map:
2854                                         self._downloader.trouble(u'ERROR: requested format not available')
2855                                         return
2856                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2857
2858                 for format_param, video_real_url in video_url_list:
2859
2860                         # At this point we have a new video
2861                         self._downloader.increment_downloads()
2862
2863                         # Extension
2864                         video_extension = self._video_extensions.get(format_param, 'mp4')
2865
2866                         try:
2867                                 # Process video information
2868                                 self._downloader.process_info({
2869                                         'id':           video_id.decode('utf-8'),
2870                                         'url':          video_real_url.decode('utf-8'),
2871                                         'uploader':     video_uploader.decode('utf-8'),
2872                                         'upload_date':  upload_date,
2873                                         'title':        video_title,
2874                                         'stitle':       simple_title,
2875                                         'ext':          video_extension.decode('utf-8'),
2876                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2877                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2878                                         'description':  video_description.decode('utf-8'),
2879                                         'player_url':   None,
2880                                 })
2881                         except UnavailableVideoError, err:
2882                                 self._downloader.trouble(u'\nERROR: unable to download video')
2883
class BlipTVIE(InfoExtractor):
	"""Information extractor for blip.tv"""

	# Any blip.tv path is accepted; group 1 captures the page path.
	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
	# Captures the filename extension of a media URL.
	_URL_EXT = r'^.*\.([a-z0-9]+)$'
	IE_NAME = u'blip.tv'

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)

	def _simplify_title(self, title):
		"""Return a filesystem-friendly title: runs of characters outside
		simple_title_chars collapse to a single underscore."""
		res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
		res = res.strip(ur'_')
		return res

	def _real_extract(self, url):
		"""Fetch video metadata via blip.tv's JSON API and queue the download."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Append the JSON-API parameters with the appropriate separator.
		if '?' in url:
			cchar = '&'
		else:
			cchar = '?'
		json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
		request = urllib2.Request(json_url)
		self.report_extraction(mobj.group(1))
		try:
			json_code = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
			return
		try:
			# NOTE(review): relies on a module-level 'json' name — confirm it
			# (or a fallback) is imported earlier in this file.
			json_data = json.loads(json_code)
			# Some responses wrap the payload in a 'Post' object.
			if 'Post' in json_data:
				data = json_data['Post']
			else:
				data = json_data

			upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
			video_url = data['media']['url']
			umobj = re.match(self._URL_EXT, video_url)
			if umobj is None:
				# Routed to the common error path via the except clause below.
				raise ValueError('Can not determine filename extension')
			ext = umobj.group(1)

			# At this point we have a new video.
			self._downloader.increment_downloads()

			info = {
				'id': data['item_id'],
				'url': video_url,
				'uploader': data['display_name'],
				'upload_date': upload_date,
				'title': data['title'],
				'stitle': self._simplify_title(data['title']),
				'ext': ext,
				'format': data['media']['mimeType'],
				'thumbnail': data['thumbnailUrl'],
				'description': data['description'],
				'player_url': data['embedUrl']
			}
		except (ValueError,KeyError), err:
			# Missing keys and malformed values share one error message.
			self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
			return

		try:
			self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download video')
2955
2956
2957 class MyVideoIE(InfoExtractor):
2958         """Information Extractor for myvideo.de."""
2959
2960         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2961         IE_NAME = u'myvideo'
2962
	def __init__(self, downloader=None):
		# No extractor-specific state; just initialize the base class.
		InfoExtractor.__init__(self, downloader)
2965         
2966         def report_download_webpage(self, video_id):
2967                 """Report webpage download."""
2968                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2969
2970         def report_extraction(self, video_id):
2971                 """Report information extraction."""
2972                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2973
	def _real_initialize(self):
		# No login or other setup is needed for myvideo.de.
		return
2976
2977         def _real_extract(self,url):
2978                 mobj = re.match(self._VALID_URL, url)
2979                 if mobj is None:
2980                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
2981                         return
2982
2983                 video_id = mobj.group(1)
2984                 simple_title = mobj.group(2).decode('utf-8')
2985                 # should actually not be necessary
2986                 simple_title = sanitize_title(simple_title)
2987                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title)
2988
2989                 # Get video webpage
2990                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2991                 try:
2992                         self.report_download_webpage(video_id)
2993                         webpage = urllib2.urlopen(request).read()
2994                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2995                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2996                         return
2997
2998                 self.report_extraction(video_id)
2999                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3000                                  webpage)
3001                 if mobj is None:
3002                         self._downloader.trouble(u'ERROR: unable to extract media URL')
3003                         return
3004                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3005
3006                 mobj = re.search('<title>([^<]+)</title>', webpage)
3007                 if mobj is None:
3008                         self._downloader.trouble(u'ERROR: unable to extract title')
3009                         return
3010
3011                 video_title = mobj.group(1)
3012                 video_title = sanitize_title(video_title)
3013
3014                 try:
3015                         print(video_url)
3016                         self._downloader.process_info({
3017                                 'id':           video_id,
3018                                 'url':          video_url,
3019                                 'uploader':     u'NA',
3020                                 'upload_date':  u'NA',
3021                                 'title':        video_title,
3022                                 'stitle':       simple_title,
3023                                 'ext':          u'flv',
3024                                 'format':       u'NA',
3025                                 'player_url':   None,
3026                         })
3027                 except UnavailableVideoError:
3028                         self._downloader.trouble(u'\nERROR: Unable to download video')
3029
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Matches either a bare shortname (":tds", ":colbert", ...) that is
	# expanded to the show's full-episodes page, or a full-episodes URL
	# with a specific (possibly empty) episode path.
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	def report_extraction(self, episode_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
	
	def report_config_download(self, episode_id):
		"""Report per-media configuration download."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		"""Report download of the episode's MRSS index feed."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		"""Report resolution of the Flash player URL."""
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _simplify_title(self, title):
		"""Collapse characters outside simple_title_chars into underscores."""
		res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
		res = res.strip(ur'_')
		return res

	def _real_extract(self, url):
		"""Resolve the episode page and download every media item in it."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# A ":tds"-style shortname means "newest full episode" -- rewrite
		# to the show's full-episodes landing page and re-match.
		if mobj.group('shortname'):
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = 'http://www.thedailyshow.com/full-episodes/'
			else:
				url = 'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# No explicit episode path => the site will redirect us to the
		# newest episode; we pick up the final URL after the fetch below.
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			return
		if dlNewest:
			# Follow the redirect to the concrete episode page and take the
			# episode title from the redirected URL.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		# The Flash <param> tag carries both the full player URL and the
		# mgid-style media URI after the host part.
		mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
			return

		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			# Resolve redirects so rtmpdump gets the final player URL.
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
			return

		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
			return

		# Each <item> in the MRSS index is one media segment of the episode.
		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			# Per-media config XML lists the available renditions (bitrates).
			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
						urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
				return

			cdoc = xml.etree.ElementTree.fromstring(configXml)
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			# For now, just pick the highest bitrate
			format,video_url = turls[-1]

			self._downloader.increment_downloads()

			effTitle = showId + '-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'stitle': self._simplify_title(effTitle),
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': playerUrl
			}

			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				# Keep going: later segments of the episode may still work.
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
				continue
3169
3170
3171 class EscapistIE(InfoExtractor):
3172         """Information extractor for The Escapist """
3173
3174         _VALID_URL = r'^(https?://)?(www\.)escapistmagazine.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?].*$'
3175         IE_NAME = u'escapist'
3176
3177         def report_extraction(self, showName):
3178                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3179
3180         def report_config_download(self, showName):
3181                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3182
3183         def _simplify_title(self, title):
3184                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3185                 res = res.strip(ur'_')
3186                 return res
3187
3188         def _real_extract(self, url):
3189                 htmlParser = HTMLParser.HTMLParser()
3190
3191                 mobj = re.match(self._VALID_URL, url)
3192                 if mobj is None:
3193                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3194                         return
3195                 showName = mobj.group('showname')
3196                 videoId = mobj.group('episode')
3197
3198                 self.report_extraction(showName)
3199                 try:
3200                         webPage = urllib2.urlopen(url).read()
3201                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3202                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3203                         return
3204
3205                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3206                 description = htmlParser.unescape(descMatch.group(1))
3207                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3208                 imgUrl = htmlParser.unescape(imgMatch.group(1))
3209                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3210                 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3211                 configUrlMatch = re.search('config=(.*)$', playerUrl)
3212                 configUrl = urllib2.unquote(configUrlMatch.group(1))
3213
3214                 self.report_config_download(showName)
3215                 try:
3216                         configJSON = urllib2.urlopen(configUrl).read()
3217                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3218                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3219                         return
3220
3221                 # Technically, it's JavaScript, not JSON
3222                 configJSON = configJSON.replace("'", '"')
3223
3224                 try:
3225                         config = json.loads(configJSON)
3226                 except (ValueError,), err:
3227                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3228                         return
3229
3230                 playlist = config['playlist']
3231                 videoUrl = playlist[1]['url']
3232
3233                 self._downloader.increment_downloads()
3234                 info = {
3235                         'id': videoId,
3236                         'url': videoUrl,
3237                         'uploader': showName,
3238                         'upload_date': None,
3239                         'title': showName,
3240                         'stitle': self._simplify_title(showName),
3241                         'ext': 'flv',
3242                         'format': 'flv',
3243                         'thumbnail': imgUrl,
3244                         'description': description,
3245                         'player_url': playerUrl,
3246                 }
3247
3248                 try:
3249                         self._downloader.process_info(info)
3250                 except UnavailableVideoError, err:
3251                         self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3252
3253
3254
class PostProcessor(object):
	"""Base class for postprocessing steps.

	Instances are registered on a downloader through its
	add_post_processor() method. After every successful download the
	downloader walks its postprocessor chain, feeding each run() the
	value returned by the previous one, and stops as soon as a run()
	returns None or the chain is exhausted.

	Like InfoExtractor, this class follows a "mutual registration"
	scheme: the downloader knows its postprocessors and every
	postprocessor holds a reference back to its downloader.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach this postprocessor to the given downloader."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file.

		The "information" argument is an InfoExtractor-style
		dictionary with one extra key, "filepath", naming the
		downloaded file on disk.

		Return None to stop the postprocessing chain, or an
		(optionally modified) information dictionary to hand to the
		next postprocessor in the chain. Implementations may also
		raise PostProcessingError, which the owning downloader
		handles.
		"""
		return information # the base class is a pass-through no-op
3300
3301
class FFmpegExtractAudioPP(PostProcessor):
	"""Postprocessor that extracts the audio track of a downloaded video.

	Uses ffprobe to detect the source audio codec, then ffmpeg to either
	stream-copy (lossless, when the source is already aac/mp3 and that is
	acceptable) or transcode the audio into a standalone file.
	"""

	def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec      # 'best', 'aac' or 'mp3'
		self._preferredquality = preferredquality  # ffmpeg bitrate spec, e.g. '128k'
		self._keepvideo = keepvideo                # keep the source video file?

	@staticmethod
	def get_audio_codec(path):
		"""Return the codec name of the first audio stream in path, or None."""
		try:
			cmd = ['ffprobe', '-show_streams', '--', path]
			# open() instead of the Python-2-only file() builtin; behavior
			# is identical under Python 2.
			handle = subprocess.Popen(cmd, stderr=open(os.path.devnull, 'w'), stdout=subprocess.PIPE)
			output = handle.communicate()[0]
			if handle.wait() != 0:
				return None
		except (IOError, OSError):
			# ffprobe missing or not executable.
			return None
		audio_codec = None
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				# codec_name precedes codec_type within each stream section,
				# so audio_codec currently holds this audio stream's codec.
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Run ffmpeg to extract audio from path into out_path; return success."""
		try:
			cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
			ret = subprocess.call(cmd, stdout=open(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
			return (ret == 0)
		except (IOError, OSError):
			return False

	def run(self, information):
		"""Extract audio from information['filepath']; see PostProcessor.run()."""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
			if filecodec == 'aac' or filecodec == 'mp3':
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					more_opts = ['-f', 'adts']
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = []
				if self._preferredquality is not None:
					more_opts += ['-ab', self._preferredquality]
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = []
			if self._preferredquality is not None:
				more_opts += ['-ab', self._preferredquality]
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']

		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		status = self.run_ffmpeg(path, new_path, acodec, more_opts)

		if not status:
			self._downloader.to_stderr(u'WARNING: error running ffmpeg')
			return None

		# Try to update the date time for extracted audio file.
		if information.get('filetime') is not None:
			try:
				os.utime(new_path, (time.time(), information['filetime']))
			except Exception:
				# Fixed: was a bare except, which also swallowed
				# KeyboardInterrupt/SystemExit. Best-effort only.
				self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

		if not self._keepvideo:
			try:
				os.remove(path)
			except (IOError, OSError):
				self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
				return None

		information['filepath'] = new_path
		return information
3397
3398
def updateSelf(downloader, filename):
	'''Update the program file with the latest version from the repository

	Exits the process via sys.exit() on any failure: no write permission
	on filename, download error, or write error.
	'''
	# Note: downloader only used for options
	if not os.access(filename, os.W_OK):
		sys.exit('ERROR: no write permissions on %s' % filename)

	downloader.to_screen('Updating to latest version...')

	urlh = None
	try:
		try:
			urlh = urllib.urlopen(UPDATE_URL)
			newcontent = urlh.read()
		finally:
			# Fixed: if urlopen() itself raised, urlh was unbound and this
			# finally clause died with a NameError that masked the real
			# download error (and escaped the except below).
			if urlh is not None:
				urlh.close()
	except (IOError, OSError):
		sys.exit('ERROR: unable to download latest version')

	try:
		outf = open(filename, 'wb')
		try:
			outf.write(newcontent)
		finally:
			outf.close()
	except (IOError, OSError):
		sys.exit('ERROR: unable to overwrite current version')

	downloader.to_screen('Updated youtube-dl. Restart to use the new version.')
3426
3427 def parseOpts():
3428         # Deferred imports
3429         import getpass
3430         import optparse
3431
3432         def _format_option_string(option):
3433                 ''' ('-o', '--option') -> -o, --format METAVAR'''
3434
3435                 opts = []
3436
3437                 if option._short_opts: opts.append(option._short_opts[0])
3438                 if option._long_opts: opts.append(option._long_opts[0])
3439                 if len(opts) > 1: opts.insert(1, ', ')
3440
3441                 if option.takes_value(): opts.append(' %s' % option.metavar)
3442
3443                 return "".join(opts)
3444
3445         def _find_term_columns():
3446                 columns = os.environ.get('COLUMNS', None)
3447                 if columns:
3448                         return int(columns)
3449
3450                 try:
3451                         sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
3452                         out,err = sp.communicate()
3453                         return int(out.split()[1])
3454                 except:
3455                         pass
3456                 return None
3457
3458         max_width = 80
3459         max_help_position = 80
3460
3461         # No need to wrap help messages if we're on a wide console
3462         columns = _find_term_columns()
3463         if columns: max_width = columns
3464
3465         fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
3466         fmt.format_option_strings = _format_option_string
3467
3468         kw = {
3469                 'version'   : __version__,
3470                 'formatter' : fmt,
3471                 'usage' : '%prog [options] url [url...]',
3472                 'conflict_handler' : 'resolve',
3473         }
3474
3475         parser = optparse.OptionParser(**kw)
3476
3477         # option groups
3478         general        = optparse.OptionGroup(parser, 'General Options')
3479         selection      = optparse.OptionGroup(parser, 'Video Selection')
3480         authentication = optparse.OptionGroup(parser, 'Authentication Options')
3481         video_format   = optparse.OptionGroup(parser, 'Video Format Options')
3482         postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
3483         filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
3484         verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
3485
3486         general.add_option('-h', '--help',
3487                         action='help', help='print this help text and exit')
3488         general.add_option('-v', '--version',
3489                         action='version', help='print program version and exit')
3490         general.add_option('-U', '--update',
3491                         action='store_true', dest='update_self', help='update this program to latest version')
3492         general.add_option('-i', '--ignore-errors',
3493                         action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
3494         general.add_option('-r', '--rate-limit',
3495                         dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
3496         general.add_option('-R', '--retries',
3497                         dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
3498         general.add_option('--dump-user-agent',
3499                         action='store_true', dest='dump_user_agent',
3500                         help='display the current browser identification', default=False)
3501         general.add_option('--list-extractors',
3502                         action='store_true', dest='list_extractors',
3503                         help='List all supported extractors and the URLs they would handle', default=False)
3504
3505         selection.add_option('--playlist-start',
3506                         dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
3507         selection.add_option('--playlist-end',
3508                         dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
3509         selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
3510         selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
3511
3512         authentication.add_option('-u', '--username',
3513                         dest='username', metavar='USERNAME', help='account username')
3514         authentication.add_option('-p', '--password',
3515                         dest='password', metavar='PASSWORD', help='account password')
3516         authentication.add_option('-n', '--netrc',
3517                         action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
3518
3519
3520         video_format.add_option('-f', '--format',
3521                         action='store', dest='format', metavar='FORMAT', help='video format code')
3522         video_format.add_option('--all-formats',
3523                         action='store_const', dest='format', help='download all available video formats', const='all')
3524         video_format.add_option('--max-quality',
3525                         action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
3526
3527
3528         verbosity.add_option('-q', '--quiet',
3529                         action='store_true', dest='quiet', help='activates quiet mode', default=False)
3530         verbosity.add_option('-s', '--simulate',
3531                         action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
3532         verbosity.add_option('--skip-download',
3533                         action='store_true', dest='skip_download', help='do not download the video', default=False)
3534         verbosity.add_option('-g', '--get-url',
3535                         action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
3536         verbosity.add_option('-e', '--get-title',
3537                         action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
3538         verbosity.add_option('--get-thumbnail',
3539                         action='store_true', dest='getthumbnail',
3540                         help='simulate, quiet but print thumbnail URL', default=False)
3541         verbosity.add_option('--get-description',
3542                         action='store_true', dest='getdescription',
3543                         help='simulate, quiet but print video description', default=False)
3544         verbosity.add_option('--get-filename',
3545                         action='store_true', dest='getfilename',
3546                         help='simulate, quiet but print output filename', default=False)
3547         verbosity.add_option('--get-format',
3548                         action='store_true', dest='getformat',
3549                         help='simulate, quiet but print output format', default=False)
3550         verbosity.add_option('--no-progress',
3551                         action='store_true', dest='noprogress', help='do not print progress bar', default=False)
3552         verbosity.add_option('--console-title',
3553                         action='store_true', dest='consoletitle',
3554                         help='display progress in console titlebar', default=False)
3555
3556
3557         filesystem.add_option('-t', '--title',
3558                         action='store_true', dest='usetitle', help='use title in file name', default=False)
3559         filesystem.add_option('-l', '--literal',
3560                         action='store_true', dest='useliteral', help='use literal title in file name', default=False)
3561         filesystem.add_option('-A', '--auto-number',
3562                         action='store_true', dest='autonumber',
3563                         help='number downloaded files starting from 00000', default=False)
3564         filesystem.add_option('-o', '--output',
3565                         dest='outtmpl', metavar='TEMPLATE', help='output filename template')
3566         filesystem.add_option('-a', '--batch-file',
3567                         dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
3568         filesystem.add_option('-w', '--no-overwrites',
3569                         action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
3570         filesystem.add_option('-c', '--continue',
3571                         action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
3572         filesystem.add_option('--cookies',
3573                         dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
3574         filesystem.add_option('--no-part',
3575                         action='store_true', dest='nopart', help='do not use .part files', default=False)
3576         filesystem.add_option('--no-mtime',
3577                         action='store_false', dest='updatetime',
3578                         help='do not use the Last-modified header to set the file modification time', default=True)
3579         filesystem.add_option('--write-description',
3580                         action='store_true', dest='writedescription',
3581                         help='write video description to a .description file', default=False)
3582         filesystem.add_option('--write-info-json',
3583                         action='store_true', dest='writeinfojson',
3584                         help='write video metadata to a .info.json file', default=False)
3585
3586
3587         postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
3588                         help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
3589         postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
3590                         help='"best", "aac" or "mp3"; best by default')
3591         postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
3592                         help='ffmpeg audio bitrate specification, 128k by default')
3593         postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
3594                         help='keeps the video file on disk after the post-processing; the video is erased by default')
3595
3596
3597         parser.add_option_group(general)
3598         parser.add_option_group(selection)
3599         parser.add_option_group(filesystem)
3600         parser.add_option_group(verbosity)
3601         parser.add_option_group(video_format)
3602         parser.add_option_group(authentication)
3603         parser.add_option_group(postproc)
3604
3605         opts, args = parser.parse_args()
3606
3607         return parser, opts, args
3608
def gen_extractors():
	"""Build and return one instance of every supported info extractor.

	Order is significant: the first extractor whose suitable() accepts a
	given URL is the one that handles it, so the catch-all GenericIE must
	always come last.
	"""
	yt = YoutubeIE()
	goog = GoogleIE()
	yhoo = YahooIE()

	# Extractors that wrap or search another site share that site's
	# primary IE instance (e.g. Metacafe embeds YouTube videos).
	extractors = [yt]
	extractors.extend([
		MetacafeIE(yt),
		DailymotionIE(),
		YoutubePlaylistIE(yt),
		YoutubeUserIE(yt),
		YoutubeSearchIE(yt),
		goog,
		GoogleSearchIE(goog),
		PhotobucketIE(),
		yhoo,
		YahooSearchIE(yhoo),
		DepositFilesIE(),
		FacebookIE(),
		BlipTVIE(),
		VimeoIE(),
		MyVideoIE(),
		ComedyCentralIE(),
		EscapistIE(),
	])

	# Generic fallback: must be tried after every site-specific extractor.
	extractors.append(GenericIE())
	return extractors
3638
3639 def main():
3640         parser, opts, args = parseOpts()
3641
3642         # Open appropriate CookieJar
3643         if opts.cookiefile is None:
3644                 jar = cookielib.CookieJar()
3645         else:
3646                 try:
3647                         jar = cookielib.MozillaCookieJar(opts.cookiefile)
3648                         if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3649                                 jar.load()
3650                 except (IOError, OSError), err:
3651                         sys.exit(u'ERROR: unable to open cookie file')
3652
3653         # Dump user agent
3654         if opts.dump_user_agent:
3655                 print std_headers['User-Agent']
3656                 sys.exit(0)
3657
3658         # Batch file verification
3659         batchurls = []
3660         if opts.batchfile is not None:
3661                 try:
3662                         if opts.batchfile == '-':
3663                                 batchfd = sys.stdin
3664                         else:
3665                                 batchfd = open(opts.batchfile, 'r')
3666                         batchurls = batchfd.readlines()
3667                         batchurls = [x.strip() for x in batchurls]
3668                         batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3669                 except IOError:
3670                         sys.exit(u'ERROR: batch file could not be read')
3671         all_urls = batchurls + args
3672
3673         # General configuration
3674         cookie_processor = urllib2.HTTPCookieProcessor(jar)
3675         opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
3676         urllib2.install_opener(opener)
3677         socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3678
3679         extractors = gen_extractors()
3680
3681         if opts.list_extractors:
3682                 for ie in extractors:
3683                         print(ie.IE_NAME)
3684                         matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
3685                         all_urls = filter(lambda url: url not in matchedUrls, all_urls)
3686                         for mu in matchedUrls:
3687                                 print(u'  ' + mu)
3688                 sys.exit(0)
3689
3690         # Conflicting, missing and erroneous options
3691         if opts.usenetrc and (opts.username is not None or opts.password is not None):
3692                 parser.error(u'using .netrc conflicts with giving username/password')
3693         if opts.password is not None and opts.username is None:
3694                 parser.error(u'account username missing')
3695         if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3696                 parser.error(u'using output template conflicts with using title, literal title or auto number')
3697         if opts.usetitle and opts.useliteral:
3698                 parser.error(u'using title conflicts with using literal title')
3699         if opts.username is not None and opts.password is None:
3700                 opts.password = getpass.getpass(u'Type account password and press return:')
3701         if opts.ratelimit is not None:
3702                 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3703                 if numeric_limit is None:
3704                         parser.error(u'invalid rate limit specified')
3705                 opts.ratelimit = numeric_limit
3706         if opts.retries is not None:
3707                 try:
3708                         opts.retries = long(opts.retries)
3709                 except (TypeError, ValueError), err:
3710                         parser.error(u'invalid retry count specified')
3711         try:
3712                 opts.playliststart = int(opts.playliststart)
3713                 if opts.playliststart <= 0:
3714                         raise ValueError(u'Playlist start must be positive')
3715         except (TypeError, ValueError), err:
3716                 parser.error(u'invalid playlist start number specified')
3717         try:
3718                 opts.playlistend = int(opts.playlistend)
3719                 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3720                         raise ValueError(u'Playlist end must be greater than playlist start')
3721         except (TypeError, ValueError), err:
3722                 parser.error(u'invalid playlist end number specified')
3723         if opts.extractaudio:
3724                 if opts.audioformat not in ['best', 'aac', 'mp3']:
3725                         parser.error(u'invalid audio format specified')
3726
3727         # File downloader
3728         fd = FileDownloader({
3729                 'usenetrc': opts.usenetrc,
3730                 'username': opts.username,
3731                 'password': opts.password,
3732                 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
3733                 'forceurl': opts.geturl,
3734                 'forcetitle': opts.gettitle,
3735                 'forcethumbnail': opts.getthumbnail,
3736                 'forcedescription': opts.getdescription,
3737                 'forcefilename': opts.getfilename,
3738                 'forceformat': opts.getformat,
3739                 'simulate': opts.simulate,
3740                 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
3741                 'format': opts.format,
3742                 'format_limit': opts.format_limit,
3743                 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3744                         or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3745                         or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3746                         or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3747                         or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3748                         or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3749                         or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3750                         or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3751                         or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3752                         or u'%(id)s.%(ext)s'),
3753                 'ignoreerrors': opts.ignoreerrors,
3754                 'ratelimit': opts.ratelimit,
3755                 'nooverwrites': opts.nooverwrites,
3756                 'retries': opts.retries,
3757                 'continuedl': opts.continue_dl,
3758                 'noprogress': opts.noprogress,
3759                 'playliststart': opts.playliststart,
3760                 'playlistend': opts.playlistend,
3761                 'logtostderr': opts.outtmpl == '-',
3762                 'consoletitle': opts.consoletitle,
3763                 'nopart': opts.nopart,
3764                 'updatetime': opts.updatetime,
3765                 'writedescription': opts.writedescription,
3766                 'writeinfojson': opts.writeinfojson,
3767                 'matchtitle': opts.matchtitle,
3768                 'rejecttitle': opts.rejecttitle,
3769                 })
3770         for extractor in extractors:
3771                 fd.add_info_extractor(extractor)
3772
3773         # PostProcessors
3774         if opts.extractaudio:
3775                 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
3776
3777         # Update version
3778         if opts.update_self:
3779                 updateSelf(fd, sys.argv[0])
3780
3781         # Maybe do nothing
3782         if len(all_urls) < 1:
3783                 if not opts.update_self:
3784                         parser.error(u'you must provide at least one URL')
3785                 else:
3786                         sys.exit()
3787         retcode = fd.download(all_urls)
3788
3789         # Dump cookie jar if requested
3790         if opts.cookiefile is not None:
3791                 try:
3792                         jar.save()
3793                 except (IOError, OSError), err:
3794                         sys.exit(u'ERROR: unable to save cookie jar')
3795
3796         sys.exit(retcode)
3797
3798
# Script entry point: run main() and translate the known fatal exceptions
# into exit statuses / messages for the shell.
if __name__ == '__main__':
	try:
		main()
	except DownloadError:
		# Presumably the error details were already printed by the
		# downloader (TODO confirm); here we only signal failure.
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')
3808
3809 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: