Support for The Escapist
[youtube-dl.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
# Contributors, in roughly chronological order of first contribution.
__author__  = (
        'Ricardo Garcia Gonzalez',
        'Danny Colligan',
        'Benjamin Johnson',
        'Vasyl\' Vavrychuk',
        'Witold Baryluk',
        'Paweł Paprota',
        'Gergely Imreh',
        'Rogério Brito',
        'Philipp Hagemeister',
        'Sören Schulze',
        )

__license__ = 'Public Domain'
__version__ = '2011.09.14'

# Location of the current script on GitHub; presumably fetched by a
# self-update routine defined elsewhere in this file — not visible here.
UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
21
22 import cookielib
23 import datetime
24 import gzip
25 import htmlentitydefs
26 import HTMLParser
27 import httplib
28 import locale
29 import math
30 import netrc
31 import os
32 import os.path
33 import re
34 import socket
35 import string
36 import subprocess
37 import sys
38 import time
39 import urllib
40 import urllib2
41 import warnings
42 import zlib
43
44 if os.name == 'nt':
45         import ctypes
46
47 try:
48         import email.utils
49 except ImportError: # Python 2.4
50         import email.Utils
51 try:
52         import cStringIO as StringIO
53 except ImportError:
54         import StringIO
55
56 # parse_qs was moved from the cgi module to the urlparse module recently.
57 try:
58         from urlparse import parse_qs
59 except ImportError:
60         from cgi import parse_qs
61
62 try:
63         import lxml.etree
64 except ImportError:
65         pass # Handled below
66
67 try:
68         import xml.etree.ElementTree
69 except ImportError: # Python<2.5
70         pass # Not officially supported, but let it slip
71
# Default HTTP headers applied to every outgoing request; YoutubeDLHandler
# (below) deletes any existing values and re-adds these on each request.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-us,en;q=0.5',
}

# ASCII letters and digits as a unicode string; presumably the whitelist for
# "simple" title sanitization handled elsewhere — not visible in this chunk.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
81
try:
        import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
        import re
        # Minimal stand-in exposing only json.loads(), the single entry point
        # this script needs. Implemented as a hand-written recursive-descent
        # parser over the decoded unicode input string `s`.
        class json(object):
                @staticmethod
                def loads(s):
                        """Parse a UTF-8 encoded JSON document and return the Python value."""
                        s = s.decode('UTF-8')
                        # Raise a ValueError that shows where in the input parsing failed.
                        def raiseError(msg, i):
                                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
                        # Advance past JSON whitespace; optionally fail on premature end.
                        def skipSpace(i, expectMore=True):
                                while i < len(s) and s[i] in ' \t\r\n':
                                        i += 1
                                if expectMore:
                                        if i >= len(s):
                                                raiseError('Premature end', i)
                                return i
                        # re.sub callback: turn one backslash escape into its character.
                        def decodeEscape(match):
                                esc = match.group(1)
                                _STATIC = {
                                        '"': '"',
                                        '\\': '\\',
                                        '/': '/',
                                        'b': unichr(0x8),
                                        'f': unichr(0xc),
                                        'n': '\n',
                                        'r': '\r',
                                        't': '\t',
                                }
                                if esc in _STATIC:
                                        return _STATIC[esc]
                                if esc[0] == 'u':
                                        if len(esc) == 1+4:
                                                return unichr(int(esc[1:5], 16))
                                        # Two \uXXXX escapes forming a UTF-16 surrogate pair.
                                        if len(esc) == 5+6 and esc[5:7] == '\\u':
                                                hi = int(esc[1:5], 16)
                                                low = int(esc[7:11], 16)
                                                return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                                raise ValueError('Unknown escape ' + str(esc))
                        # Parse a string starting at the opening quote at index i.
                        def parseString(i):
                                i += 1
                                e = i
                                # Find the closing quote: a quote preceded by an even
                                # number of backslashes is unescaped and terminates.
                                while True:
                                        e = s.index('"', e)
                                        bslashes = 0
                                        while s[e-bslashes-1] == '\\':
                                                bslashes += 1
                                        if bslashes % 2 == 1:
                                                e += 1
                                                continue
                                        break
                                # Surrogate-pair escapes first, then single escapes.
                                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                                stri = rexp.sub(decodeEscape, s[i:e])
                                return (e+1,stri)
                        # Parse an object starting at the '{' at index i.
                        def parseObj(i):
                                i += 1
                                res = {}
                                i = skipSpace(i)
                                if s[i] == '}': # Empty dictionary
                                        return (i+1,res)
                                while True:
                                        if s[i] != '"':
                                                raiseError('Expected a string object key', i)
                                        i,key = parseString(i)
                                        i = skipSpace(i)
                                        if i >= len(s) or s[i] != ':':
                                                raiseError('Expected a colon', i)
                                        i,val = parse(i+1)
                                        res[key] = val
                                        i = skipSpace(i)
                                        if s[i] == '}':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected comma or closing curly brace', i)
                                        i = skipSpace(i+1)
                        # Parse an array starting at the '[' at index i.
                        def parseArray(i):
                                res = []
                                i = skipSpace(i+1)
                                if s[i] == ']': # Empty array
                                        return (i+1,res)
                                while True:
                                        i,val = parse(i)
                                        res.append(val)
                                        i = skipSpace(i) # Raise exception if premature end
                                        if s[i] == ']':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected a comma or closing bracket', i)
                                        i = skipSpace(i+1)
                        # Parse the literals true/false/null starting at index i.
                        def parseDiscrete(i):
                                for k,v in {'true': True, 'false': False, 'null': None}.items():
                                        if s.startswith(k, i):
                                                return (i+len(k), v)
                                raiseError('Not a boolean (or null)', i)
                        # Parse a JSON number; floats when '.'/'e'/'E' present, else ints.
                        def parseNumber(i):
                                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                                if mobj is None:
                                        raiseError('Not a number', i)
                                nums = mobj.group(1)
                                if '.' in nums or 'e' in nums or 'E' in nums:
                                        return (i+len(nums), float(nums))
                                return (i+len(nums), int(nums))
                        # Dispatch table keyed on the first character of a value;
                        # anything not listed is assumed to be a number.
                        CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
                        def parse(i):
                                i = skipSpace(i)
                                i,res = CHARMAP.get(s[i], parseNumber)(i)
                                i = skipSpace(i, False)
                                return (i,res)
                        i,res = parse(0)
                        # A valid document consumes the entire input.
                        if i < len(s):
                                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
                        return res
194
def preferredencoding():
        """Get preferred encoding.

        Returns the best encoding scheme for the system, based on
        locale.getpreferredencoding() and some further tweaks.
        """
        try:
                pref = locale.getpreferredencoding()
                # Verify the reported encoding is actually usable for encoding.
                u'TEST'.encode(pref)
        except:
                # Anything wrong with the locale machinery: fall back to UTF-8.
                pref = 'UTF-8'
        return pref
210
211
212 def htmlentity_transform(matchobj):
213         """Transforms an HTML entity to a Unicode character.
214
215         This function receives a match object and is intended to be used with
216         the re.sub() function.
217         """
218         entity = matchobj.group(1)
219
220         # Known non-numeric HTML entity
221         if entity in htmlentitydefs.name2codepoint:
222                 return unichr(htmlentitydefs.name2codepoint[entity])
223
224         # Unicode character
225         mobj = re.match(ur'(?u)#(x?\d+)', entity)
226         if mobj is not None:
227                 numstr = mobj.group(1)
228                 if numstr.startswith(u'x'):
229                         base = 16
230                         numstr = u'0%s' % numstr
231                 else:
232                         base = 10
233                 return unichr(long(numstr, base))
234
235         # Unknown entity in name, return its literal representation
236         return (u'&%s;' % entity)
237
238
239 def sanitize_title(utitle):
240         """Sanitizes a video title so it could be used as part of a filename."""
241         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
242         return utitle.replace(unicode(os.sep), u'%')
243
244
def sanitize_open(filename, open_mode):
        """Try to open the given filename, and slightly tweak it if this fails.

        Attempts to open the given filename. If this fails, it tries to change
        the filename slightly, step by step, until it's either able to open it
        or it fails and raises a final exception, like the standard open()
        function.

        It returns the tuple (stream, definitive_file_name).
        """
        try:
                if filename == u'-':
                        # '-' means standard output; on Windows, switch stdout
                        # to binary mode so video data is written untranslated.
                        if sys.platform == 'win32':
                                import msvcrt
                                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
                        return (sys.stdout, filename)
                return (open(filename, open_mode), filename)
        except (IOError, OSError):
                # In case of error, try to remove win32 forbidden chars
                filename = re.sub(u'[/<>:"\\|\\?\\*]', u'#', filename)

                # An exception here should be caught in the caller
                return (open(filename, open_mode), filename)
270
271
def timeconvert(timestr):
        """Convert RFC 2822 defined time string into system timestamp"""
        parsed = email.utils.parsedate_tz(timestr)
        if parsed is None:
                # Unparseable date: signal failure with None.
                return None
        return email.utils.mktime_tz(parsed)
279
280
class DownloadError(Exception):
        """Download Error exception.

        Raised by FileDownloader objects that are not configured to ignore
        errors when a download problem occurs; carries the relevant error
        message.
        """
        pass
289
290
class SameFileError(Exception):
        """Same File exception.

        Raised by FileDownloader objects when they detect that multiple
        videos would have to be written to the same file on disk.
        """
        pass
298
299
class PostProcessingError(Exception):
        """Post Processing exception.

        Raised by a PostProcessor's .run() method to indicate that the
        postprocessing task failed.
        """
        pass
307
308
class UnavailableVideoError(Exception):
        """Unavailable Format exception.

        Raised when a video is requested in a format that is not available
        for that video.
        """
        pass
316
317
class ContentTooShortError(Exception):
        """Content Too Short exception.

        Raised by FileDownloader objects when the amount of downloaded data
        is smaller than the size the server announced, which usually means
        the connection was interrupted.
        """
        # Both counters are expressed in bytes.
        downloaded = None
        expected = None

        def __init__(self, downloaded, expected):
                # Record how much actually arrived versus how much was announced.
                self.downloaded = downloaded
                self.expected = expected
332
333
class YoutubeDLHandler(urllib2.HTTPHandler):
        """Handler for HTTP requests and responses.

        This class, when installed with an OpenerDirector, automatically adds
        the standard headers to every HTTP request and handles gzipped and
        deflated responses from web servers. If compression is to be avoided in
        a particular request, the original request in the program code only has
        to include the HTTP header "Youtubedl-No-Compression", which will be
        removed before making the real request.

        Part of this code was copied from:

        http://techknack.net/python-urllib2-handlers/

        Andrew Rowls, the author of that code, agreed to release it to the
        public domain.
        """

        @staticmethod
        def deflate(data):
                # Some servers send a raw deflate stream, others a zlib-wrapped
                # one: try the raw form first, then fall back to the wrapped form.
                try:
                        return zlib.decompress(data, -zlib.MAX_WBITS)
                except zlib.error:
                        return zlib.decompress(data)

        @staticmethod
        def addinfourl_wrapper(stream, headers, url, code):
                # Newer urllib2 versions accept the response code directly
                # (detected via getcode()); emulate that on older versions by
                # setting the attribute afterwards.
                if hasattr(urllib2.addinfourl, 'getcode'):
                        return urllib2.addinfourl(stream, headers, url, code)
                ret = urllib2.addinfourl(stream, headers, url)
                ret.code = code
                return ret

        def http_request(self, req):
                # Force the standard headers, replacing any values already set.
                for h in std_headers:
                        if h in req.headers:
                                del req.headers[h]
                        req.add_header(h, std_headers[h])
                # Honour the compression opt-out pseudo-header, then strip it so
                # it is never sent over the wire. (urllib2 capitalizes header
                # names, hence the 'Youtubedl-no-compression' spelling here.)
                if 'Youtubedl-no-compression' in req.headers:
                        if 'Accept-encoding' in req.headers:
                                del req.headers['Accept-encoding']
                        del req.headers['Youtubedl-no-compression']
                return req

        def http_response(self, req, resp):
                # Transparently decompress the body, preserving the original
                # response's headers, URL, status code and message.
                old_resp = resp
                # gzip
                if resp.headers.get('Content-encoding', '') == 'gzip':
                        gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
                        resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
                        resp.msg = old_resp.msg
                # deflate
                if resp.headers.get('Content-encoding', '') == 'deflate':
                        gz = StringIO.StringIO(self.deflate(resp.read()))
                        resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
                        resp.msg = old_resp.msg
                return resp
391
392
393 class FileDownloader(object):
394         """File Downloader class.
395
396         File downloader objects are the ones responsible of downloading the
397         actual video file and writing it to disk if the user has requested
398         it, among some other tasks. In most cases there should be one per
399         program. As, given a video URL, the downloader doesn't know how to
400         extract all the needed information, task that InfoExtractors do, it
401         has to pass the URL to one of them.
402
403         For this, file downloader objects have a method that allows
404         InfoExtractors to be registered in a given order. When it is passed
405         a URL, the file downloader handles it to the first InfoExtractor it
406         finds that reports being able to handle it. The InfoExtractor extracts
407         all the information about the video or videos the URL refers to, and
408         asks the FileDownloader to process the video information, possibly
409         downloading the video.
410
411         File downloaders accept a lot of parameters. In order not to saturate
412         the object constructor with arguments, it receives a dictionary of
413         options instead. These options are available through the params
414         attribute for the InfoExtractors to use. The FileDownloader also
415         registers itself as the downloader in charge for the InfoExtractors
416         that are added to it, so this is a "mutual registration".
417
418         Available options:
419
420         username:         Username for authentication purposes.
421         password:         Password for authentication purposes.
422         usenetrc:         Use netrc for authentication instead.
423         quiet:            Do not print messages to stdout.
424         forceurl:         Force printing final URL.
425         forcetitle:       Force printing title.
426         forcethumbnail:   Force printing thumbnail URL.
427         forcedescription: Force printing description.
428         forcefilename:    Force printing final filename.
429         simulate:         Do not download the video files.
430         format:           Video format code.
431         format_limit:     Highest quality format to try.
432         outtmpl:          Template for output names.
433         ignoreerrors:     Do not stop on download errors.
434         ratelimit:        Download speed limit, in bytes/sec.
435         nooverwrites:     Prevent overwriting files.
436         retries:          Number of times to retry for HTTP error 5xx
437         continuedl:       Try to continue downloads if possible.
438         noprogress:       Do not print the progress bar.
439         playliststart:    Playlist item to start at.
440         playlistend:      Playlist item to end at.
441         logtostderr:      Log messages to stderr instead of stdout.
442         consoletitle:     Display progress in console window's titlebar.
443         nopart:           Do not use temporary .part files.
444         updatetime:       Use the Last-modified header to set output file timestamps.
445         writedescription: Write the video description to a .description file
446         writeinfojson:    Write the video description to a .info.json file
447         """
448
        # Class-level defaults; the real values are assigned in __init__.
        params = None                   # options dictionary (see class docstring)
        _ies = []                       # registered InfoExtractor objects
        _pps = []                       # registered PostProcessor chain
        _download_retcode = None        # 0 on success, set to 1 after an ignored error
        _num_downloads = None           # ordinal of the current download
        _screen_file = None             # message stream: sys.stdout or sys.stderr
455
456         def __init__(self, params):
457                 """Create a FileDownloader object with the given options."""
458                 self._ies = []
459                 self._pps = []
460                 self._download_retcode = 0
461                 self._num_downloads = 0
462                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
463                 self.params = params
464
465         @staticmethod
466         def format_bytes(bytes):
467                 if bytes is None:
468                         return 'N/A'
469                 if type(bytes) is str:
470                         bytes = float(bytes)
471                 if bytes == 0.0:
472                         exponent = 0
473                 else:
474                         exponent = long(math.log(bytes, 1024.0))
475                 suffix = 'bkMGTPEZY'[exponent]
476                 converted = float(bytes) / float(1024 ** exponent)
477                 return '%.2f%s' % (converted, suffix)
478
479         @staticmethod
480         def calc_percent(byte_counter, data_len):
481                 if data_len is None:
482                         return '---.-%'
483                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
484
485         @staticmethod
486         def calc_eta(start, now, total, current):
487                 if total is None:
488                         return '--:--'
489                 dif = now - start
490                 if current == 0 or dif < 0.001: # One millisecond
491                         return '--:--'
492                 rate = float(current) / dif
493                 eta = long((float(total) - float(current)) / rate)
494                 (eta_mins, eta_secs) = divmod(eta, 60)
495                 if eta_mins > 99:
496                         return '--:--'
497                 return '%02d:%02d' % (eta_mins, eta_secs)
498
499         @staticmethod
500         def calc_speed(start, now, bytes):
501                 dif = now - start
502                 if bytes == 0 or dif < 0.001: # One millisecond
503                         return '%10s' % '---b/s'
504                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
505
506         @staticmethod
507         def best_block_size(elapsed_time, bytes):
508                 new_min = max(bytes / 2.0, 1.0)
509                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
510                 if elapsed_time < 0.001:
511                         return long(new_max)
512                 rate = bytes / elapsed_time
513                 if rate > new_max:
514                         return long(new_max)
515                 if rate < new_min:
516                         return long(new_min)
517                 return long(rate)
518
519         @staticmethod
520         def parse_bytes(bytestr):
521                 """Parse a string indicating a byte quantity into a long integer."""
522                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
523                 if matchobj is None:
524                         return None
525                 number = float(matchobj.group(1))
526                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
527                 return long(round(number * multiplier))
528
529         def add_info_extractor(self, ie):
530                 """Add an InfoExtractor object to the end of the list."""
531                 self._ies.append(ie)
532                 ie.set_downloader(self)
533
534         def add_post_processor(self, pp):
535                 """Add a PostProcessor object to the end of the chain."""
536                 self._pps.append(pp)
537                 pp.set_downloader(self)
538
        def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
                """Print message to the screen stream (stdout or stderr) unless quiet.

                skip_eol suppresses the trailing newline (used by the in-place
                progress line); ignore_encoding_errors swallows encoding
                failures instead of propagating them.
                """
                try:
                        if not self.params.get('quiet', False):
                                # Index by boolean: False -> newline, True -> nothing.
                                terminator = [u'\n', u''][skip_eol]
                                # Trailing comma stops `print` from adding its own newline.
                                print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
                        self._screen_file.flush()
                except (UnicodeEncodeError), err:
                        if not ignore_encoding_errors:
                                raise
549
        def to_stderr(self, message):
                """Print message to stderr, encoded with the preferred encoding."""
                print >>sys.stderr, message.encode(preferredencoding())
553
        def to_cons_title(self, message):
                """Set console/terminal window title to message."""
                if not self.params.get('consoletitle', False):
                        return
                if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
                        # c_wchar_p() might not be necessary if `message` is
                        # already of type unicode()
                        ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
                elif 'TERM' in os.environ:
                        # Emit the xterm-style "set window title" escape sequence.
                        sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
564
565         def fixed_template(self):
566                 """Checks if the output template is fixed."""
567                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
568
569         def trouble(self, message=None):
570                 """Determine action to take when a download problem appears.
571
572                 Depending on if the downloader has been configured to ignore
573                 download errors or not, this method may throw an exception or
574                 not when errors are found, after printing the message.
575                 """
576                 if message is not None:
577                         self.to_stderr(message)
578                 if not self.params.get('ignoreerrors', False):
579                         raise DownloadError(message)
580                 self._download_retcode = 1
581
582         def slow_down(self, start_time, byte_counter):
583                 """Sleep if the download speed is over the rate limit."""
584                 rate_limit = self.params.get('ratelimit', None)
585                 if rate_limit is None or byte_counter == 0:
586                         return
587                 now = time.time()
588                 elapsed = now - start_time
589                 if elapsed <= 0.0:
590                         return
591                 speed = float(byte_counter) / elapsed
592                 if speed > rate_limit:
593                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
594
595         def temp_name(self, filename):
596                 """Returns a temporary filename for the given filename."""
597                 if self.params.get('nopart', False) or filename == u'-' or \
598                                 (os.path.exists(filename) and not os.path.isfile(filename)):
599                         return filename
600                 return filename + u'.part'
601
602         def undo_temp_name(self, filename):
603                 if filename.endswith(u'.part'):
604                         return filename[:-len(u'.part')]
605                 return filename
606
607         def try_rename(self, old_filename, new_filename):
608                 try:
609                         if old_filename == new_filename:
610                                 return
611                         os.rename(old_filename, new_filename)
612                 except (IOError, OSError), err:
613                         self.trouble(u'ERROR: unable to rename file')
614
615         def try_utime(self, filename, last_modified_hdr):
616                 """Try to set the last-modified time of the given file."""
617                 if last_modified_hdr is None:
618                         return
619                 if not os.path.isfile(filename):
620                         return
621                 timestr = last_modified_hdr
622                 if timestr is None:
623                         return
624                 filetime = timeconvert(timestr)
625                 if filetime is None:
626                         return
627                 try:
628                         os.utime(filename, (time.time(), filetime))
629                 except:
630                         pass
631
        def report_writedescription(self, descfn):
                """Report that the description file is being written."""
                # Encoding problems in the filename must not abort the download.
                self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
635
        def report_writeinfojson(self, infofn):
                """Report that the .info.json metadata file has been written."""
                self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
639
        def report_destination(self, filename):
                """Report destination filename."""
                # Encoding problems in the filename must not abort the download.
                self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
643
        def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
                """Report download progress on screen and in the console title."""
                if self.params.get('noprogress', False):
                        return
                # The leading '\r' plus skip_eol rewrites the same line in place.
                self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
                                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
                self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
                                (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
652
        def report_resuming_byte(self, resume_len):
                """Report attempt to resume the download at the given byte offset."""
                self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
656
        def report_retry(self, count, retries):
                """Report retry attempt `count` of `retries` after an HTTP 5xx error."""
                self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
660
        def report_file_already_downloaded(self, file_name):
                """Report file has already been fully downloaded."""
                try:
                        self.to_screen(u'[download] %s has already been downloaded' % file_name)
                except (UnicodeEncodeError), err:
                        # Fall back to a filename-free message when the name
                        # cannot be represented in the output encoding.
                        self.to_screen(u'[download] The file has already been downloaded')
667
        def report_unable_to_resume(self):
                """Report it was impossible to resume download."""
                self.to_screen(u'[download] Unable to resume')
671
        def report_finish(self):
                """Report download finished."""
                if self.params.get('noprogress', False):
                        self.to_screen(u'[download] Download completed')
                else:
                        # The progress bar owns the current line; just end it.
                        self.to_screen(u'')
678
679         def increment_downloads(self):
680                 """Increment the ordinal that assigns a number to each file."""
681                 self._num_downloads += 1
682
683         def prepare_filename(self, info_dict):
684                 """Generate the output filename."""
685                 try:
686                         template_dict = dict(info_dict)
687                         template_dict['epoch'] = unicode(long(time.time()))
688                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
689                         filename = self.params['outtmpl'] % template_dict
690                         return filename
691                 except (ValueError, KeyError), err:
692                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
693                         return None
694
	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor.

		Depending on self.params this either only prints the requested
		fields (simulate mode), or writes the optional description and
		JSON metadata side files, downloads the video data, and runs
		the postprocessing chain on the result.
		"""
		filename = self.prepare_filename(info_dict)
		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			# Forced printings
			if self.params.get('forcetitle', False):
				print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forceurl', False):
				print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
				print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcedescription', False) and 'description' in info_dict:
				print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcefilename', False) and filename is not None:
				print filename.encode(preferredencoding(), 'xmlcharrefreplace')

			return

		if filename is None:
			return
		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists and will be skipped')
			return

		# Create the target directory if needed.
		try:
			dn = os.path.dirname(filename)
			if dn != '' and not os.path.exists(dn):
				os.makedirs(dn)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to create directory ' + unicode(err))
			return

		if self.params.get('writedescription', False):
			try:
				descfn = filename + '.description'
				self.report_writedescription(descfn)
				descfile = open(descfn, 'wb')
				try:
					descfile.write(info_dict['description'].encode('utf-8'))
				finally:
					descfile.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write description file ' + descfn)
				return

		if self.params.get('writeinfojson', False):
			infofn = filename + '.info.json'
			self.report_writeinfojson(infofn)
			# Probe that a json module with a dump attribute is available:
			# 'json' may be undefined (NameError) or a stub (AttributeError).
			try:
				json.dump
			except (NameError,AttributeError):
				self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
				return
			try:
				infof = open(infofn, 'wb')
				try:
					json.dump(info_dict, infof)
				finally:
					infof.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
				return

		# Download the actual video data; local I/O errors are surfaced
		# as UnavailableVideoError to the caller.
		try:
			success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
		except (OSError, IOError), err:
			raise UnavailableVideoError
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self.trouble(u'ERROR: unable to download video data: %s' % str(err))
			return
		except (ContentTooShortError, ), err:
			self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
			return

		if success:
			try:
				self.post_process(filename, info_dict)
			except (PostProcessingError), err:
				self.trouble(u'ERROR: postprocessing: %s' % str(err))
				return
776
777         def download(self, url_list):
778                 """Download a given list of URLs."""
779                 if len(url_list) > 1 and self.fixed_template():
780                         raise SameFileError(self.params['outtmpl'])
781
782                 for url in url_list:
783                         suitable_found = False
784                         for ie in self._ies:
785                                 # Go to next InfoExtractor if not suitable
786                                 if not ie.suitable(url):
787                                         continue
788
789                                 # Suitable InfoExtractor found
790                                 suitable_found = True
791
792                                 # Extract information from URL and process it
793                                 ie.extract(url)
794
795                                 # Suitable InfoExtractor had been found; go to next URL
796                                 break
797
798                         if not suitable_found:
799                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
800
801                 return self._download_retcode
802
803         def post_process(self, filename, ie_info):
804                 """Run the postprocessing chain on the given file."""
805                 info = dict(ie_info)
806                 info['filepath'] = filename
807                 for pp in self._pps:
808                         info = pp.run(info)
809                         if info is None:
810                                 break
811
	def _download_with_rtmpdump(self, filename, url, player_url):
		"""Download an rtmp:// URL by driving the external rtmpdump tool.

		Returns True on success, False otherwise (missing rtmpdump,
		or rtmpdump exiting with a non-zero, non-resumable status).
		"""
		self.report_destination(filename)
		tmpfilename = self.temp_name(filename)

		# Check for rtmpdump first
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrupted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		# The [[], extra_args][condition] expression appends extra_args
		# only when the condition is true (pre-2.5 ternary substitute).
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(tmpfilename)
			self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			# Resume (-e); add '-k 1' only after an exit code of 1.
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(tmpfilename)
			# No progress between attempts and hard failure: give up.
			if prevsize == cursize and retval == 1:
				break
			# Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
			if prevsize == cursize and retval == 2 and cursize > 1024:
				self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
				retval = 0
				break
		if retval == 0:
			self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
			self.try_rename(tmpfilename, filename)
			return True
		else:
			self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
			return False
848
	def _do_download(self, filename, url, player_url):
		"""Download url into filename, resuming and retrying as configured.

		Returns True on success (including the already-downloaded case),
		False on unrecoverable errors; raises ContentTooShortError when
		fewer bytes than announced were received.
		"""
		# Check file already present
		if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
			self.report_file_already_downloaded(filename)
			return True

		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url, player_url)

		tmpfilename = self.temp_name(filename)
		stream = None	# output file is opened lazily, on the first data block
		open_mode = 'wb'

		# Do not include the Accept-Encoding header
		headers = {'Youtubedl-no-compression': 'True'}
		# basic_request stays Range-free so it can probe the full length on 416.
		basic_request = urllib2.Request(url, None, headers)
		request = urllib2.Request(url, None, headers)

		# Establish possible resume length
		if os.path.isfile(tmpfilename):
			resume_len = os.path.getsize(tmpfilename)
		else:
			resume_len = 0

		# Request parameters in case of being able to resume
		if self.params.get('continuedl', False) and resume_len != 0:
			self.report_resuming_byte(resume_len)
			request.add_header('Range', 'bytes=%d-' % resume_len)
			open_mode = 'ab'

		count = 0
		retries = self.params.get('retries', 0)
		while count <= retries:
			# Establish connection
			try:
				data = urllib2.urlopen(request)
				break
			except (urllib2.HTTPError, ), err:
				if (err.code < 500 or err.code >= 600) and err.code != 416:
					# Unexpected HTTP error
					raise
				elif err.code == 416:
					# Unable to resume (requested range not satisfiable)
					try:
						# Open the connection again without the range header
						data = urllib2.urlopen(basic_request)
						content_length = data.info()['Content-Length']
					except (urllib2.HTTPError, ), err:
						if err.code < 500 or err.code >= 600:
							raise
					else:
						# Examine the reported length
						if (content_length is not None and
								(resume_len - 100 < long(content_length) < resume_len + 100)):
							# The file had already been fully downloaded.
							# Explanation to the above condition: in issue #175 it was revealed that
							# YouTube sometimes adds or removes a few bytes from the end of the file,
							# changing the file size slightly and causing problems for some users. So
							# I decided to implement a suggested change and consider the file
							# completely downloaded if the file size differs less than 100 bytes from
							# the one in the hard drive.
							self.report_file_already_downloaded(filename)
							self.try_rename(tmpfilename, filename)
							return True
						else:
							# The length does not match, we start the download over
							self.report_unable_to_resume()
							open_mode = 'wb'
							break
			# Retry
			count += 1
			if count <= retries:
				self.report_retry(count, retries)

		if count > retries:
			self.trouble(u'ERROR: giving up after %s retries' % retries)
			return False

		# Total expected size includes the part already on disk.
		data_len = data.info().get('Content-length', None)
		if data_len is not None:
			data_len = long(data_len) + resume_len
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0 + resume_len
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			if len(data_block) == 0:
				break
			byte_counter += len(data_block)

			# Open file just in time
			if stream is None:
				try:
					(stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
					assert stream is not None
					filename = self.undo_temp_name(tmpfilename)
					self.report_destination(filename)
				except (OSError, IOError), err:
					self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
					return False
			try:
				stream.write(data_block)
			except (IOError, OSError), err:
				self.trouble(u'\nERROR: unable to write data: %s' % str(err))
				return False
			# Adapt the block size to the observed transfer speed.
			block_size = self.best_block_size(after - before, len(data_block))

			# Progress message
			percent_str = self.calc_percent(byte_counter, data_len)
			# NOTE(review): when the server sends no Content-Length, data_len is
			# None here and 'data_len - resume_len' would raise TypeError —
			# verify whether that case can actually be reached.
			eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
			speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
			self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter - resume_len)

		if stream is None:
			self.trouble(u'\nERROR: Did not get any data blocks')
			return False
		stream.close()
		self.report_finish()
		if data_len is not None and byte_counter != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		self.try_rename(tmpfilename, filename)

		# Update file modification time
		if self.params.get('updatetime', True):
			self.try_utime(filename, data.info().get('last-modified', None))

		return True
984
985
class InfoExtractor(object):
	"""Information Extractor class.

	An information extractor takes a URL and produces, for each video it
	refers to, a dictionary with the video's metadata. That dictionary is
	handed to the FileDownloader, which may download the video file and
	perform other actions. Every dictionary must carry these fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	The following fields are optional; they primarily allow youtube-dl
	to serve as the backend for a video search function, such as the one
	in youtube2mp3, and are only used when their respective forced
	printing functions are called:

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Subclasses should re-define _real_initialize(), _real_extract() and
	the suitable() static method, and will normally be instantiated and
	added to the main downloader.
	"""

	# True once _real_initialize() has run.
	_ready = False
	# The FileDownloader this extractor reports to (or None).
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	@staticmethod
	def suitable(url):
		"""Receives a URL and returns True if suitable for this IE."""
		return False

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def initialize(self):
		"""Initializes an instance (authentication, etc) exactly once."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
1056
1057
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Matches watch pages, youtu.be short links, /v/ /embed/ /e/ paths and
	# bare video IDs; group 2 captures the video identifier.
	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
	# URL used to force the interface language to English.
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	# Machine name looked up in the user's .netrc file for credentials.
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '45', '22', '43', '35', '34', '18', '6', '5', '17', '13']
	# Maps itag format codes to output file extensions.
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'45': 'webm',
	}
1078
1079         @staticmethod
1080         def suitable(url):
1081                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
1082
1083         def report_lang(self):
1084                 """Report attempt to set language."""
1085                 self._downloader.to_screen(u'[youtube] Setting language')
1086
1087         def report_login(self):
1088                 """Report attempt to log in."""
1089                 self._downloader.to_screen(u'[youtube] Logging in')
1090
1091         def report_age_confirmation(self):
1092                 """Report attempt to confirm age."""
1093                 self._downloader.to_screen(u'[youtube] Confirming age')
1094
1095         def report_video_webpage_download(self, video_id):
1096                 """Report attempt to download video webpage."""
1097                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1098
1099         def report_video_info_webpage_download(self, video_id):
1100                 """Report attempt to download video info webpage."""
1101                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1102
1103         def report_information_extraction(self, video_id):
1104                 """Report attempt to extract video information."""
1105                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1106
1107         def report_unavailable_format(self, video_id, format):
1108                 """Report extracted video URL."""
1109                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1110
1111         def report_rtmp_download(self):
1112                 """Indicate the download will use the RTMP protocol."""
1113                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1114
	def _real_initialize(self):
		"""Set the interface language, then optionally log in and confirm age.

		Credentials come from the downloader params or, failing that,
		from the user's .netrc. Failures before login are reported as
		warnings; a failed age confirmation is reported via trouble().
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':         '/',
				'action_login': 'Log In',
				'username':     username,
				'password':     password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# A login form still present in the response means the login failed.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':             '/',
				'action_confirm':       'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return
1183
1184         def _real_extract(self, url):
1185                 # Extract video id from URL
1186                 mobj = re.match(self._VALID_URL, url)
1187                 if mobj is None:
1188                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1189                         return
1190                 video_id = mobj.group(2)
1191
1192                 # Get video webpage
1193                 self.report_video_webpage_download(video_id)
1194                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id)
1195                 try:
1196                         video_webpage = urllib2.urlopen(request).read()
1197                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1198                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1199                         return
1200
1201                 # Attempt to extract SWF player URL
1202                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1203                 if mobj is not None:
1204                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1205                 else:
1206                         player_url = None
1207
1208                 # Get video info
1209                 self.report_video_info_webpage_download(video_id)
1210                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1211                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1212                                         % (video_id, el_type))
1213                         request = urllib2.Request(video_info_url)
1214                         try:
1215                                 video_info_webpage = urllib2.urlopen(request).read()
1216                                 video_info = parse_qs(video_info_webpage)
1217                                 if 'token' in video_info:
1218                                         break
1219                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1220                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1221                                 return
1222                 if 'token' not in video_info:
1223                         if 'reason' in video_info:
1224                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1225                         else:
1226                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1227                         return
1228
1229                 # Start extracting information
1230                 self.report_information_extraction(video_id)
1231
1232                 # uploader
1233                 if 'author' not in video_info:
1234                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1235                         return
1236                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1237
1238                 # title
1239                 if 'title' not in video_info:
1240                         self._downloader.trouble(u'ERROR: unable to extract video title')
1241                         return
1242                 video_title = urllib.unquote_plus(video_info['title'][0])
1243                 video_title = video_title.decode('utf-8')
1244                 video_title = sanitize_title(video_title)
1245
1246                 # simplified title
1247                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1248                 simple_title = simple_title.strip(ur'_')
1249
1250                 # thumbnail image
1251                 if 'thumbnail_url' not in video_info:
1252                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1253                         video_thumbnail = ''
1254                 else:   # don't panic if we can't find it
1255                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1256
1257                 # upload date
1258                 upload_date = u'NA'
1259                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1260                 if mobj is not None:
1261                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1262                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1263                         for expression in format_expressions:
1264                                 try:
1265                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1266                                 except:
1267                                         pass
1268
1269                 # description
1270                 try:
1271                         lxml.etree
1272                 except NameError:
1273                         video_description = u'No description available.'
1274                         if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1275                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1276                                 if mobj is not None:
1277                                         video_description = mobj.group(1).decode('utf-8')
1278                 else:
1279                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1280                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1281                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1282                         # TODO use another parser
1283
1284                 # token
1285                 video_token = urllib.unquote_plus(video_info['token'][0])
1286
1287                 # Decide which formats to download
1288                 req_format = self._downloader.params.get('format', None)
1289
1290                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1291                         self.report_rtmp_download()
1292                         video_url_list = [(None, video_info['conn'][0])]
1293                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1294                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1295                         url_data = [parse_qs(uds) for uds in url_data_strs]
1296                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1297                         url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1298
1299                         format_limit = self._downloader.params.get('format_limit', None)
1300                         if format_limit is not None and format_limit in self._available_formats:
1301                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1302                         else:
1303                                 format_list = self._available_formats
1304                         existing_formats = [x for x in format_list if x in url_map]
1305                         if len(existing_formats) == 0:
1306                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1307                                 return
1308                         if req_format is None:
1309                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1310                         elif req_format == '-1':
1311                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1312                         else:
1313                                 # Specific format
1314                                 if req_format not in url_map:
1315                                         self._downloader.trouble(u'ERROR: requested format not available')
1316                                         return
1317                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
1318                 else:
1319                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1320                         return
1321
1322                 for format_param, video_real_url in video_url_list:
1323                         # At this point we have a new video
1324                         self._downloader.increment_downloads()
1325
1326                         # Extension
1327                         video_extension = self._video_extensions.get(format_param, 'flv')
1328
1329                         try:
1330                                 # Process video information
1331                                 self._downloader.process_info({
1332                                         'id':           video_id.decode('utf-8'),
1333                                         'url':          video_real_url.decode('utf-8'),
1334                                         'uploader':     video_uploader.decode('utf-8'),
1335                                         'upload_date':  upload_date,
1336                                         'title':        video_title,
1337                                         'stitle':       simple_title,
1338                                         'ext':          video_extension.decode('utf-8'),
1339                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1340                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1341                                         'description':  video_description,
1342                                         'player_url':   player_url,
1343                                 })
1344                         except UnavailableVideoError, err:
1345                                 self._downloader.trouble(u'\nERROR: unable to download video')
1346
1347
1348 class MetacafeIE(InfoExtractor):
1349         """Information Extractor for metacafe.com."""
1350
1351         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1352         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1353         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1354         _youtube_ie = None
1355
1356         def __init__(self, youtube_ie, downloader=None):
1357                 InfoExtractor.__init__(self, downloader)
1358                 self._youtube_ie = youtube_ie
1359
1360         @staticmethod
1361         def suitable(url):
1362                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1363
1364         def report_disclaimer(self):
1365                 """Report disclaimer retrieval."""
1366                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1367
1368         def report_age_confirmation(self):
1369                 """Report attempt to confirm age."""
1370                 self._downloader.to_screen(u'[metacafe] Confirming age')
1371
1372         def report_download_webpage(self, video_id):
1373                 """Report webpage download."""
1374                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1375
1376         def report_extraction(self, video_id):
1377                 """Report information extraction."""
1378                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1379
1380         def _real_initialize(self):
1381                 # Retrieve disclaimer
1382                 request = urllib2.Request(self._DISCLAIMER)
1383                 try:
1384                         self.report_disclaimer()
1385                         disclaimer = urllib2.urlopen(request).read()
1386                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1387                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1388                         return
1389
1390                 # Confirm age
1391                 disclaimer_form = {
1392                         'filters': '0',
1393                         'submit': "Continue - I'm over 18",
1394                         }
1395                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1396                 try:
1397                         self.report_age_confirmation()
1398                         disclaimer = urllib2.urlopen(request).read()
1399                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1400                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1401                         return
1402
1403         def _real_extract(self, url):
1404                 # Extract id and simplified title from URL
1405                 mobj = re.match(self._VALID_URL, url)
1406                 if mobj is None:
1407                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1408                         return
1409
1410                 video_id = mobj.group(1)
1411
1412                 # Check if video comes from YouTube
1413                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1414                 if mobj2 is not None:
1415                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1416                         return
1417
1418                 # At this point we have a new video
1419                 self._downloader.increment_downloads()
1420
1421                 simple_title = mobj.group(2).decode('utf-8')
1422
1423                 # Retrieve video webpage to extract further information
1424                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1425                 try:
1426                         self.report_download_webpage(video_id)
1427                         webpage = urllib2.urlopen(request).read()
1428                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1429                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1430                         return
1431
1432                 # Extract URL, uploader and title from webpage
1433                 self.report_extraction(video_id)
1434                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1435                 if mobj is not None:
1436                         mediaURL = urllib.unquote(mobj.group(1))
1437                         video_extension = mediaURL[-3:]
1438
1439                         # Extract gdaKey if available
1440                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1441                         if mobj is None:
1442                                 video_url = mediaURL
1443                         else:
1444                                 gdaKey = mobj.group(1)
1445                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1446                 else:
1447                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1448                         if mobj is None:
1449                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1450                                 return
1451                         vardict = parse_qs(mobj.group(1))
1452                         if 'mediaData' not in vardict:
1453                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1454                                 return
1455                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1456                         if mobj is None:
1457                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1458                                 return
1459                         mediaURL = mobj.group(1).replace('\\/', '/')
1460                         video_extension = mediaURL[-3:]
1461                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1462
1463                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1464                 if mobj is None:
1465                         self._downloader.trouble(u'ERROR: unable to extract title')
1466                         return
1467                 video_title = mobj.group(1).decode('utf-8')
1468                 video_title = sanitize_title(video_title)
1469
1470                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1471                 if mobj is None:
1472                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1473                         return
1474                 video_uploader = mobj.group(1)
1475
1476                 try:
1477                         # Process video information
1478                         self._downloader.process_info({
1479                                 'id':           video_id.decode('utf-8'),
1480                                 'url':          video_url.decode('utf-8'),
1481                                 'uploader':     video_uploader.decode('utf-8'),
1482                                 'upload_date':  u'NA',
1483                                 'title':        video_title,
1484                                 'stitle':       simple_title,
1485                                 'ext':          video_extension.decode('utf-8'),
1486                                 'format':       u'NA',
1487                                 'player_url':   None,
1488                         })
1489                 except UnavailableVideoError:
1490                         self._downloader.trouble(u'\nERROR: unable to download video')
1491
1492
1493 class DailymotionIE(InfoExtractor):
1494         """Information Extractor for Dailymotion"""
1495
1496         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1497
1498         def __init__(self, downloader=None):
1499                 InfoExtractor.__init__(self, downloader)
1500
1501         @staticmethod
1502         def suitable(url):
1503                 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1504
1505         def report_download_webpage(self, video_id):
1506                 """Report webpage download."""
1507                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1508
1509         def report_extraction(self, video_id):
1510                 """Report information extraction."""
1511                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1512
1513         def _real_initialize(self):
1514                 return
1515
1516         def _real_extract(self, url):
1517                 # Extract id and simplified title from URL
1518                 mobj = re.match(self._VALID_URL, url)
1519                 if mobj is None:
1520                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1521                         return
1522
1523                 # At this point we have a new video
1524                 self._downloader.increment_downloads()
1525                 video_id = mobj.group(1)
1526
1527                 simple_title = mobj.group(2).decode('utf-8')
1528                 video_extension = 'flv'
1529
1530                 # Retrieve video webpage to extract further information
1531                 request = urllib2.Request(url)
1532                 request.add_header('Cookie', 'family_filter=off')
1533                 try:
1534                         self.report_download_webpage(video_id)
1535                         webpage = urllib2.urlopen(request).read()
1536                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1537                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1538                         return
1539
1540                 # Extract URL, uploader and title from webpage
1541                 self.report_extraction(video_id)
1542                 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1543                 if mobj is None:
1544                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1545                         return
1546                 sequence = urllib.unquote(mobj.group(1))
1547                 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1548                 if mobj is None:
1549                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1550                         return
1551                 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1552
1553                 # if needed add http://www.dailymotion.com/ if relative URL
1554
1555                 video_url = mediaURL
1556
1557                 mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
1558                 if mobj is None:
1559                         self._downloader.trouble(u'ERROR: unable to extract title')
1560                         return
1561                 video_title = mobj.group(1).decode('utf-8')
1562                 video_title = sanitize_title(video_title)
1563
1564                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1565                 if mobj is None:
1566                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1567                         return
1568                 video_uploader = mobj.group(1)
1569
1570                 try:
1571                         # Process video information
1572                         self._downloader.process_info({
1573                                 'id':           video_id.decode('utf-8'),
1574                                 'url':          video_url.decode('utf-8'),
1575                                 'uploader':     video_uploader.decode('utf-8'),
1576                                 'upload_date':  u'NA',
1577                                 'title':        video_title,
1578                                 'stitle':       simple_title,
1579                                 'ext':          video_extension.decode('utf-8'),
1580                                 'format':       u'NA',
1581                                 'player_url':   None,
1582                         })
1583                 except UnavailableVideoError:
1584                         self._downloader.trouble(u'\nERROR: unable to download video')
1585
1586
1587 class GoogleIE(InfoExtractor):
1588         """Information extractor for video.google.com."""
1589
1590         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1591
1592         def __init__(self, downloader=None):
1593                 InfoExtractor.__init__(self, downloader)
1594
1595         @staticmethod
1596         def suitable(url):
1597                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1598
1599         def report_download_webpage(self, video_id):
1600                 """Report webpage download."""
1601                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1602
1603         def report_extraction(self, video_id):
1604                 """Report information extraction."""
1605                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1606
1607         def _real_initialize(self):
1608                 return
1609
1610         def _real_extract(self, url):
1611                 # Extract id from URL
1612                 mobj = re.match(self._VALID_URL, url)
1613                 if mobj is None:
1614                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1615                         return
1616
1617                 # At this point we have a new video
1618                 self._downloader.increment_downloads()
1619                 video_id = mobj.group(1)
1620
1621                 video_extension = 'mp4'
1622
1623                 # Retrieve video webpage to extract further information
1624                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1625                 try:
1626                         self.report_download_webpage(video_id)
1627                         webpage = urllib2.urlopen(request).read()
1628                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1629                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1630                         return
1631
1632                 # Extract URL, uploader, and title from webpage
1633                 self.report_extraction(video_id)
1634                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1635                 if mobj is None:
1636                         video_extension = 'flv'
1637                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1638                 if mobj is None:
1639                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1640                         return
1641                 mediaURL = urllib.unquote(mobj.group(1))
1642                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1643                 mediaURL = mediaURL.replace('\\x26', '\x26')
1644
1645                 video_url = mediaURL
1646
1647                 mobj = re.search(r'<title>(.*)</title>', webpage)
1648                 if mobj is None:
1649                         self._downloader.trouble(u'ERROR: unable to extract title')
1650                         return
1651                 video_title = mobj.group(1).decode('utf-8')
1652                 video_title = sanitize_title(video_title)
1653                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1654
1655                 # Extract video description
1656                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1657                 if mobj is None:
1658                         self._downloader.trouble(u'ERROR: unable to extract video description')
1659                         return
1660                 video_description = mobj.group(1).decode('utf-8')
1661                 if not video_description:
1662                         video_description = 'No description available.'
1663
1664                 # Extract video thumbnail
1665                 if self._downloader.params.get('forcethumbnail', False):
1666                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1667                         try:
1668                                 webpage = urllib2.urlopen(request).read()
1669                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1670                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1671                                 return
1672                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1673                         if mobj is None:
1674                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1675                                 return
1676                         video_thumbnail = mobj.group(1)
1677                 else:   # we need something to pass to process_info
1678                         video_thumbnail = ''
1679
1680                 try:
1681                         # Process video information
1682                         self._downloader.process_info({
1683                                 'id':           video_id.decode('utf-8'),
1684                                 'url':          video_url.decode('utf-8'),
1685                                 'uploader':     u'NA',
1686                                 'upload_date':  u'NA',
1687                                 'title':        video_title,
1688                                 'stitle':       simple_title,
1689                                 'ext':          video_extension.decode('utf-8'),
1690                                 'format':       u'NA',
1691                                 'player_url':   None,
1692                         })
1693                 except UnavailableVideoError:
1694                         self._downloader.trouble(u'\nERROR: unable to download video')
1695
1696
1697 class PhotobucketIE(InfoExtractor):
1698         """Information extractor for photobucket.com."""
1699
1700         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1701
1702         def __init__(self, downloader=None):
1703                 InfoExtractor.__init__(self, downloader)
1704
1705         @staticmethod
1706         def suitable(url):
1707                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1708
1709         def report_download_webpage(self, video_id):
1710                 """Report webpage download."""
1711                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1712
1713         def report_extraction(self, video_id):
1714                 """Report information extraction."""
1715                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1716
1717         def _real_initialize(self):
1718                 return
1719
1720         def _real_extract(self, url):
1721                 # Extract id from URL
1722                 mobj = re.match(self._VALID_URL, url)
1723                 if mobj is None:
1724                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1725                         return
1726
1727                 # At this point we have a new video
1728                 self._downloader.increment_downloads()
1729                 video_id = mobj.group(1)
1730
1731                 video_extension = 'flv'
1732
1733                 # Retrieve video webpage to extract further information
1734                 request = urllib2.Request(url)
1735                 try:
1736                         self.report_download_webpage(video_id)
1737                         webpage = urllib2.urlopen(request).read()
1738                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1739                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1740                         return
1741
1742                 # Extract URL, uploader, and title from webpage
1743                 self.report_extraction(video_id)
1744                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1745                 if mobj is None:
1746                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1747                         return
1748                 mediaURL = urllib.unquote(mobj.group(1))
1749
1750                 video_url = mediaURL
1751
1752                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1753                 if mobj is None:
1754                         self._downloader.trouble(u'ERROR: unable to extract title')
1755                         return
1756                 video_title = mobj.group(1).decode('utf-8')
1757                 video_title = sanitize_title(video_title)
1758                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1759
1760                 video_uploader = mobj.group(2).decode('utf-8')
1761
1762                 try:
1763                         # Process video information
1764                         self._downloader.process_info({
1765                                 'id':           video_id.decode('utf-8'),
1766                                 'url':          video_url.decode('utf-8'),
1767                                 'uploader':     video_uploader,
1768                                 'upload_date':  u'NA',
1769                                 'title':        video_title,
1770                                 'stitle':       simple_title,
1771                                 'ext':          video_extension.decode('utf-8'),
1772                                 'format':       u'NA',
1773                                 'player_url':   None,
1774                         })
1775                 except UnavailableVideoError:
1776                         self._downloader.trouble(u'\nERROR: unable to download video')
1777
1778
1779 class YahooIE(InfoExtractor):
1780         """Information extractor for video.yahoo.com."""
1781
1782         # _VALID_URL matches all Yahoo! Video URLs
1783         # _VPAGE_URL matches only the extractable '/watch/' URLs
1784         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1785         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1786
1787         def __init__(self, downloader=None):
1788                 InfoExtractor.__init__(self, downloader)
1789
1790         @staticmethod
1791         def suitable(url):
1792                 return (re.match(YahooIE._VALID_URL, url) is not None)
1793
1794         def report_download_webpage(self, video_id):
1795                 """Report webpage download."""
1796                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1797
1798         def report_extraction(self, video_id):
1799                 """Report information extraction."""
1800                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1801
1802         def _real_initialize(self):
1803                 return
1804
1805         def _real_extract(self, url, new_video=True):
1806                 # Extract ID from URL
1807                 mobj = re.match(self._VALID_URL, url)
1808                 if mobj is None:
1809                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1810                         return
1811
1812                 # At this point we have a new video
1813                 self._downloader.increment_downloads()
1814                 video_id = mobj.group(2)
1815                 video_extension = 'flv'
1816
1817                 # Rewrite valid but non-extractable URLs as
1818                 # extractable English language /watch/ URLs
1819                 if re.match(self._VPAGE_URL, url) is None:
1820                         request = urllib2.Request(url)
1821                         try:
1822                                 webpage = urllib2.urlopen(request).read()
1823                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1824                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1825                                 return
1826
1827                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1828                         if mobj is None:
1829                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1830                                 return
1831                         yahoo_id = mobj.group(1)
1832
1833                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1834                         if mobj is None:
1835                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1836                                 return
1837                         yahoo_vid = mobj.group(1)
1838
1839                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1840                         return self._real_extract(url, new_video=False)
1841
1842                 # Retrieve video webpage to extract further information
1843                 request = urllib2.Request(url)
1844                 try:
1845                         self.report_download_webpage(video_id)
1846                         webpage = urllib2.urlopen(request).read()
1847                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1848                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1849                         return
1850
1851                 # Extract uploader and title from webpage
1852                 self.report_extraction(video_id)
1853                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1854                 if mobj is None:
1855                         self._downloader.trouble(u'ERROR: unable to extract video title')
1856                         return
1857                 video_title = mobj.group(1).decode('utf-8')
1858                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1859
1860                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1861                 if mobj is None:
1862                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1863                         return
1864                 video_uploader = mobj.group(1).decode('utf-8')
1865
1866                 # Extract video thumbnail
1867                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1868                 if mobj is None:
1869                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1870                         return
1871                 video_thumbnail = mobj.group(1).decode('utf-8')
1872
1873                 # Extract video description
1874                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1875                 if mobj is None:
1876                         self._downloader.trouble(u'ERROR: unable to extract video description')
1877                         return
1878                 video_description = mobj.group(1).decode('utf-8')
1879                 if not video_description:
1880                         video_description = 'No description available.'
1881
1882                 # Extract video height and width
1883                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1884                 if mobj is None:
1885                         self._downloader.trouble(u'ERROR: unable to extract video height')
1886                         return
1887                 yv_video_height = mobj.group(1)
1888
1889                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1890                 if mobj is None:
1891                         self._downloader.trouble(u'ERROR: unable to extract video width')
1892                         return
1893                 yv_video_width = mobj.group(1)
1894
1895                 # Retrieve video playlist to extract media URL
1896                 # I'm not completely sure what all these options are, but we
1897                 # seem to need most of them, otherwise the server sends a 401.
1898                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1899                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1900                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1901                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1902                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1903                 try:
1904                         self.report_download_webpage(video_id)
1905                         webpage = urllib2.urlopen(request).read()
1906                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1907                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1908                         return
1909
1910                 # Extract media URL from playlist XML
1911                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1912                 if mobj is None:
1913                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1914                         return
1915                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1916                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1917
1918                 try:
1919                         # Process video information
1920                         self._downloader.process_info({
1921                                 'id':           video_id.decode('utf-8'),
1922                                 'url':          video_url,
1923                                 'uploader':     video_uploader,
1924                                 'upload_date':  u'NA',
1925                                 'title':        video_title,
1926                                 'stitle':       simple_title,
1927                                 'ext':          video_extension.decode('utf-8'),
1928                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1929                                 'description':  video_description,
1930                                 'thumbnail':    video_thumbnail,
1931                                 'player_url':   None,
1932                         })
1933                 except UnavailableVideoError:
1934                         self._downloader.trouble(u'\nERROR: unable to download video')
1935
1936
1937 class VimeoIE(InfoExtractor):
1938         """Information extractor for vimeo.com."""
1939
1940         # _VALID_URL matches Vimeo URLs
1941         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1942
1943         def __init__(self, downloader=None):
1944                 InfoExtractor.__init__(self, downloader)
1945
1946         @staticmethod
1947         def suitable(url):
1948                 return (re.match(VimeoIE._VALID_URL, url) is not None)
1949
1950         def report_download_webpage(self, video_id):
1951                 """Report webpage download."""
1952                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1953
1954         def report_extraction(self, video_id):
1955                 """Report information extraction."""
1956                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1957
1958         def _real_initialize(self):
1959                 return
1960
1961         def _real_extract(self, url, new_video=True):
1962                 # Extract ID from URL
1963                 mobj = re.match(self._VALID_URL, url)
1964                 if mobj is None:
1965                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1966                         return
1967
1968                 # At this point we have a new video
1969                 self._downloader.increment_downloads()
1970                 video_id = mobj.group(1)
1971
1972                 # Retrieve video webpage to extract further information
1973                 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1974                 try:
1975                         self.report_download_webpage(video_id)
1976                         webpage = urllib2.urlopen(request).read()
1977                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1978                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1979                         return
1980
1981                 # Now we begin extracting as much information as we can from what we
1982                 # retrieved. First we extract the information common to all extractors,
1983                 # and latter we extract those that are Vimeo specific.
1984                 self.report_extraction(video_id)
1985
1986                 # Extract title
1987                 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
1988                 if mobj is None:
1989                         self._downloader.trouble(u'ERROR: unable to extract video title')
1990                         return
1991                 video_title = mobj.group(1).decode('utf-8')
1992                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1993
1994                 # Extract uploader
1995                 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
1996                 if mobj is None:
1997                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1998                         return
1999                 video_uploader = mobj.group(1).decode('utf-8')
2000
2001                 # Extract video thumbnail
2002                 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2003                 if mobj is None:
2004                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2005                         return
2006                 video_thumbnail = mobj.group(1).decode('utf-8')
2007
2008                 # # Extract video description
2009                 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2010                 # if mobj is None:
2011                 #       self._downloader.trouble(u'ERROR: unable to extract video description')
2012                 #       return
2013                 # video_description = mobj.group(1).decode('utf-8')
2014                 # if not video_description: video_description = 'No description available.'
2015                 video_description = 'Foo.'
2016
2017                 # Vimeo specific: extract request signature
2018                 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2019                 if mobj is None:
2020                         self._downloader.trouble(u'ERROR: unable to extract request signature')
2021                         return
2022                 sig = mobj.group(1).decode('utf-8')
2023
2024                 # Vimeo specific: Extract request signature expiration
2025                 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2026                 if mobj is None:
2027                         self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2028                         return
2029                 sig_exp = mobj.group(1).decode('utf-8')
2030
2031                 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2032
2033                 try:
2034                         # Process video information
2035                         self._downloader.process_info({
2036                                 'id':           video_id.decode('utf-8'),
2037                                 'url':          video_url,
2038                                 'uploader':     video_uploader,
2039                                 'upload_date':  u'NA',
2040                                 'title':        video_title,
2041                                 'stitle':       simple_title,
2042                                 'ext':          u'mp4',
2043                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2044                                 'description':  video_description,
2045                                 'thumbnail':    video_thumbnail,
2046                                 'description':  video_description,
2047                                 'player_url':   None,
2048                         })
2049                 except UnavailableVideoError:
2050                         self._downloader.trouble(u'ERROR: unable to download video')
2051
2052
2053 class GenericIE(InfoExtractor):
2054         """Generic last-resort information extractor."""
2055
2056         def __init__(self, downloader=None):
2057                 InfoExtractor.__init__(self, downloader)
2058
2059         @staticmethod
2060         def suitable(url):
2061                 return True
2062
2063         def report_download_webpage(self, video_id):
2064                 """Report webpage download."""
2065                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2066                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2067
2068         def report_extraction(self, video_id):
2069                 """Report information extraction."""
2070                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2071
2072         def _real_initialize(self):
2073                 return
2074
2075         def _real_extract(self, url):
2076                 # At this point we have a new video
2077                 self._downloader.increment_downloads()
2078
2079                 video_id = url.split('/')[-1]
2080                 request = urllib2.Request(url)
2081                 try:
2082                         self.report_download_webpage(video_id)
2083                         webpage = urllib2.urlopen(request).read()
2084                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2085                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2086                         return
2087                 except ValueError, err:
2088                         # since this is the last-resort InfoExtractor, if
2089                         # this error is thrown, it'll be thrown here
2090                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2091                         return
2092
2093                 self.report_extraction(video_id)
2094                 # Start with something easy: JW Player in SWFObject
2095                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2096                 if mobj is None:
2097                         # Broaden the search a little bit
2098                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2099                 if mobj is None:
2100                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2101                         return
2102
2103                 # It's possible that one of the regexes
2104                 # matched, but returned an empty group:
2105                 if mobj.group(1) is None:
2106                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2107                         return
2108
2109                 video_url = urllib.unquote(mobj.group(1))
2110                 video_id = os.path.basename(video_url)
2111
2112                 # here's a fun little line of code for you:
2113                 video_extension = os.path.splitext(video_id)[1][1:]
2114                 video_id = os.path.splitext(video_id)[0]
2115
2116                 # it's tempting to parse this further, but you would
2117                 # have to take into account all the variations like
2118                 #   Video Title - Site Name
2119                 #   Site Name | Video Title
2120                 #   Video Title - Tagline | Site Name
2121                 # and so on and so forth; it's just not practical
2122                 mobj = re.search(r'<title>(.*)</title>', webpage)
2123                 if mobj is None:
2124                         self._downloader.trouble(u'ERROR: unable to extract title')
2125                         return
2126                 video_title = mobj.group(1).decode('utf-8')
2127                 video_title = sanitize_title(video_title)
2128                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2129
2130                 # video uploader is domain name
2131                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2132                 if mobj is None:
2133                         self._downloader.trouble(u'ERROR: unable to extract title')
2134                         return
2135                 video_uploader = mobj.group(1).decode('utf-8')
2136
2137                 try:
2138                         # Process video information
2139                         self._downloader.process_info({
2140                                 'id':           video_id.decode('utf-8'),
2141                                 'url':          video_url.decode('utf-8'),
2142                                 'uploader':     video_uploader,
2143                                 'upload_date':  u'NA',
2144                                 'title':        video_title,
2145                                 'stitle':       simple_title,
2146                                 'ext':          video_extension.decode('utf-8'),
2147                                 'format':       u'NA',
2148                                 'player_url':   None,
2149                         })
2150                 except UnavailableVideoError, err:
2151                         self._downloader.trouble(u'\nERROR: unable to download video')
2152
2153
2154 class YoutubeSearchIE(InfoExtractor):
2155         """Information Extractor for YouTube search queries."""
2156         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
2157         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2158         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2159         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2160         _youtube_ie = None
2161         _max_youtube_results = 1000
2162
2163         def __init__(self, youtube_ie, downloader=None):
2164                 InfoExtractor.__init__(self, downloader)
2165                 self._youtube_ie = youtube_ie
2166
2167         @staticmethod
2168         def suitable(url):
2169                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
2170
2171         def report_download_page(self, query, pagenum):
2172                 """Report attempt to download playlist page with given number."""
2173                 query = query.decode(preferredencoding())
2174                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2175
2176         def _real_initialize(self):
2177                 self._youtube_ie.initialize()
2178
2179         def _real_extract(self, query):
2180                 mobj = re.match(self._VALID_QUERY, query)
2181                 if mobj is None:
2182                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2183                         return
2184
2185                 prefix, query = query.split(':')
2186                 prefix = prefix[8:]
2187                 query = query.encode('utf-8')
2188                 if prefix == '':
2189                         self._download_n_results(query, 1)
2190                         return
2191                 elif prefix == 'all':
2192                         self._download_n_results(query, self._max_youtube_results)
2193                         return
2194                 else:
2195                         try:
2196                                 n = long(prefix)
2197                                 if n <= 0:
2198                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2199                                         return
2200                                 elif n > self._max_youtube_results:
2201                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2202                                         n = self._max_youtube_results
2203                                 self._download_n_results(query, n)
2204                                 return
2205                         except ValueError: # parsing prefix as integer fails
2206                                 self._download_n_results(query, 1)
2207                                 return
2208
2209         def _download_n_results(self, query, n):
2210                 """Downloads a specified number of results for a query"""
2211
2212                 video_ids = []
2213                 already_seen = set()
2214                 pagenum = 1
2215
2216                 while True:
2217                         self.report_download_page(query, pagenum)
2218                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2219                         request = urllib2.Request(result_url)
2220                         try:
2221                                 page = urllib2.urlopen(request).read()
2222                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2223                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2224                                 return
2225
2226                         # Extract video identifiers
2227                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2228                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2229                                 if video_id not in already_seen:
2230                                         video_ids.append(video_id)
2231                                         already_seen.add(video_id)
2232                                         if len(video_ids) == n:
2233                                                 # Specified n videos reached
2234                                                 for id in video_ids:
2235                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2236                                                 return
2237
2238                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2239                                 for id in video_ids:
2240                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2241                                 return
2242
2243                         pagenum = pagenum + 1
2244
2245
2246 class GoogleSearchIE(InfoExtractor):
2247         """Information Extractor for Google Video search queries."""
2248         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
2249         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2250         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2251         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2252         _google_ie = None
2253         _max_google_results = 1000
2254
2255         def __init__(self, google_ie, downloader=None):
2256                 InfoExtractor.__init__(self, downloader)
2257                 self._google_ie = google_ie
2258
2259         @staticmethod
2260         def suitable(url):
2261                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
2262
2263         def report_download_page(self, query, pagenum):
2264                 """Report attempt to download playlist page with given number."""
2265                 query = query.decode(preferredencoding())
2266                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2267
2268         def _real_initialize(self):
2269                 self._google_ie.initialize()
2270
2271         def _real_extract(self, query):
2272                 mobj = re.match(self._VALID_QUERY, query)
2273                 if mobj is None:
2274                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2275                         return
2276
2277                 prefix, query = query.split(':')
2278                 prefix = prefix[8:]
2279                 query = query.encode('utf-8')
2280                 if prefix == '':
2281                         self._download_n_results(query, 1)
2282                         return
2283                 elif prefix == 'all':
2284                         self._download_n_results(query, self._max_google_results)
2285                         return
2286                 else:
2287                         try:
2288                                 n = long(prefix)
2289                                 if n <= 0:
2290                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2291                                         return
2292                                 elif n > self._max_google_results:
2293                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2294                                         n = self._max_google_results
2295                                 self._download_n_results(query, n)
2296                                 return
2297                         except ValueError: # parsing prefix as integer fails
2298                                 self._download_n_results(query, 1)
2299                                 return
2300
2301         def _download_n_results(self, query, n):
2302                 """Downloads a specified number of results for a query"""
2303
2304                 video_ids = []
2305                 already_seen = set()
2306                 pagenum = 1
2307
2308                 while True:
2309                         self.report_download_page(query, pagenum)
2310                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2311                         request = urllib2.Request(result_url)
2312                         try:
2313                                 page = urllib2.urlopen(request).read()
2314                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2315                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2316                                 return
2317
2318                         # Extract video identifiers
2319                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2320                                 video_id = mobj.group(1)
2321                                 if video_id not in already_seen:
2322                                         video_ids.append(video_id)
2323                                         already_seen.add(video_id)
2324                                         if len(video_ids) == n:
2325                                                 # Specified n videos reached
2326                                                 for id in video_ids:
2327                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2328                                                 return
2329
2330                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2331                                 for id in video_ids:
2332                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2333                                 return
2334
2335                         pagenum = pagenum + 1
2336
2337
2338 class YahooSearchIE(InfoExtractor):
2339         """Information Extractor for Yahoo! Video search queries."""
2340         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
2341         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2342         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2343         _MORE_PAGES_INDICATOR = r'\s*Next'
2344         _yahoo_ie = None
2345         _max_yahoo_results = 1000
2346
2347         def __init__(self, yahoo_ie, downloader=None):
2348                 InfoExtractor.__init__(self, downloader)
2349                 self._yahoo_ie = yahoo_ie
2350
2351         @staticmethod
2352         def suitable(url):
2353                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2354
2355         def report_download_page(self, query, pagenum):
2356                 """Report attempt to download playlist page with given number."""
2357                 query = query.decode(preferredencoding())
2358                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2359
2360         def _real_initialize(self):
2361                 self._yahoo_ie.initialize()
2362
2363         def _real_extract(self, query):
2364                 mobj = re.match(self._VALID_QUERY, query)
2365                 if mobj is None:
2366                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2367                         return
2368
2369                 prefix, query = query.split(':')
2370                 prefix = prefix[8:]
2371                 query = query.encode('utf-8')
2372                 if prefix == '':
2373                         self._download_n_results(query, 1)
2374                         return
2375                 elif prefix == 'all':
2376                         self._download_n_results(query, self._max_yahoo_results)
2377                         return
2378                 else:
2379                         try:
2380                                 n = long(prefix)
2381                                 if n <= 0:
2382                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2383                                         return
2384                                 elif n > self._max_yahoo_results:
2385                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2386                                         n = self._max_yahoo_results
2387                                 self._download_n_results(query, n)
2388                                 return
2389                         except ValueError: # parsing prefix as integer fails
2390                                 self._download_n_results(query, 1)
2391                                 return
2392
2393         def _download_n_results(self, query, n):
2394                 """Downloads a specified number of results for a query"""
2395
2396                 video_ids = []
2397                 already_seen = set()
2398                 pagenum = 1
2399
2400                 while True:
2401                         self.report_download_page(query, pagenum)
2402                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2403                         request = urllib2.Request(result_url)
2404                         try:
2405                                 page = urllib2.urlopen(request).read()
2406                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2407                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2408                                 return
2409
2410                         # Extract video identifiers
2411                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2412                                 video_id = mobj.group(1)
2413                                 if video_id not in already_seen:
2414                                         video_ids.append(video_id)
2415                                         already_seen.add(video_id)
2416                                         if len(video_ids) == n:
2417                                                 # Specified n videos reached
2418                                                 for id in video_ids:
2419                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2420                                                 return
2421
2422                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2423                                 for id in video_ids:
2424                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2425                                 return
2426
2427                         pagenum = pagenum + 1
2428
2429
2430 class YoutubePlaylistIE(InfoExtractor):
2431         """Information Extractor for YouTube playlists."""
2432
2433         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2434         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2435         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2436         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2437         _youtube_ie = None
2438
2439         def __init__(self, youtube_ie, downloader=None):
2440                 InfoExtractor.__init__(self, downloader)
2441                 self._youtube_ie = youtube_ie
2442
2443         @staticmethod
2444         def suitable(url):
2445                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2446
2447         def report_download_page(self, playlist_id, pagenum):
2448                 """Report attempt to download playlist page with given number."""
2449                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2450
2451         def _real_initialize(self):
2452                 self._youtube_ie.initialize()
2453
2454         def _real_extract(self, url):
2455                 # Extract playlist id
2456                 mobj = re.match(self._VALID_URL, url)
2457                 if mobj is None:
2458                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2459                         return
2460
2461                 # Single video case
2462                 if mobj.group(3) is not None:
2463                         self._youtube_ie.extract(mobj.group(3))
2464                         return
2465
2466                 # Download playlist pages
2467                 # prefix is 'p' as default for playlists but there are other types that need extra care
2468                 playlist_prefix = mobj.group(1)
2469                 if playlist_prefix == 'a':
2470                         playlist_access = 'artist'
2471                 else:
2472                         playlist_prefix = 'p'
2473                         playlist_access = 'view_play_list'
2474                 playlist_id = mobj.group(2)
2475                 video_ids = []
2476                 pagenum = 1
2477
2478                 while True:
2479                         self.report_download_page(playlist_id, pagenum)
2480                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2481                         try:
2482                                 page = urllib2.urlopen(request).read()
2483                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2484                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2485                                 return
2486
2487                         # Extract video identifiers
2488                         ids_in_page = []
2489                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2490                                 if mobj.group(1) not in ids_in_page:
2491                                         ids_in_page.append(mobj.group(1))
2492                         video_ids.extend(ids_in_page)
2493
2494                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2495                                 break
2496                         pagenum = pagenum + 1
2497
2498                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2499                 playlistend = self._downloader.params.get('playlistend', -1)
2500                 video_ids = video_ids[playliststart:playlistend]
2501
2502                 for id in video_ids:
2503                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2504                 return
2505
2506
2507 class YoutubeUserIE(InfoExtractor):
2508         """Information Extractor for YouTube users."""
2509
2510         _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2511         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2512         _GDATA_PAGE_SIZE = 50
2513         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2514         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2515         _youtube_ie = None
2516
2517         def __init__(self, youtube_ie, downloader=None):
2518                 InfoExtractor.__init__(self, downloader)
2519                 self._youtube_ie = youtube_ie
2520
2521         @staticmethod
2522         def suitable(url):
2523                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2524
2525         def report_download_page(self, username, start_index):
2526                 """Report attempt to download user page."""
2527                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2528                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2529
2530         def _real_initialize(self):
2531                 self._youtube_ie.initialize()
2532
2533         def _real_extract(self, url):
2534                 # Extract username
2535                 mobj = re.match(self._VALID_URL, url)
2536                 if mobj is None:
2537                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2538                         return
2539
2540                 username = mobj.group(1)
2541
2542                 # Download video ids using YouTube Data API. Result size per
2543                 # query is limited (currently to 50 videos) so we need to query
2544                 # page by page until there are no video ids - it means we got
2545                 # all of them.
2546
2547                 video_ids = []
2548                 pagenum = 0
2549
2550                 while True:
2551                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2552                         self.report_download_page(username, start_index)
2553
2554                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2555
2556                         try:
2557                                 page = urllib2.urlopen(request).read()
2558                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2559                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2560                                 return
2561
2562                         # Extract video identifiers
2563                         ids_in_page = []
2564
2565                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2566                                 if mobj.group(1) not in ids_in_page:
2567                                         ids_in_page.append(mobj.group(1))
2568
2569                         video_ids.extend(ids_in_page)
2570
2571                         # A little optimization - if current page is not
2572                         # "full", ie. does not contain PAGE_SIZE video ids then
2573                         # we can assume that this page is the last one - there
2574                         # are no more ids on further pages - no need to query
2575                         # again.
2576
2577                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2578                                 break
2579
2580                         pagenum += 1
2581
2582                 all_ids_count = len(video_ids)
2583                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2584                 playlistend = self._downloader.params.get('playlistend', -1)
2585
2586                 if playlistend == -1:
2587                         video_ids = video_ids[playliststart:]
2588                 else:
2589                         video_ids = video_ids[playliststart:playlistend]
2590
2591                 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2592                                 (username, all_ids_count, len(video_ids)))
2593
2594                 for video_id in video_ids:
2595                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2596
2597
2598 class DepositFilesIE(InfoExtractor):
2599         """Information extractor for depositfiles.com"""
2600
2601         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2602
2603         def __init__(self, downloader=None):
2604                 InfoExtractor.__init__(self, downloader)
2605
2606         @staticmethod
2607         def suitable(url):
2608                 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2609
2610         def report_download_webpage(self, file_id):
2611                 """Report webpage download."""
2612                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2613
2614         def report_extraction(self, file_id):
2615                 """Report information extraction."""
2616                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2617
2618         def _real_initialize(self):
2619                 return
2620
2621         def _real_extract(self, url):
2622                 # At this point we have a new file
2623                 self._downloader.increment_downloads()
2624
2625                 file_id = url.split('/')[-1]
2626                 # Rebuild url in english locale
2627                 url = 'http://depositfiles.com/en/files/' + file_id
2628
2629                 # Retrieve file webpage with 'Free download' button pressed
2630                 free_download_indication = { 'gateway_result' : '1' }
2631                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2632                 try:
2633                         self.report_download_webpage(file_id)
2634                         webpage = urllib2.urlopen(request).read()
2635                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2636                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2637                         return
2638
2639                 # Search for the real file URL
2640                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2641                 if (mobj is None) or (mobj.group(1) is None):
2642                         # Try to figure out reason of the error.
2643                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2644                         if (mobj is not None) and (mobj.group(1) is not None):
2645                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2646                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2647                         else:
2648                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2649                         return
2650
2651                 file_url = mobj.group(1)
2652                 file_extension = os.path.splitext(file_url)[1][1:]
2653
2654                 # Search for file title
2655                 mobj = re.search(r'<b title="(.*?)">', webpage)
2656                 if mobj is None:
2657                         self._downloader.trouble(u'ERROR: unable to extract title')
2658                         return
2659                 file_title = mobj.group(1).decode('utf-8')
2660
2661                 try:
2662                         # Process file information
2663                         self._downloader.process_info({
2664                                 'id':           file_id.decode('utf-8'),
2665                                 'url':          file_url.decode('utf-8'),
2666                                 'uploader':     u'NA',
2667                                 'upload_date':  u'NA',
2668                                 'title':        file_title,
2669                                 'stitle':       file_title,
2670                                 'ext':          file_extension.decode('utf-8'),
2671                                 'format':       u'NA',
2672                                 'player_url':   None,
2673                         })
2674                 except UnavailableVideoError, err:
2675                         self._downloader.trouble(u'ERROR: unable to download file')
2676
2677
2678 class FacebookIE(InfoExtractor):
2679         """Information Extractor for Facebook"""
2680
2681         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2682         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2683         _NETRC_MACHINE = 'facebook'
2684         _available_formats = ['highqual', 'lowqual']
2685         _video_extensions = {
2686                 'highqual': 'mp4',
2687                 'lowqual': 'mp4',
2688         }
2689
2690         def __init__(self, downloader=None):
2691                 InfoExtractor.__init__(self, downloader)
2692
2693         @staticmethod
2694         def suitable(url):
2695                 return (re.match(FacebookIE._VALID_URL, url) is not None)
2696
2697         def _reporter(self, message):
2698                 """Add header and report message."""
2699                 self._downloader.to_screen(u'[facebook] %s' % message)
2700
2701         def report_login(self):
2702                 """Report attempt to log in."""
2703                 self._reporter(u'Logging in')
2704
2705         def report_video_webpage_download(self, video_id):
2706                 """Report attempt to download video webpage."""
2707                 self._reporter(u'%s: Downloading video webpage' % video_id)
2708
2709         def report_information_extraction(self, video_id):
2710                 """Report attempt to extract video information."""
2711                 self._reporter(u'%s: Extracting video information' % video_id)
2712
2713         def _parse_page(self, video_webpage):
2714                 """Extract video information from page"""
2715                 # General data
2716                 data = {'title': r'class="video_title datawrap">(.*?)</',
2717                         'description': r'<div class="datawrap">(.*?)</div>',
2718                         'owner': r'\("video_owner_name", "(.*?)"\)',
2719                         'upload_date': r'data-date="(.*?)"',
2720                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2721                         }
2722                 video_info = {}
2723                 for piece in data.keys():
2724                         mobj = re.search(data[piece], video_webpage)
2725                         if mobj is not None:
2726                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2727
2728                 # Video urls
2729                 video_urls = {}
2730                 for fmt in self._available_formats:
2731                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2732                         if mobj is not None:
2733                                 # URL is in a Javascript segment inside an escaped Unicode format within
2734                                 # the generally utf-8 page
2735                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2736                 video_info['video_urls'] = video_urls
2737
2738                 return video_info
2739
2740         def _real_initialize(self):
2741                 if self._downloader is None:
2742                         return
2743
2744                 useremail = None
2745                 password = None
2746                 downloader_params = self._downloader.params
2747
2748                 # Attempt to use provided username and password or .netrc data
2749                 if downloader_params.get('username', None) is not None:
2750                         useremail = downloader_params['username']
2751                         password = downloader_params['password']
2752                 elif downloader_params.get('usenetrc', False):
2753                         try:
2754                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2755                                 if info is not None:
2756                                         useremail = info[0]
2757                                         password = info[2]
2758                                 else:
2759                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2760                         except (IOError, netrc.NetrcParseError), err:
2761                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2762                                 return
2763
2764                 if useremail is None:
2765                         return
2766
2767                 # Log in
2768                 login_form = {
2769                         'email': useremail,
2770                         'pass': password,
2771                         'login': 'Log+In'
2772                         }
2773                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2774                 try:
2775                         self.report_login()
2776                         login_results = urllib2.urlopen(request).read()
2777                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2778                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2779                                 return
2780                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2781                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2782                         return
2783
2784         def _real_extract(self, url):
2785                 mobj = re.match(self._VALID_URL, url)
2786                 if mobj is None:
2787                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2788                         return
2789                 video_id = mobj.group('ID')
2790
2791                 # Get video webpage
2792                 self.report_video_webpage_download(video_id)
2793                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2794                 try:
2795                         page = urllib2.urlopen(request)
2796                         video_webpage = page.read()
2797                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2798                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2799                         return
2800
2801                 # Start extracting information
2802                 self.report_information_extraction(video_id)
2803
2804                 # Extract information
2805                 video_info = self._parse_page(video_webpage)
2806
2807                 # uploader
2808                 if 'owner' not in video_info:
2809                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2810                         return
2811                 video_uploader = video_info['owner']
2812
2813                 # title
2814                 if 'title' not in video_info:
2815                         self._downloader.trouble(u'ERROR: unable to extract video title')
2816                         return
2817                 video_title = video_info['title']
2818                 video_title = video_title.decode('utf-8')
2819                 video_title = sanitize_title(video_title)
2820
2821                 # simplified title
2822                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2823                 simple_title = simple_title.strip(ur'_')
2824
2825                 # thumbnail image
2826                 if 'thumbnail' not in video_info:
2827                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2828                         video_thumbnail = ''
2829                 else:
2830                         video_thumbnail = video_info['thumbnail']
2831
2832                 # upload date
2833                 upload_date = u'NA'
2834                 if 'upload_date' in video_info:
2835                         upload_time = video_info['upload_date']
2836                         timetuple = email.utils.parsedate_tz(upload_time)
2837                         if timetuple is not None:
2838                                 try:
2839                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2840                                 except:
2841                                         pass
2842
2843                 # description
2844                 video_description = video_info.get('description', 'No description available.')
2845
2846                 url_map = video_info['video_urls']
2847                 if len(url_map.keys()) > 0:
2848                         # Decide which formats to download
2849                         req_format = self._downloader.params.get('format', None)
2850                         format_limit = self._downloader.params.get('format_limit', None)
2851
2852                         if format_limit is not None and format_limit in self._available_formats:
2853                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2854                         else:
2855                                 format_list = self._available_formats
2856                         existing_formats = [x for x in format_list if x in url_map]
2857                         if len(existing_formats) == 0:
2858                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2859                                 return
2860                         if req_format is None:
2861                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2862                         elif req_format == '-1':
2863                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2864                         else:
2865                                 # Specific format
2866                                 if req_format not in url_map:
2867                                         self._downloader.trouble(u'ERROR: requested format not available')
2868                                         return
2869                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2870
2871                 for format_param, video_real_url in video_url_list:
2872
2873                         # At this point we have a new video
2874                         self._downloader.increment_downloads()
2875
2876                         # Extension
2877                         video_extension = self._video_extensions.get(format_param, 'mp4')
2878
2879                         try:
2880                                 # Process video information
2881                                 self._downloader.process_info({
2882                                         'id':           video_id.decode('utf-8'),
2883                                         'url':          video_real_url.decode('utf-8'),
2884                                         'uploader':     video_uploader.decode('utf-8'),
2885                                         'upload_date':  upload_date,
2886                                         'title':        video_title,
2887                                         'stitle':       simple_title,
2888                                         'ext':          video_extension.decode('utf-8'),
2889                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2890                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2891                                         'description':  video_description.decode('utf-8'),
2892                                         'player_url':   None,
2893                                 })
2894                         except UnavailableVideoError, err:
2895                                 self._downloader.trouble(u'\nERROR: unable to download video')
2896
class BlipTVIE(InfoExtractor):
	"""Information extractor for blip.tv

	Appends skin=json to the original URL so blip.tv answers with a JSON
	description of the post, then builds the download info from that.
	NOTE(review): relies on a module-level `json` name that is not among the
	imports visible at the top of the file -- confirm it is provided
	elsewhere in this file.
	"""

	# Matches any blip.tv path; group(1) is the path part (used for logging).
	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
	# Pulls the filename extension off the end of the media URL.
	_URL_EXT = r'^.*\.([a-z0-9]+)$'

	@staticmethod
	def suitable(url):
		"""Return True if this extractor can handle the given URL."""
		return (re.match(BlipTVIE._VALID_URL, url) is not None)

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)

	def _simplify_title(self, title):
		"""Reduce title to filesystem-safe characters (like the global
		simplified-title convention used by the other extractors)."""
		res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
		res = res.strip(ur'_')
		return res

	def _real_extract(self, url):
		"""Fetch the JSON description of the post and process the video."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Ask for the JSON representation of the page, preserving any
		# query string already present in the URL.
		if '?' in url:
			cchar = '&'
		else:
			cchar = '?'
		json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
		request = urllib2.Request(json_url)
		self.report_extraction(mobj.group(1))
		try:
			json_code = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
			return
		try:
			json_data = json.loads(json_code)
			# Single posts are wrapped in a 'Post' object; playlists-like
			# responses carry the fields at the top level.
			if 'Post' in json_data:
				data = json_data['Post']
			else:
				data = json_data

			# blip.tv timestamps look like '08-31-11 05:42PM'.
			upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
			video_url = data['media']['url']
			umobj = re.match(self._URL_EXT, video_url)
			if umobj is None:
				raise ValueError('Can not determine filename extension')
			ext = umobj.group(1)

			self._downloader.increment_downloads()

			info = {
				'id': data['item_id'],
				'url': video_url,
				'uploader': data['display_name'],
				'upload_date': upload_date,
				'title': data['title'],
				'stitle': self._simplify_title(data['title']),
				'ext': ext,
				'format': data['media']['mimeType'],
				'thumbnail': data['thumbnailUrl'],
				'description': data['description'],
				'player_url': data['embedUrl']
			}
		except (ValueError,KeyError), err:
			# Covers both malformed JSON values and missing fields.
			self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
			return

		try:
			self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download video')
2971
2972
2973 class MyVideoIE(InfoExtractor):
2974         """Information Extractor for myvideo.de."""
2975
2976         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2977
2978         def __init__(self, downloader=None):
2979                 InfoExtractor.__init__(self, downloader)
2980         
2981         @staticmethod
2982         def suitable(url):
2983                 return (re.match(MyVideoIE._VALID_URL, url) is not None)
2984
2985         def report_download_webpage(self, video_id):
2986                 """Report webpage download."""
2987                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2988
2989         def report_extraction(self, video_id):
2990                 """Report information extraction."""
2991                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2992
2993         def _real_initialize(self):
2994                 return
2995
2996         def _real_extract(self,url):
2997                 mobj = re.match(self._VALID_URL, url)
2998                 if mobj is None:
2999                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
3000                         return
3001
3002                 video_id = mobj.group(1)
3003                 simple_title = mobj.group(2).decode('utf-8')
3004                 # should actually not be necessary
3005                 simple_title = sanitize_title(simple_title)
3006                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title)
3007
3008                 # Get video webpage
3009                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3010                 try:
3011                         self.report_download_webpage(video_id)
3012                         webpage = urllib2.urlopen(request).read()
3013                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3014                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3015                         return
3016
3017                 self.report_extraction(video_id)
3018                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3019                                  webpage)
3020                 if mobj is None:
3021                         self._downloader.trouble(u'ERROR: unable to extract media URL')
3022                         return
3023                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3024
3025                 mobj = re.search('<title>([^<]+)</title>', webpage)
3026                 if mobj is None:
3027                         self._downloader.trouble(u'ERROR: unable to extract title')
3028                         return
3029
3030                 video_title = mobj.group(1)
3031                 video_title = sanitize_title(video_title)
3032
3033                 try:
3034                         print(video_url)
3035                         self._downloader.process_info({
3036                                 'id':           video_id,
3037                                 'url':          video_url,
3038                                 'uploader':     u'NA',
3039                                 'upload_date':  u'NA',
3040                                 'title':        video_title,
3041                                 'stitle':       simple_title,
3042                                 'ext':          u'flv',
3043                                 'format':       u'NA',
3044                                 'player_url':   None,
3045                         })
3046                 except UnavailableVideoError:
3047                         self._downloader.trouble(u'\nERROR: Unable to download video')
3048
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Accepts shorthand forms (":tds", ":thedailyshow", ":cr", ":colbert",
	# ":colbertnation", ":colbertreport") as well as full-episode URLs on
	# thedailyshow.com / colbertnation.com.
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'

	@staticmethod
	def suitable(url):
		# True when this extractor should handle the given URL.
		return (re.match(ComedyCentralIE._VALID_URL, url) is not None)

	def report_extraction(self, episode_id):
		"""Report that information extraction has started."""
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

	def report_config_download(self, episode_id):
		"""Report that the per-media configuration XML is being downloaded."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		"""Report that the episode's RSS index is being downloaded."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		"""Report that the player URL is being resolved."""
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _simplify_title(self, title):
		# Collapse every run of characters outside simple_title_chars into
		# a single underscore so the title is safe for use in file names.
		res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
		res = res.strip(ur'_')
		return res

	def _real_extract(self, url):
		"""Resolve *url* to one or more video segments and queue each one.

		Shorthand URLs are first expanded to the show's full-episodes
		page; without a specific episode the server redirect is followed
		to obtain the newest one.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		if mobj.group('shortname'):
			# Expand a shorthand name to the show's index page and
			# re-match so the named groups below are populated.
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = 'http://www.thedailyshow.com/full-episodes/'
			else:
				url = 'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# No explicit episode means "download the newest one"; the index
		# page redirects to it, so the final URL is re-parsed below.
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			return
		if dlNewest:
			# Pick up the episode URL the server redirected us to.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		# The Flash <param> embeds both the player URL and the
		# mtvnservices media URI (captured as the inner group).
		mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
			return

		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			# Follow redirects to obtain the canonical player URL, which
			# rtmpdump needs for the SWF verification.
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
			return

		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
			return

		# The RSS index contains one <item> per segment of the episode.
		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
						urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
				return

			# Collect (bitrate, url) pairs for each available rendition.
			cdoc = xml.etree.ElementTree.fromstring(configXml)
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			# For now, just pick the highest bitrate
			format,video_url = turls[-1]

			self._downloader.increment_downloads()

			effTitle = showId + '-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'stitle': self._simplify_title(effTitle),
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': playerUrl
			}

			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
				continue
3191
3192
3193 class EscapistIE(InfoExtractor):
3194         """Information extractor for The Escapist """
3195
3196         _VALID_URL = r'^(https?://)?(www\.)escapistmagazine.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?].*$'
3197
3198         @staticmethod
3199         def suitable(url):
3200                 return (re.match(EscapistIE._VALID_URL, url) is not None)
3201
3202         def report_extraction(self, showName):
3203                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3204
3205         def report_config_download(self, showName):
3206                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3207
3208         def _simplify_title(self, title):
3209                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3210                 res = res.strip(ur'_')
3211                 return res
3212
3213         def _real_extract(self, url):
3214                 htmlParser = HTMLParser.HTMLParser()
3215
3216                 mobj = re.match(self._VALID_URL, url)
3217                 if mobj is None:
3218                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3219                         return
3220                 showName = mobj.group('showname')
3221                 videoId = mobj.group('episode')
3222
3223                 self.report_extraction(showName)
3224                 try:
3225                         webPage = urllib2.urlopen(url).read()
3226                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3227                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3228                         return
3229
3230                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3231                 description = htmlParser.unescape(descMatch.group(1))
3232                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3233                 imgUrl = htmlParser.unescape(imgMatch.group(1))
3234                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3235                 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3236                 configUrlMatch = re.search('config=(.*)$', playerUrl)
3237                 configUrl = urllib2.unquote(configUrlMatch.group(1))
3238
3239                 self.report_config_download(showName)
3240                 try:
3241                         configJSON = urllib2.urlopen(configUrl).read()
3242                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3243                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3244                         return
3245
3246                 # Technically, it's JavaScript, not JSON
3247                 configJSON = configJSON.replace("'", '"')
3248
3249                 try:
3250                         config = json.loads(configJSON)
3251                 except (ValueError,), err:
3252                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3253                         return
3254
3255                 playlist = config['playlist']
3256                 videoUrl = playlist[1]['url']
3257
3258                 self._downloader.increment_downloads()
3259                 info = {
3260                         'id': videoId,
3261                         'url': videoUrl,
3262                         'uploader': showName,
3263                         'upload_date': None,
3264                         'title': showName,
3265                         'stitle': self._simplify_title(showName),
3266                         'ext': 'flv',
3267                         'format': 'flv',
3268                         'thumbnail': imgUrl,
3269                         'description': description,
3270                         'player_url': playerUrl,
3271                 }
3272
3273                 try:
3274                         self._downloader.process_info(info)
3275                 except UnavailableVideoError, err:
3276                         self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3277
3278
3279
class PostProcessor(object):
	"""Base class for postprocessing steps.

	Instances are attached to a downloader via its add_post_processor()
	method. After a successful download, the downloader walks its chain
	of PostProcessors, calling run() on each one and threading the
	returned information dictionary from one processor to the next.

	A processor that returns None stops the chain; otherwise processing
	continues until the chain is exhausted.

	Like InfoExtractor objects, PostProcessors take part in a "mutual
	registration" handshake with their downloader.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach this postprocessor to the given downloader."""
		self._downloader = downloader

	def run(self, information):
		"""Execute this postprocessing step.

		The *information* argument is an InfoExtractor-style dictionary
		augmented with a "filepath" key that names the freshly
		downloaded file.

		Returning None aborts the postprocessing chain. Returning an
		information dictionary (possibly the received one with some
		fields changed) passes it on to the next processor. A
		PostProcessingError may be raised to signal failure to the
		calling downloader.
		"""
		return information # by default, do nothing
3325
3326
class FFmpegExtractAudioPP(PostProcessor):
	"""Postprocessor that extracts the audio track from a video file.

	Requires the ffmpeg and ffprobe binaries to be available on PATH.
	"""

	def __init__(self, downloader=None, preferredcodec=None):
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		# One of 'best', 'aac' or 'mp3'.
		self._preferredcodec = preferredcodec

	@staticmethod
	def get_audio_codec(path):
		"""Return the audio codec name of the file at *path*, or None.

		Runs ffprobe and scans its stream listing; returns None when
		ffprobe is missing, fails, or reports no audio stream.
		"""
		try:
			cmd = ['ffprobe', '-show_streams', '--', path]
			# Fixed: the devnull handle used to be opened inline and
			# leaked; close it once ffprobe has finished.
			devnull = open(os.path.devnull, 'w')
			try:
				handle = subprocess.Popen(cmd, stderr=devnull, stdout=subprocess.PIPE)
				output = handle.communicate()[0]
				if handle.wait() != 0:
					return None
			finally:
				devnull.close()
		except (IOError, OSError):
			return None
		# codec_name appears before codec_type within a stream section,
		# so remember the last codec seen and report it for the first
		# audio stream.
		audio_codec = None
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Transcode *path* to *out_path* with ffmpeg; return success."""
		try:
			cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
			# Fixed: close the devnull handle instead of leaking it.
			devnull = open(os.path.devnull, 'w')
			try:
				ret = subprocess.call(cmd, stdout=devnull, stderr=subprocess.STDOUT)
			finally:
				devnull.close()
			return (ret == 0)
		except (IOError, OSError):
			return False

	def run(self, information):
		"""Extract audio from information['filepath'] and delete the video.

		Returns the updated information dictionary, or None on failure
		(which stops the postprocessing chain).
		"""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
			if filecodec == 'aac' or filecodec == 'mp3':
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					more_opts = ['-f', 'adts']
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = ['-ab', '128k']
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = ['-ab', '128k']
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']

		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		status = self.run_ffmpeg(path, new_path, acodec, more_opts)

		if not status:
			self._downloader.to_stderr(u'WARNING: error running ffmpeg')
			return None

		try:
			os.remove(path)
		except (IOError, OSError):
			self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
			return None

		information['filepath'] = new_path
		return information
3408
3409
def updateSelf(downloader, filename):
	''' Update the program file with the latest version from the repository '''
	# Note: downloader only used for options
	if not os.access(filename, os.W_OK):
		sys.exit('ERROR: no write permissions on %s' % filename)

	downloader.to_screen('Updating to latest version...')

	try:
		# Fixed: urlh was assigned inside the try whose finally closed
		# it, so a failure in urlopen() raised NameError on the unbound
		# urlh in the finally clause, masking the real error.
		urlh = urllib.urlopen(UPDATE_URL)
		try:
			newcontent = urlh.read()
		finally:
			urlh.close()
	except (IOError, OSError):
		sys.exit('ERROR: unable to download latest version')

	try:
		outf = open(filename, 'wb')
		try:
			outf.write(newcontent)
		finally:
			outf.close()
	except (IOError, OSError):
		sys.exit('ERROR: unable to overwrite current version')

	downloader.to_screen('Updated youtube-dl. Restart to use the new version.')
3437
def parseOpts():
	"""Build the command-line option parser.

	Returns a (parser, opts, args) tuple where opts/args are the result
	of parsing sys.argv.
	"""
	# Deferred import: optparse is only needed when parsing options.
	import optparse

	def _format_option_string(option):
		''' ('-o', '--option') -> -o, --format METAVAR'''

		opts = []

		if option._short_opts: opts.append(option._short_opts[0])
		if option._long_opts: opts.append(option._long_opts[0])
		if len(opts) > 1: opts.insert(1, ', ')

		if option.takes_value(): opts.append(' %s' % option.metavar)

		return "".join(opts)

	def _find_term_columns():
		# Best-effort terminal width detection; returns None if unknown.
		columns = os.environ.get('COLUMNS', None)
		if columns:
			try:
				return int(columns)
			except ValueError:
				# Fixed: a non-numeric COLUMNS value used to crash here.
				pass

		try:
			sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			out,err = sp.communicate()
			return int(out.split()[1])
		except (IOError, OSError, ValueError, IndexError):
			# Fixed: this was a bare except, which also swallowed
			# SystemExit and KeyboardInterrupt.
			pass
		return None

	max_width = 80
	max_help_position = 80

	# No need to wrap help messages if we're on a wide console
	columns = _find_term_columns()
	if columns: max_width = columns

	fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
	fmt.format_option_strings = _format_option_string

	kw = {
		'version'   : __version__,
		'formatter' : fmt,
		'usage' : '%prog [options] url [url...]',
		'conflict_handler' : 'resolve',
	}

	parser = optparse.OptionParser(**kw)

	# option groups
	general        = optparse.OptionGroup(parser, 'General Options')
	authentication = optparse.OptionGroup(parser, 'Authentication Options')
	video_format   = optparse.OptionGroup(parser, 'Video Format Options')
	postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
	filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
	verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

	general.add_option('-h', '--help',
			action='help', help='print this help text and exit')
	general.add_option('-v', '--version',
			action='version', help='print program version and exit')
	general.add_option('-U', '--update',
			action='store_true', dest='update_self', help='update this program to latest version')
	general.add_option('-i', '--ignore-errors',
			action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
	general.add_option('-r', '--rate-limit',
			dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
	general.add_option('-R', '--retries',
			dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
	general.add_option('--playlist-start',
			dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
	general.add_option('--playlist-end',
			dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
	general.add_option('--dump-user-agent',
			action='store_true', dest='dump_user_agent',
			help='display the current browser identification', default=False)

	authentication.add_option('-u', '--username',
			dest='username', metavar='USERNAME', help='account username')
	authentication.add_option('-p', '--password',
			dest='password', metavar='PASSWORD', help='account password')
	authentication.add_option('-n', '--netrc',
			action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)


	video_format.add_option('-f', '--format',
			action='store', dest='format', metavar='FORMAT', help='video format code')
	video_format.add_option('--all-formats',
			action='store_const', dest='format', help='download all available video formats', const='-1')
	video_format.add_option('--max-quality',
			action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')


	verbosity.add_option('-q', '--quiet',
			action='store_true', dest='quiet', help='activates quiet mode', default=False)
	verbosity.add_option('-s', '--simulate',
			action='store_true', dest='simulate', help='do not download video', default=False)
	verbosity.add_option('-g', '--get-url',
			action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
	verbosity.add_option('-e', '--get-title',
			action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
	verbosity.add_option('--get-thumbnail',
			action='store_true', dest='getthumbnail',
			help='simulate, quiet but print thumbnail URL', default=False)
	verbosity.add_option('--get-description',
			action='store_true', dest='getdescription',
			help='simulate, quiet but print video description', default=False)
	verbosity.add_option('--get-filename',
			action='store_true', dest='getfilename',
			help='simulate, quiet but print output filename', default=False)
	verbosity.add_option('--no-progress',
			action='store_true', dest='noprogress', help='do not print progress bar', default=False)
	verbosity.add_option('--console-title',
			action='store_true', dest='consoletitle',
			help='display progress in console titlebar', default=False)


	filesystem.add_option('-t', '--title',
			action='store_true', dest='usetitle', help='use title in file name', default=False)
	filesystem.add_option('-l', '--literal',
			action='store_true', dest='useliteral', help='use literal title in file name', default=False)
	filesystem.add_option('-A', '--auto-number',
			action='store_true', dest='autonumber',
			help='number downloaded files starting from 00000', default=False)
	filesystem.add_option('-o', '--output',
			dest='outtmpl', metavar='TEMPLATE', help='output filename template')
	filesystem.add_option('-a', '--batch-file',
			dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
	filesystem.add_option('-w', '--no-overwrites',
			action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
	filesystem.add_option('-c', '--continue',
			action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
	filesystem.add_option('--cookies',
			dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
	filesystem.add_option('--no-part',
			action='store_true', dest='nopart', help='do not use .part files', default=False)
	filesystem.add_option('--no-mtime',
			action='store_false', dest='updatetime',
			help='do not use the Last-modified header to set the file modification time', default=True)
	filesystem.add_option('--write-description',
			action='store_true', dest='writedescription',
			help='write video description to a .description file', default=False)
	filesystem.add_option('--write-info-json',
			action='store_true', dest='writeinfojson',
			help='write video metadata to a .info.json file', default=False)


	postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
			help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
	postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
			help='"best", "aac" or "mp3"; best by default')


	parser.add_option_group(general)
	parser.add_option_group(filesystem)
	parser.add_option_group(verbosity)
	parser.add_option_group(video_format)
	parser.add_option_group(authentication)
	parser.add_option_group(postproc)

	opts, args = parser.parse_args()

	return parser, opts, args
3602
3603 def main():
3604         parser, opts, args = parseOpts()
3605
3606         # Open appropriate CookieJar
3607         if opts.cookiefile is None:
3608                 jar = cookielib.CookieJar()
3609         else:
3610                 try:
3611                         jar = cookielib.MozillaCookieJar(opts.cookiefile)
3612                         if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3613                                 jar.load()
3614                 except (IOError, OSError), err:
3615                         sys.exit(u'ERROR: unable to open cookie file')
3616
3617         # Dump user agent
3618         if opts.dump_user_agent:
3619                 print std_headers['User-Agent']
3620                 sys.exit(0)
3621
3622         # General configuration
3623         cookie_processor = urllib2.HTTPCookieProcessor(jar)
3624         opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
3625         urllib2.install_opener(opener)
3626         socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3627
3628         # Batch file verification
3629         batchurls = []
3630         if opts.batchfile is not None:
3631                 try:
3632                         if opts.batchfile == '-':
3633                                 batchfd = sys.stdin
3634                         else:
3635                                 batchfd = open(opts.batchfile, 'r')
3636                         batchurls = batchfd.readlines()
3637                         batchurls = [x.strip() for x in batchurls]
3638                         batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3639                 except IOError:
3640                         sys.exit(u'ERROR: batch file could not be read')
3641         all_urls = batchurls + args
3642
3643         # Conflicting, missing and erroneous options
3644         if opts.usenetrc and (opts.username is not None or opts.password is not None):
3645                 parser.error(u'using .netrc conflicts with giving username/password')
3646         if opts.password is not None and opts.username is None:
3647                 parser.error(u'account username missing')
3648         if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3649                 parser.error(u'using output template conflicts with using title, literal title or auto number')
3650         if opts.usetitle and opts.useliteral:
3651                 parser.error(u'using title conflicts with using literal title')
3652         if opts.username is not None and opts.password is None:
3653                 opts.password = getpass.getpass(u'Type account password and press return:')
3654         if opts.ratelimit is not None:
3655                 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3656                 if numeric_limit is None:
3657                         parser.error(u'invalid rate limit specified')
3658                 opts.ratelimit = numeric_limit
3659         if opts.retries is not None:
3660                 try:
3661                         opts.retries = long(opts.retries)
3662                 except (TypeError, ValueError), err:
3663                         parser.error(u'invalid retry count specified')
3664         try:
3665                 opts.playliststart = int(opts.playliststart)
3666                 if opts.playliststart <= 0:
3667                         raise ValueError(u'Playlist start must be positive')
3668         except (TypeError, ValueError), err:
3669                 parser.error(u'invalid playlist start number specified')
3670         try:
3671                 opts.playlistend = int(opts.playlistend)
3672                 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3673                         raise ValueError(u'Playlist end must be greater than playlist start')
3674         except (TypeError, ValueError), err:
3675                 parser.error(u'invalid playlist end number specified')
3676         if opts.extractaudio:
3677                 if opts.audioformat not in ['best', 'aac', 'mp3']:
3678                         parser.error(u'invalid audio format specified')
3679
3680         # Information extractors
3681         youtube_ie = YoutubeIE()
3682         google_ie = GoogleIE()
3683         yahoo_ie = YahooIE()
3684         extractors = [ # Order does matter
3685                 youtube_ie,
3686                 MetacafeIE(youtube_ie),
3687                 DailymotionIE(),
3688                 YoutubePlaylistIE(youtube_ie),
3689                 YoutubeUserIE(youtube_ie),
3690                 YoutubeSearchIE(youtube_ie),
3691                 google_ie,
3692                 GoogleSearchIE(google_ie),
3693                 PhotobucketIE(),
3694                 yahoo_ie,
3695                 YahooSearchIE(yahoo_ie),
3696                 DepositFilesIE(),
3697                 FacebookIE(),
3698                 BlipTVIE(),
3699                 VimeoIE(),
3700                 MyVideoIE(),
3701                 ComedyCentralIE(),
3702                 EscapistIE(),
3703
3704                 GenericIE()
3705         ]
3706
3707         # File downloader
3708         fd = FileDownloader({
3709                 'usenetrc': opts.usenetrc,
3710                 'username': opts.username,
3711                 'password': opts.password,
3712                 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3713                 'forceurl': opts.geturl,
3714                 'forcetitle': opts.gettitle,
3715                 'forcethumbnail': opts.getthumbnail,
3716                 'forcedescription': opts.getdescription,
3717                 'forcefilename': opts.getfilename,
3718                 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3719                 'format': opts.format,
3720                 'format_limit': opts.format_limit,
3721                 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3722                         or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3723                         or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3724                         or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3725                         or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3726                         or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3727                         or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3728                         or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3729                         or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3730                         or u'%(id)s.%(ext)s'),
3731                 'ignoreerrors': opts.ignoreerrors,
3732                 'ratelimit': opts.ratelimit,
3733                 'nooverwrites': opts.nooverwrites,
3734                 'retries': opts.retries,
3735                 'continuedl': opts.continue_dl,
3736                 'noprogress': opts.noprogress,
3737                 'playliststart': opts.playliststart,
3738                 'playlistend': opts.playlistend,
3739                 'logtostderr': opts.outtmpl == '-',
3740                 'consoletitle': opts.consoletitle,
3741                 'nopart': opts.nopart,
3742                 'updatetime': opts.updatetime,
3743                 'writedescription': opts.writedescription,
3744                 'writeinfojson': opts.writeinfojson,
3745                 })
3746         for extractor in extractors:
3747                 fd.add_info_extractor(extractor)
3748
3749         # PostProcessors
3750         if opts.extractaudio:
3751                 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
3752
3753         # Update version
3754         if opts.update_self:
3755                 updateSelf(fd, sys.argv[0])
3756
3757         # Maybe do nothing
3758         if len(all_urls) < 1:
3759                 if not opts.update_self:
3760                         parser.error(u'you must provide at least one URL')
3761                 else:
3762                         sys.exit()
3763         retcode = fd.download(all_urls)
3764
3765         # Dump cookie jar if requested
3766         if opts.cookiefile is not None:
3767                 try:
3768                         jar.save()
3769                 except (IOError, OSError), err:
3770                         sys.exit(u'ERROR: unable to save cookie jar')
3771
3772         sys.exit(retcode)
3773
3774
# Script entry guard: run main() and translate the project's top-level
# exceptions into process exit codes / messages.
if __name__ == '__main__':
	try:
		main()
	except DownloadError:
		# Exit with a plain failure status; no extra message here —
		# presumably the error was already reported downstream (TODO confirm).
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		# Ctrl-C: leading newline keeps the message off the progress line.
		sys.exit(u'\nERROR: Interrupted by user')
3784
3785 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: