Merge remote-tracking branch 'ngokevin/soundcloud'
[youtube-dl.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 __author__  = (
5         'Ricardo Garcia Gonzalez',
6         'Danny Colligan',
7         'Benjamin Johnson',
8         'Vasyl\' Vavrychuk',
9         'Witold Baryluk',
10         'Paweł Paprota',
11         'Gergely Imreh',
12         'Rogério Brito',
13         'Philipp Hagemeister',
14         'Sören Schulze',
15         )
16
17 __license__ = 'Public Domain'
18 __version__ = '2011.10.19'
19
20 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
21
22 import cookielib
23 import datetime
24 import gzip
25 import htmlentitydefs
26 import HTMLParser
27 import httplib
28 import locale
29 import math
30 import netrc
31 import os
32 import os.path
33 import re
34 import socket
35 import string
36 import subprocess
37 import sys
38 import time
39 import urllib
40 import urllib2
41 import warnings
42 import zlib
43
44 if os.name == 'nt':
45         import ctypes
46
47 try:
48         import email.utils
49 except ImportError: # Python 2.4
50         import email.Utils
51 try:
52         import cStringIO as StringIO
53 except ImportError:
54         import StringIO
55
56 # parse_qs was moved from the cgi module to the urlparse module recently.
57 try:
58         from urlparse import parse_qs
59 except ImportError:
60         from cgi import parse_qs
61
62 try:
63         import lxml.etree
64 except ImportError:
65         pass # Handled below
66
67 try:
68         import xml.etree.ElementTree
69 except ImportError: # Python<2.5: Not officially supported, but let it slip
70         warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
71
72 std_headers = {
73         'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
74         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
75         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
76         'Accept-Encoding': 'gzip, deflate',
77         'Accept-Language': 'en-us,en;q=0.5',
78 }
79
80 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
81
# The json module only exists from Python 2.6 onwards; on older
# interpreters fall back to a minimal pure-Python parser (trivialjson).
try:
	import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
	import re
	class json(object):
		# Drop-in stand-in exposing only the json.loads() entry point.
		@staticmethod
		def loads(s):
			"""Parse the UTF-8 encoded JSON string s and return the Python value."""
			s = s.decode('UTF-8')
			def raiseError(msg, i):
				# Report a parse error with context around position i.
				raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
			def skipSpace(i, expectMore=True):
				# Advance past whitespace; optionally demand more input follows.
				while i < len(s) and s[i] in ' \t\r\n':
					i += 1
				if expectMore:
					if i >= len(s):
						raiseError('Premature end', i)
				return i
			def decodeEscape(match):
				# Translate one backslash escape (captured without the backslash).
				esc = match.group(1)
				_STATIC = {
					'"': '"',
					'\\': '\\',
					'/': '/',
					'b': unichr(0x8),
					'f': unichr(0xc),
					'n': '\n',
					'r': '\r',
					't': '\t',
				}
				if esc in _STATIC:
					return _STATIC[esc]
				if esc[0] == 'u':
					if len(esc) == 1+4:
						# Plain \uXXXX escape.
						return unichr(int(esc[1:5], 16))
					if len(esc) == 5+6 and esc[5:7] == '\\u':
						# UTF-16 surrogate pair \uD8xx\uDCxx combined into one code point.
						hi = int(esc[1:5], 16)
						low = int(esc[7:11], 16)
						return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
				raise ValueError('Unknown escape ' + str(esc))
			def parseString(i):
				# Parse a string literal; s[i] is the opening quote.
				i += 1
				e = i
				while True:
					e = s.index('"', e)
					# Count backslashes immediately before the quote: an odd
					# number means the quote itself is escaped.
					bslashes = 0
					while s[e-bslashes-1] == '\\':
						bslashes += 1
					if bslashes % 2 == 1:
						e += 1
						continue
					break
				# Try surrogate pairs first, then single \uXXXX, then any
				# one-character escape ('$' catches a trailing lone backslash).
				rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
				stri = rexp.sub(decodeEscape, s[i:e])
				return (e+1,stri)
			def parseObj(i):
				# Parse an object literal; s[i] is the opening '{'.
				i += 1
				res = {}
				i = skipSpace(i)
				if s[i] == '}': # Empty dictionary
					return (i+1,res)
				while True:
					if s[i] != '"':
						raiseError('Expected a string object key', i)
					i,key = parseString(i)
					i = skipSpace(i)
					if i >= len(s) or s[i] != ':':
						raiseError('Expected a colon', i)
					i,val = parse(i+1)
					res[key] = val
					i = skipSpace(i)
					if s[i] == '}':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected comma or closing curly brace', i)
					i = skipSpace(i+1)
			def parseArray(i):
				# Parse an array literal; s[i] is the opening '['.
				res = []
				i = skipSpace(i+1)
				if s[i] == ']': # Empty array
					return (i+1,res)
				while True:
					i,val = parse(i)
					res.append(val)
					i = skipSpace(i) # Raise exception if premature end
					if s[i] == ']':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected a comma or closing bracket', i)
					i = skipSpace(i+1)
			def parseDiscrete(i):
				# Parse the bare literals true / false / null.
				for k,v in {'true': True, 'false': False, 'null': None}.items():
					if s.startswith(k, i):
						return (i+len(k), v)
				raiseError('Not a boolean (or null)', i)
			def parseNumber(i):
				# Parse a number per JSON's grammar; '.', 'e' or 'E' => float.
				mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
				if mobj is None:
					raiseError('Not a number', i)
				nums = mobj.group(1)
				if '.' in nums or 'e' in nums or 'E' in nums:
					return (i+len(nums), float(nums))
				return (i+len(nums), int(nums))
			CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
			def parse(i):
				# Dispatch on the first significant character of a value;
				# anything not in CHARMAP is assumed to be a number.
				i = skipSpace(i)
				i,res = CHARMAP.get(s[i], parseNumber)(i)
				i = skipSpace(i, False)
				return (i,res)
			i,res = parse(0)
			if i < len(s):
				raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
			return res
194
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	try:
		pref = locale.getpreferredencoding()
		# Verify the reported encoding actually works before trusting it.
		u'TEST'.encode(pref)
	except Exception:
		# Broken or unknown locale: fall back to a sane default.
		pref = 'UTF-8'
	return pref
210
211
212 def htmlentity_transform(matchobj):
213         """Transforms an HTML entity to a Unicode character.
214
215         This function receives a match object and is intended to be used with
216         the re.sub() function.
217         """
218         entity = matchobj.group(1)
219
220         # Known non-numeric HTML entity
221         if entity in htmlentitydefs.name2codepoint:
222                 return unichr(htmlentitydefs.name2codepoint[entity])
223
224         # Unicode character
225         mobj = re.match(ur'(?u)#(x?\d+)', entity)
226         if mobj is not None:
227                 numstr = mobj.group(1)
228                 if numstr.startswith(u'x'):
229                         base = 16
230                         numstr = u'0%s' % numstr
231                 else:
232                         base = 10
233                 return unichr(long(numstr, base))
234
235         # Unknown entity in name, return its literal representation
236         return (u'&%s;' % entity)
237
238
def sanitize_title(utitle):
	"""Sanitizes a video title so it could be used as part of a filename."""
	# Decode HTML entities (&amp;, &#169;, ...) into real characters first.
	utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
	# A path separator would split the name into directories; mask it.
	return utitle.replace(unicode(os.sep), u'%')
243
244
def sanitize_open(filename, open_mode):
	"""Try to open the given filename, and slightly tweak it if this fails.

	Attempts to open the given filename. If this fails, it tries to change
	the filename slightly, step by step, until it's either able to open it
	or it fails and raises a final exception, like the standard open()
	function.

	It returns the tuple (stream, definitive_file_name).
	"""
	try:
		if filename == u'-':
			# '-' means standard output.
			if sys.platform == 'win32':
				import msvcrt
				# Switch stdout to binary mode so the video data is not
				# mangled by CRLF translation on Windows.
				msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
			return (sys.stdout, filename)
		stream = open(filename, open_mode)
		return (stream, filename)
	except (IOError, OSError), err:
		# In case of error, try to remove win32 forbidden chars
		filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

		# An exception here should be caught in the caller
		stream = open(filename, open_mode)
		return (stream, filename)
270
271
def timeconvert(timestr):
	"""Convert RFC 2822 defined time string into system timestamp.

	Returns None when the string cannot be parsed as an RFC 2822 date.
	"""
	parsed = email.utils.parsedate_tz(timestr)
	if parsed is None:
		return None
	return email.utils.mktime_tz(parsed)
279
280
class DownloadError(Exception):
	"""Raised when a download fails and errors are not being ignored.

	FileDownloader objects throw this exception, carrying an appropriate
	error message, unless they were configured to continue on errors.
	"""
	pass
289
290
class SameFileError(Exception):
	"""Raised when two downloads would collide on one output file.

	FileDownloader objects throw this when they detect that multiple
	files would have to be written to the same path on disk.
	"""
	pass
298
299
class PostProcessingError(Exception):
	"""Raised by a PostProcessor's .run() to signal a postprocessing failure."""
	pass
307
308
class UnavailableVideoError(Exception):
	"""Raised when a video is requested in a format it is not available in."""
	pass
316
317
class ContentTooShortError(Exception):
	"""Raised when a download delivers fewer bytes than announced.

	FileDownloader objects raise this when the received file is smaller
	than the size the server declared, which usually means the connection
	was interrupted.

	Attributes (both byte counts):
	downloaded -- number of bytes actually received
	expected   -- number of bytes the server announced
	"""
	# Both in bytes
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		self.downloaded = downloaded
		self.expected = expected
332
333
class YoutubeDLHandler(urllib2.HTTPHandler):
	"""Handler for HTTP requests and responses.

	This class, when installed with an OpenerDirector, automatically adds
	the standard headers to every HTTP request and handles gzipped and
	deflated responses from web servers. If compression is to be avoided in
	a particular request, the original request in the program code only has
	to include the HTTP header "Youtubedl-No-Compression", which will be
	removed before making the real request.

	Part of this code was copied from:

	http://techknack.net/python-urllib2-handlers/

	Andrew Rowls, the author of that code, agreed to release it to the
	public domain.
	"""

	@staticmethod
	def deflate(data):
		# Decompress a "deflate" body. Try raw deflate first (negative
		# wbits disables the zlib header), then fall back to zlib-wrapped
		# data, since servers disagree on which variant "deflate" means.
		try:
			return zlib.decompress(data, -zlib.MAX_WBITS)
		except zlib.error:
			return zlib.decompress(data)

	@staticmethod
	def addinfourl_wrapper(stream, headers, url, code):
		# Build an addinfourl that carries the response code; older Python
		# versions lack the code constructor argument, so set it by hand.
		if hasattr(urllib2.addinfourl, 'getcode'):
			return urllib2.addinfourl(stream, headers, url, code)
		ret = urllib2.addinfourl(stream, headers, url)
		ret.code = code
		return ret

	def http_request(self, req):
		# Force the standard headers onto the request, replacing any
		# caller-supplied values for the same header names.
		for h in std_headers:
			if h in req.headers:
				del req.headers[h]
			req.add_header(h, std_headers[h])
		# NOTE(review): keys are looked up in the capitalized form urllib2
		# appears to store them in ('Youtubedl-no-compression', not
		# 'Youtubedl-No-Compression') — confirm against urllib2.add_header.
		if 'Youtubedl-no-compression' in req.headers:
			if 'Accept-encoding' in req.headers:
				del req.headers['Accept-encoding']
			del req.headers['Youtubedl-no-compression']
		return req

	def http_response(self, req, resp):
		# Transparently decompress the body while preserving the original
		# headers, URL, status code and message.
		old_resp = resp
		# gzip
		if resp.headers.get('Content-encoding', '') == 'gzip':
			gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		# deflate
		if resp.headers.get('Content-encoding', '') == 'deflate':
			gz = StringIO.StringIO(self.deflate(resp.read()))
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		return resp
391
392
393 class FileDownloader(object):
394         """File Downloader class.
395
396         File downloader objects are the ones responsible of downloading the
397         actual video file and writing it to disk if the user has requested
398         it, among some other tasks. In most cases there should be one per
399         program. As, given a video URL, the downloader doesn't know how to
400         extract all the needed information, task that InfoExtractors do, it
401         has to pass the URL to one of them.
402
403         For this, file downloader objects have a method that allows
404         InfoExtractors to be registered in a given order. When it is passed
405         a URL, the file downloader handles it to the first InfoExtractor it
406         finds that reports being able to handle it. The InfoExtractor extracts
407         all the information about the video or videos the URL refers to, and
408         asks the FileDownloader to process the video information, possibly
409         downloading the video.
410
411         File downloaders accept a lot of parameters. In order not to saturate
412         the object constructor with arguments, it receives a dictionary of
413         options instead. These options are available through the params
414         attribute for the InfoExtractors to use. The FileDownloader also
415         registers itself as the downloader in charge for the InfoExtractors
416         that are added to it, so this is a "mutual registration".
417
418         Available options:
419
420         username:         Username for authentication purposes.
421         password:         Password for authentication purposes.
422         usenetrc:         Use netrc for authentication instead.
423         quiet:            Do not print messages to stdout.
424         forceurl:         Force printing final URL.
425         forcetitle:       Force printing title.
426         forcethumbnail:   Force printing thumbnail URL.
427         forcedescription: Force printing description.
428         forcefilename:    Force printing final filename.
429         simulate:         Do not download the video files.
430         format:           Video format code.
431         format_limit:     Highest quality format to try.
432         outtmpl:          Template for output names.
433         ignoreerrors:     Do not stop on download errors.
434         ratelimit:        Download speed limit, in bytes/sec.
435         nooverwrites:     Prevent overwriting files.
436         retries:          Number of times to retry for HTTP error 5xx
437         continuedl:       Try to continue downloads if possible.
438         noprogress:       Do not print the progress bar.
439         playliststart:    Playlist item to start at.
440         playlistend:      Playlist item to end at.
441         matchtitle:       Download only matching titles.
442         rejecttitle:      Reject downloads for matching titles.
443         logtostderr:      Log messages to stderr instead of stdout.
444         consoletitle:     Display progress in console window's titlebar.
445         nopart:           Do not use temporary .part files.
446         updatetime:       Use the Last-modified header to set output file timestamps.
447         writedescription: Write the video description to a .description file
448         writeinfojson:    Write the video description to a .info.json file
449         """
450
451         params = None
452         _ies = []
453         _pps = []
454         _download_retcode = None
455         _num_downloads = None
456         _screen_file = None
457
458         def __init__(self, params):
459                 """Create a FileDownloader object with the given options."""
460                 self._ies = []
461                 self._pps = []
462                 self._download_retcode = 0
463                 self._num_downloads = 0
464                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
465                 self.params = params
466
467         @staticmethod
468         def format_bytes(bytes):
469                 if bytes is None:
470                         return 'N/A'
471                 if type(bytes) is str:
472                         bytes = float(bytes)
473                 if bytes == 0.0:
474                         exponent = 0
475                 else:
476                         exponent = long(math.log(bytes, 1024.0))
477                 suffix = 'bkMGTPEZY'[exponent]
478                 converted = float(bytes) / float(1024 ** exponent)
479                 return '%.2f%s' % (converted, suffix)
480
481         @staticmethod
482         def calc_percent(byte_counter, data_len):
483                 if data_len is None:
484                         return '---.-%'
485                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
486
487         @staticmethod
488         def calc_eta(start, now, total, current):
489                 if total is None:
490                         return '--:--'
491                 dif = now - start
492                 if current == 0 or dif < 0.001: # One millisecond
493                         return '--:--'
494                 rate = float(current) / dif
495                 eta = long((float(total) - float(current)) / rate)
496                 (eta_mins, eta_secs) = divmod(eta, 60)
497                 if eta_mins > 99:
498                         return '--:--'
499                 return '%02d:%02d' % (eta_mins, eta_secs)
500
501         @staticmethod
502         def calc_speed(start, now, bytes):
503                 dif = now - start
504                 if bytes == 0 or dif < 0.001: # One millisecond
505                         return '%10s' % '---b/s'
506                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
507
508         @staticmethod
509         def best_block_size(elapsed_time, bytes):
510                 new_min = max(bytes / 2.0, 1.0)
511                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
512                 if elapsed_time < 0.001:
513                         return long(new_max)
514                 rate = bytes / elapsed_time
515                 if rate > new_max:
516                         return long(new_max)
517                 if rate < new_min:
518                         return long(new_min)
519                 return long(rate)
520
521         @staticmethod
522         def parse_bytes(bytestr):
523                 """Parse a string indicating a byte quantity into a long integer."""
524                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
525                 if matchobj is None:
526                         return None
527                 number = float(matchobj.group(1))
528                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
529                 return long(round(number * multiplier))
530
531         def add_info_extractor(self, ie):
532                 """Add an InfoExtractor object to the end of the list."""
533                 self._ies.append(ie)
534                 ie.set_downloader(self)
535
536         def add_post_processor(self, pp):
537                 """Add a PostProcessor object to the end of the chain."""
538                 self._pps.append(pp)
539                 pp.set_downloader(self)
540
	def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode.

		skip_eol -- omit the trailing newline (used by the in-place
		            progress line)
		ignore_encoding_errors -- swallow UnicodeEncodeError instead of
		            propagating it
		"""
		try:
			if not self.params.get('quiet', False):
				terminator = [u'\n', u''][skip_eol]
				# The trailing comma suppresses print's own newline; the
				# terminator above controls the line ending instead.
				print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
			# Flushed even in quiet mode so buffered output is not held back.
			self._screen_file.flush()
		except (UnicodeEncodeError), err:
			if not ignore_encoding_errors:
				raise
551
	def to_stderr(self, message):
		"""Print message to stderr."""
		# Encode explicitly: stderr may reject non-ASCII unicode output.
		print >>sys.stderr, message.encode(preferredencoding())
555
	def to_cons_title(self, message):
		"""Set console/terminal window title to message."""
		# Only acts when the user enabled the 'consoletitle' option.
		if not self.params.get('consoletitle', False):
			return
		if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
			# c_wchar_p() might not be necessary if `message` is
			# already of type unicode()
			ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
		elif 'TERM' in os.environ:
			# xterm-style title escape sequence: ESC ] 0 ; <title> BEL
			sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
566
	def fixed_template(self):
		"""Checks if the output template is fixed.

		"Fixed" means it contains no %(field)s placeholders, so every
		download would be written to one and the same file.
		"""
		return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
570
571         def trouble(self, message=None):
572                 """Determine action to take when a download problem appears.
573
574                 Depending on if the downloader has been configured to ignore
575                 download errors or not, this method may throw an exception or
576                 not when errors are found, after printing the message.
577                 """
578                 if message is not None:
579                         self.to_stderr(message)
580                 if not self.params.get('ignoreerrors', False):
581                         raise DownloadError(message)
582                 self._download_retcode = 1
583
584         def slow_down(self, start_time, byte_counter):
585                 """Sleep if the download speed is over the rate limit."""
586                 rate_limit = self.params.get('ratelimit', None)
587                 if rate_limit is None or byte_counter == 0:
588                         return
589                 now = time.time()
590                 elapsed = now - start_time
591                 if elapsed <= 0.0:
592                         return
593                 speed = float(byte_counter) / elapsed
594                 if speed > rate_limit:
595                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
596
597         def temp_name(self, filename):
598                 """Returns a temporary filename for the given filename."""
599                 if self.params.get('nopart', False) or filename == u'-' or \
600                                 (os.path.exists(filename) and not os.path.isfile(filename)):
601                         return filename
602                 return filename + u'.part'
603
604         def undo_temp_name(self, filename):
605                 if filename.endswith(u'.part'):
606                         return filename[:-len(u'.part')]
607                 return filename
608
	def try_rename(self, old_filename, new_filename):
		"""Rename old_filename to new_filename, reporting trouble on failure."""
		try:
			# Nothing to do when the temp name equals the final name
			# (e.g. when .part files are disabled).
			if old_filename == new_filename:
				return
			os.rename(old_filename, new_filename)
		except (IOError, OSError), err:
			self.trouble(u'ERROR: unable to rename file')
616
617         def try_utime(self, filename, last_modified_hdr):
618                 """Try to set the last-modified time of the given file."""
619                 if last_modified_hdr is None:
620                         return
621                 if not os.path.isfile(filename):
622                         return
623                 timestr = last_modified_hdr
624                 if timestr is None:
625                         return
626                 filetime = timeconvert(timestr)
627                 if filetime is None:
628                         return filetime
629                 try:
630                         os.utime(filename, (time.time(), filetime))
631                 except:
632                         pass
633                 return filetime
634
635         def report_writedescription(self, descfn):
636                 """ Report that the description file is being written """
637                 self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
638
639         def report_writeinfojson(self, infofn):
640                 """ Report that the metadata file has been written """
641                 self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
642
643         def report_destination(self, filename):
644                 """Report destination filename."""
645                 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
646
647         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
648                 """Report download progress."""
649                 if self.params.get('noprogress', False):
650                         return
651                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
652                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
653                 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
654                                 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
655
656         def report_resuming_byte(self, resume_len):
657                 """Report attempt to resume at given byte."""
658                 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
659
660         def report_retry(self, count, retries):
661                 """Report retry in case of HTTP error 5xx"""
662                 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
663
	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		try:
			self.to_screen(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			# The filename may not be representable in the console
			# encoding; fall back to a message without it.
			self.to_screen(u'[download] The file has already been downloaded')
670
671         def report_unable_to_resume(self):
672                 """Report it was impossible to resume download."""
673                 self.to_screen(u'[download] Unable to resume')
674
675         def report_finish(self):
676                 """Report download finished."""
677                 if self.params.get('noprogress', False):
678                         self.to_screen(u'[download] Download completed')
679                 else:
680                         self.to_screen(u'')
681
682         def increment_downloads(self):
683                 """Increment the ordinal that assigns a number to each file."""
684                 self._num_downloads += 1
685
686         def prepare_filename(self, info_dict):
687                 """Generate the output filename."""
688                 try:
689                         template_dict = dict(info_dict)
690                         template_dict['epoch'] = unicode(long(time.time()))
691                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
692                         filename = self.params['outtmpl'] % template_dict
693                         return filename
694                 except (ValueError, KeyError), err:
695                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
696                         return None
697
	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor.

		Depending on self.params this prints requested metadata fields,
		applies title filters, writes the description / info-JSON
		sidecar files, downloads the video data and finally runs the
		postprocessing chain.
		"""
		filename = self.prepare_filename(info_dict)

		# Forced printings (--get-title, --get-url, ...): emit the
		# requested fields encoded for the console.
		if self.params.get('forcetitle', False):
			print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forceurl', False):
			print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
			print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcedescription', False) and 'description' in info_dict:
			print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcefilename', False) and filename is not None:
			print filename.encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forceformat', False):
			print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')

		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			return

		# prepare_filename() already reported the error in this case.
		if filename is None:
			return

		# Title-based filtering (--match-title / --reject-title).
		matchtitle=self.params.get('matchtitle',False)
		rejecttitle=self.params.get('rejecttitle',False)
		title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
		if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
			self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
			return
		if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
			self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
			return

		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists and will be skipped')
			return

		# Create the target directory if it does not exist yet.
		try:
			dn = os.path.dirname(filename)
			if dn != '' and not os.path.exists(dn):
				os.makedirs(dn)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to create directory ' + unicode(err))
			return

		# Optional <filename>.description sidecar file (UTF-8).
		if self.params.get('writedescription', False):
			try:
				descfn = filename + '.description'
				self.report_writedescription(descfn)
				descfile = open(descfn, 'wb')
				try:
					descfile.write(info_dict['description'].encode('utf-8'))
				finally:
					descfile.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write description file ' + descfn)
				return

		# Optional <filename>.info.json sidecar file.
		if self.params.get('writeinfojson', False):
			infofn = filename + '.info.json'
			self.report_writeinfojson(infofn)
			# Probe for a usable json module; it may be missing on
			# Python < 2.6 installations without simplejson.
			try:
				json.dump
			except (NameError,AttributeError):
				self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
				return
			try:
				infof = open(infofn, 'wb')
				try:
					# 'urlhandle' holds a live response object and is
					# not JSON-serializable, so strip it out.
					json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
					json.dump(json_info_dict, infof)
				finally:
					infof.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
				return

		if not self.params.get('skip_download', False):
			try:
				success = self._do_download(filename, info_dict)
			except (OSError, IOError), err:
				raise UnavailableVideoError
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self.trouble(u'ERROR: unable to download video data: %s' % str(err))
				return
			except (ContentTooShortError, ), err:
				self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
				return

			# Postprocess only if the download actually succeeded.
			if success:
				try:
					self.post_process(filename, info_dict)
				except (PostProcessingError), err:
					self.trouble(u'ERROR: postprocessing: %s' % str(err))
					return
795
796         def download(self, url_list):
797                 """Download a given list of URLs."""
798                 if len(url_list) > 1 and self.fixed_template():
799                         raise SameFileError(self.params['outtmpl'])
800
801                 for url in url_list:
802                         suitable_found = False
803                         for ie in self._ies:
804                                 # Go to next InfoExtractor if not suitable
805                                 if not ie.suitable(url):
806                                         continue
807
808                                 # Suitable InfoExtractor found
809                                 suitable_found = True
810
811                                 # Extract information from URL and process it
812                                 ie.extract(url)
813
814                                 # Suitable InfoExtractor had been found; go to next URL
815                                 break
816
817                         if not suitable_found:
818                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
819
820                 return self._download_retcode
821
822         def post_process(self, filename, ie_info):
823                 """Run the postprocessing chain on the given file."""
824                 info = dict(ie_info)
825                 info['filepath'] = filename
826                 for pp in self._pps:
827                         info = pp.run(info)
828                         if info is None:
829                                 break
830
	def _download_with_rtmpdump(self, filename, url, player_url):
		"""Download an rtmp:// URL by shelling out to rtmpdump.

		Writes to a temporary file first and renames it on success.
		Returns True on success, False otherwise.
		"""
		self.report_destination(filename)
		tmpfilename = self.temp_name(filename)

		# Check for rtmpdump first
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrumpted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		# The [[], [...]][bool] idiom below is an old-style conditional:
		# the extra arguments are appended only when the condition holds.
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(tmpfilename)
			self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(tmpfilename)
			if prevsize == cursize and retval == 1:
				# Resuming made no progress; give up.
				break
			# Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
			if prevsize == cursize and retval == 2 and cursize > 1024:
				self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
				retval = 0
				break
		if retval == 0:
			self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
			self.try_rename(tmpfilename, filename)
			return True
		else:
			self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
			return False
867
	def _do_download(self, filename, info_dict):
		"""Download info_dict['url'] over HTTP (or via rtmpdump).

		Supports resuming a partial .part file, retrying on 5xx errors,
		adaptive block sizing, progress reporting and rate limiting.
		Returns True on success (including "already fully downloaded"),
		False on failure. Raises ContentTooShortError when the server
		sends fewer bytes than announced.
		"""
		url = info_dict['url']
		player_url = info_dict.get('player_url', None)

		# Check file already present
		if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
			self.report_file_already_downloaded(filename)
			return True

		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url, player_url)

		tmpfilename = self.temp_name(filename)
		stream = None

		# Do not include the Accept-Encoding header
		headers = {'Youtubedl-no-compression': 'True'}
		# basic_request is kept range-free as a fallback for HTTP 416.
		basic_request = urllib2.Request(url, None, headers)
		request = urllib2.Request(url, None, headers)

		# Establish possible resume length
		if os.path.isfile(tmpfilename):
			resume_len = os.path.getsize(tmpfilename)
		else:
			resume_len = 0

		open_mode = 'wb'
		if resume_len != 0:
			if self.params.get('continuedl', False):
				self.report_resuming_byte(resume_len)
				# Request only the missing tail and append to the file.
				request.add_header('Range','bytes=%d-' % resume_len)
				open_mode = 'ab'
			else:
				resume_len = 0

		count = 0
		retries = self.params.get('retries', 0)
		while count <= retries:
			# Establish connection
			try:
				# NOTE(review): the 'urlhandle' supplied by the extractor
				# is assigned and then immediately overwritten by the
				# urlopen() call below, so it is never actually reused —
				# looks like a bug; confirm intent before fixing.
				if count == 0 and 'urlhandle' in info_dict:
					data = info_dict['urlhandle']
				data = urllib2.urlopen(request)
				break
			except (urllib2.HTTPError, ), err:
				if (err.code < 500 or err.code >= 600) and err.code != 416:
					# Unexpected HTTP error
					raise
				elif err.code == 416:
					# Unable to resume (requested range not satisfiable)
					try:
						# Open the connection again without the range header
						data = urllib2.urlopen(basic_request)
						content_length = data.info()['Content-Length']
					except (urllib2.HTTPError, ), err:
						if err.code < 500 or err.code >= 600:
							raise
					else:
						# Examine the reported length
						if (content_length is not None and
								(resume_len - 100 < long(content_length) < resume_len + 100)):
							# The file had already been fully downloaded.
							# Explanation to the above condition: in issue #175 it was revealed that
							# YouTube sometimes adds or removes a few bytes from the end of the file,
							# changing the file size slightly and causing problems for some users. So
							# I decided to implement a suggested change and consider the file
							# completely downloaded if the file size differs less than 100 bytes from
							# the one in the hard drive.
							self.report_file_already_downloaded(filename)
							self.try_rename(tmpfilename, filename)
							return True
						else:
							# The length does not match, we start the download over
							self.report_unable_to_resume()
							open_mode = 'wb'
							break
			# Retry
			count += 1
			if count <= retries:
				self.report_retry(count, retries)

		if count > retries:
			self.trouble(u'ERROR: giving up after %s retries' % retries)
			return False

		data_len = data.info().get('Content-length', None)
		if data_len is not None:
			# Content-length covers only the requested range; add the
			# already-downloaded part to get the total file size.
			data_len = long(data_len) + resume_len
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0 + resume_len
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			if len(data_block) == 0:
				break
			byte_counter += len(data_block)

			# Open file just in time
			if stream is None:
				try:
					(stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
					assert stream is not None
					filename = self.undo_temp_name(tmpfilename)
					self.report_destination(filename)
				except (OSError, IOError), err:
					self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
					return False
			try:
				stream.write(data_block)
			except (IOError, OSError), err:
				self.trouble(u'\nERROR: unable to write data: %s' % str(err))
				return False
			# Adapt the block size to the observed throughput.
			block_size = self.best_block_size(after - before, len(data_block))

			# Progress message
			speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
			if data_len is None:
				self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
			else:
				percent_str = self.calc_percent(byte_counter, data_len)
				eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
				self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter - resume_len)

		if stream is None:
			self.trouble(u'\nERROR: Did not get any data blocks')
			return False
		stream.close()
		self.report_finish()
		if data_len is not None and byte_counter != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		self.try_rename(tmpfilename, filename)

		# Update file modification time
		if self.params.get('updatetime', True):
			info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))

		return True
1013
1014
class InfoExtractor(object):
	"""Common base for all site-specific information extractors.

	Given a URL, an extractor produces a dictionary with the real video
	URL and related metadata, which is then handed to the FileDownloader
	(which may download the video to disk, among other outcomes). Every
	returned dictionary must contain the following fields:

	id:		Video identifier.
	url:		Final video URL.
	uploader:	Nickname of the video uploader.
	title:		Literal title.
	stitle:		Simplified title.
	ext:		Video filename extension.
	format:		Video format.
	player_url:	SWF Player URL (may be None).

	The fields below are optional; their primary purpose is to let
	youtube-dl serve as the backend for a video search function, such
	as the one in youtube2mp3. They are only used when their respective
	forced printing functions are called:

	thumbnail:	Full URL to a video thumbnail image.
	description:	One-line video description.

	Subclasses should define a _VALID_URL regexp and re-define the
	_real_initialize() and _real_extract() methods; they should
	probably also be added to the list of extractors.
	"""

	# Becomes True once _real_initialize() has run.
	_ready = False
	# The FileDownloader this extractor reports to.
	_downloader = None

	def __init__(self, downloader=None):
		"""Create the extractor, optionally attaching a downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Return True when this extractor can handle the given URL."""
		return re.match(self._VALID_URL, url) is not None

	def initialize(self):
		"""Run one-time initialization (authentication, etc.) lazily."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Initialize if needed, then return the extracted info dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Attach the downloader this extractor should report to."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
1083
1084
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Group 1 captures the scheme/host/path prefix, group 2 the video id;
	# the trailing (?(1).+)? only allows extra characters when a prefix
	# was present (i.e. a bare id must match exactly).
	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
	# Visiting this URL forces the site language to English.
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	# Machine name looked up in ~/.netrc for stored credentials.
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
	# Format code -> container/file extension (formats missing here
	# default to 'flv' elsewhere).
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'44': 'webm',
		'45': 'webm',
	}
	# Format code -> 'HEIGHTxWIDTH' (height listed first, e.g. 720x1280).
	_video_dimensions = {
		'5': '240x400',
		'6': '???',
		'13': '???',
		'17': '144x176',
		'18': '360x640',
		'22': '720x1280',
		'34': '360x640',
		'35': '480x854',
		'37': '1080x1920',
		'38': '3072x4096',
		'43': '360x640',
		'44': '480x854',
		'45': '720x1280',
	}
	IE_NAME = u'youtube'
1122
1123         def report_lang(self):
1124                 """Report attempt to set language."""
1125                 self._downloader.to_screen(u'[youtube] Setting language')
1126
1127         def report_login(self):
1128                 """Report attempt to log in."""
1129                 self._downloader.to_screen(u'[youtube] Logging in')
1130
1131         def report_age_confirmation(self):
1132                 """Report attempt to confirm age."""
1133                 self._downloader.to_screen(u'[youtube] Confirming age')
1134
1135         def report_video_webpage_download(self, video_id):
1136                 """Report attempt to download video webpage."""
1137                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1138
1139         def report_video_info_webpage_download(self, video_id):
1140                 """Report attempt to download video info webpage."""
1141                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1142
1143         def report_information_extraction(self, video_id):
1144                 """Report attempt to extract video information."""
1145                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1146
	def report_unavailable_format(self, video_id, format):
		"""Report that the requested format is not available for the video."""
		self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1150
1151         def report_rtmp_download(self):
1152                 """Indicate the download will use the RTMP protocol."""
1153                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1154
1155         def _print_formats(self, formats):
1156                 print 'Available formats:'
1157                 for x in formats:
1158                         print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1159
	def _real_initialize(self):
		"""Set the site language and, when credentials are available,
		log in and confirm age before any extraction happens.

		Failures to set the language or to log in are reported as
		warnings; a failed age confirmation is reported via trouble().
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':		'/',
				'action_login':	'Log In',
				'username':	username,
				'password':	password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the response still contains the login form, the
			# credentials were not accepted.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':		'/',
				'action_confirm':	'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return
1228
	def _real_extract(self, url):
		"""Extract and download a single YouTube video.

		Scrapes the watch page and the get_video_info endpoint, picks the
		format(s) matching the user's --format/--format-limit settings and
		hands one info dict per chosen format to the FileDownloader.
		Errors are reported through self._downloader.trouble().
		"""
		# Extract video id from URL (group 2 of _VALID_URL)
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group(2)

		# Get video webpage; has_verified=1 skips the age-gate interstitial
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
		try:
			video_webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# Attempt to extract SWF player URL (needed later for rtmpdump);
		# the URL appears JS-escaped, so unescape the backslash sequences
		mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
		if mobj is not None:
			player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
		else:
			player_url = None

		# Get video info: try several 'el' parameter values until one
		# response contains a 'token' (different values work for embedded,
		# Vevo and ordinary videos)
		self.report_video_info_webpage_download(video_id)
		for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
			video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
					% (video_id, el_type))
			request = urllib2.Request(video_info_url)
			try:
				video_info_webpage = urllib2.urlopen(request).read()
				video_info = parse_qs(video_info_webpage)
				if 'token' in video_info:
					break
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
				return
		if 'token' not in video_info:
			if 'reason' in video_info:
				self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
			else:
				self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
			return

		# Start extracting information
		self.report_information_extraction(video_id)

		# uploader
		if 'author' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = urllib.unquote_plus(video_info['author'][0])

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = urllib.unquote_plus(video_info['title'][0])
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		# simplified title: collapse every run of disallowed characters to '_'
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
		simple_title = simple_title.strip(ur'_')

		# thumbnail image
		if 'thumbnail_url' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:	# don't panic if we can't find it
			video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

		# upload date: normalize separators to spaces, then try a few
		# strftime formats; result is YYYYMMDD on success
		upload_date = u'NA'
		mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
		if mobj is not None:
			upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
			format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
			for expression in format_expressions:
				try:
					upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
				except:
					# NOTE(review): bare except and no break after a successful
					# parse; later iterations fail harmlessly on the already
					# reformatted value, but if no format matches, upload_date
					# stays as the raw scraped string instead of u'NA'.
					pass

		# description: use lxml when available (the top-of-file import may
		# have failed, hence the NameError probe), else a regex fallback
		try:
			lxml.etree
		except NameError:
			video_description = u'No description available.'
			if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
				mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
				if mobj is not None:
					video_description = mobj.group(1).decode('utf-8')
		else:
			html_parser = lxml.etree.HTMLParser(encoding='utf-8')
			vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
			video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
			# TODO use another parser

		# token (guaranteed present by the loop above)
		video_token = urllib.unquote_plus(video_info['token'][0])

		# Decide which formats to download
		req_format = self._downloader.params.get('format', None)

		if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
			# RTMP stream: single URL, no itag/format selection possible
			self.report_rtmp_download()
			video_url_list = [(None, video_info['conn'][0])]
		elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
			# Build itag -> URL map from the comma-separated stream list
			url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
			url_data = [parse_qs(uds) for uds in url_data_strs]
			url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
			url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

			# _available_formats is ordered best-first; --format-limit caps
			# the list at the given quality
			format_limit = self._downloader.params.get('format_limit', None)
			if format_limit is not None and format_limit in self._available_formats:
				format_list = self._available_formats[self._available_formats.index(format_limit):]
			else:
				format_list = self._available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
				return
			if self._downloader.params.get('listformats', None):
				self._print_formats(existing_formats)
				return
			if req_format is None or req_format == 'best':
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif req_format == 'worst':
				video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
			elif req_format in ('-1', 'all'):
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
			else:
				# Specific formats. We pick the first in a slash-delimeted sequence.
				# For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
				req_formats = req_format.split('/')
				video_url_list = None
				for rf in req_formats:
					if rf in url_map:
						video_url_list = [(rf, url_map[rf])]
						break
				if video_url_list is None:
					self._downloader.trouble(u'ERROR: requested format not available')
					return
		else:
			self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
			return

		# One download per selected format
		for format_param, video_real_url in video_url_list:
			# At this point we have a new video
			self._downloader.increment_downloads()

			# Extension (flv is the historical default for unknown itags)
			video_extension = self._video_extensions.get(format_param, 'flv')

			try:
				# Process video information
				self._downloader.process_info({
					'id':		video_id.decode('utf-8'),
					'url':		video_real_url.decode('utf-8'),
					'uploader':	video_uploader.decode('utf-8'),
					'upload_date':	upload_date,
					'title':	video_title,
					'stitle':	simple_title,
					'ext':		video_extension.decode('utf-8'),
					'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
					'thumbnail':	video_thumbnail.decode('utf-8'),
					'description':	video_description,
					'player_url':	player_url,
				})
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
1402
1403
1404 class MetacafeIE(InfoExtractor):
1405         """Information Extractor for metacafe.com."""
1406
1407         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1408         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1409         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1410         _youtube_ie = None
1411         IE_NAME = u'metacafe'
1412
1413         def __init__(self, youtube_ie, downloader=None):
1414                 InfoExtractor.__init__(self, downloader)
1415                 self._youtube_ie = youtube_ie
1416
1417         def report_disclaimer(self):
1418                 """Report disclaimer retrieval."""
1419                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1420
1421         def report_age_confirmation(self):
1422                 """Report attempt to confirm age."""
1423                 self._downloader.to_screen(u'[metacafe] Confirming age')
1424
1425         def report_download_webpage(self, video_id):
1426                 """Report webpage download."""
1427                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1428
1429         def report_extraction(self, video_id):
1430                 """Report information extraction."""
1431                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1432
1433         def _real_initialize(self):
1434                 # Retrieve disclaimer
1435                 request = urllib2.Request(self._DISCLAIMER)
1436                 try:
1437                         self.report_disclaimer()
1438                         disclaimer = urllib2.urlopen(request).read()
1439                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1440                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1441                         return
1442
1443                 # Confirm age
1444                 disclaimer_form = {
1445                         'filters': '0',
1446                         'submit': "Continue - I'm over 18",
1447                         }
1448                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1449                 try:
1450                         self.report_age_confirmation()
1451                         disclaimer = urllib2.urlopen(request).read()
1452                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1453                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1454                         return
1455
1456         def _real_extract(self, url):
1457                 # Extract id and simplified title from URL
1458                 mobj = re.match(self._VALID_URL, url)
1459                 if mobj is None:
1460                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1461                         return
1462
1463                 video_id = mobj.group(1)
1464
1465                 # Check if video comes from YouTube
1466                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1467                 if mobj2 is not None:
1468                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1469                         return
1470
1471                 # At this point we have a new video
1472                 self._downloader.increment_downloads()
1473
1474                 simple_title = mobj.group(2).decode('utf-8')
1475
1476                 # Retrieve video webpage to extract further information
1477                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1478                 try:
1479                         self.report_download_webpage(video_id)
1480                         webpage = urllib2.urlopen(request).read()
1481                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1482                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1483                         return
1484
1485                 # Extract URL, uploader and title from webpage
1486                 self.report_extraction(video_id)
1487                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1488                 if mobj is not None:
1489                         mediaURL = urllib.unquote(mobj.group(1))
1490                         video_extension = mediaURL[-3:]
1491
1492                         # Extract gdaKey if available
1493                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1494                         if mobj is None:
1495                                 video_url = mediaURL
1496                         else:
1497                                 gdaKey = mobj.group(1)
1498                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1499                 else:
1500                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1501                         if mobj is None:
1502                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1503                                 return
1504                         vardict = parse_qs(mobj.group(1))
1505                         if 'mediaData' not in vardict:
1506                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1507                                 return
1508                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1509                         if mobj is None:
1510                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1511                                 return
1512                         mediaURL = mobj.group(1).replace('\\/', '/')
1513                         video_extension = mediaURL[-3:]
1514                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1515
1516                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1517                 if mobj is None:
1518                         self._downloader.trouble(u'ERROR: unable to extract title')
1519                         return
1520                 video_title = mobj.group(1).decode('utf-8')
1521                 video_title = sanitize_title(video_title)
1522
1523                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1524                 if mobj is None:
1525                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1526                         return
1527                 video_uploader = mobj.group(1)
1528
1529                 try:
1530                         # Process video information
1531                         self._downloader.process_info({
1532                                 'id':           video_id.decode('utf-8'),
1533                                 'url':          video_url.decode('utf-8'),
1534                                 'uploader':     video_uploader.decode('utf-8'),
1535                                 'upload_date':  u'NA',
1536                                 'title':        video_title,
1537                                 'stitle':       simple_title,
1538                                 'ext':          video_extension.decode('utf-8'),
1539                                 'format':       u'NA',
1540                                 'player_url':   None,
1541                         })
1542                 except UnavailableVideoError:
1543                         self._downloader.trouble(u'\nERROR: unable to download video')
1544
1545
1546 class DailymotionIE(InfoExtractor):
1547         """Information Extractor for Dailymotion"""
1548
1549         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1550         IE_NAME = u'dailymotion'
1551
1552         def __init__(self, downloader=None):
1553                 InfoExtractor.__init__(self, downloader)
1554
1555         def report_download_webpage(self, video_id):
1556                 """Report webpage download."""
1557                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1558
1559         def report_extraction(self, video_id):
1560                 """Report information extraction."""
1561                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1562
1563         def _real_initialize(self):
1564                 return
1565
1566         def _real_extract(self, url):
1567                 # Extract id and simplified title from URL
1568                 mobj = re.match(self._VALID_URL, url)
1569                 if mobj is None:
1570                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1571                         return
1572
1573                 # At this point we have a new video
1574                 self._downloader.increment_downloads()
1575                 video_id = mobj.group(1)
1576
1577                 simple_title = mobj.group(2).decode('utf-8')
1578                 video_extension = 'flv'
1579
1580                 # Retrieve video webpage to extract further information
1581                 request = urllib2.Request(url)
1582                 request.add_header('Cookie', 'family_filter=off')
1583                 try:
1584                         self.report_download_webpage(video_id)
1585                         webpage = urllib2.urlopen(request).read()
1586                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1587                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1588                         return
1589
1590                 # Extract URL, uploader and title from webpage
1591                 self.report_extraction(video_id)
1592                 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1593                 if mobj is None:
1594                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1595                         return
1596                 sequence = urllib.unquote(mobj.group(1))
1597                 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1598                 if mobj is None:
1599                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1600                         return
1601                 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1602
1603                 # if needed add http://www.dailymotion.com/ if relative URL
1604
1605                 video_url = mediaURL
1606
1607                 mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
1608                 if mobj is None:
1609                         self._downloader.trouble(u'ERROR: unable to extract title')
1610                         return
1611                 video_title = mobj.group(1).decode('utf-8')
1612                 video_title = sanitize_title(video_title)
1613
1614                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1615                 if mobj is None:
1616                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1617                         return
1618                 video_uploader = mobj.group(1)
1619
1620                 try:
1621                         # Process video information
1622                         self._downloader.process_info({
1623                                 'id':           video_id.decode('utf-8'),
1624                                 'url':          video_url.decode('utf-8'),
1625                                 'uploader':     video_uploader.decode('utf-8'),
1626                                 'upload_date':  u'NA',
1627                                 'title':        video_title,
1628                                 'stitle':       simple_title,
1629                                 'ext':          video_extension.decode('utf-8'),
1630                                 'format':       u'NA',
1631                                 'player_url':   None,
1632                         })
1633                 except UnavailableVideoError:
1634                         self._downloader.trouble(u'\nERROR: unable to download video')
1635
1636
1637 class GoogleIE(InfoExtractor):
1638         """Information extractor for video.google.com."""
1639
1640         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1641         IE_NAME = u'video.google'
1642
1643         def __init__(self, downloader=None):
1644                 InfoExtractor.__init__(self, downloader)
1645
1646         def report_download_webpage(self, video_id):
1647                 """Report webpage download."""
1648                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1649
1650         def report_extraction(self, video_id):
1651                 """Report information extraction."""
1652                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1653
1654         def _real_initialize(self):
1655                 return
1656
1657         def _real_extract(self, url):
1658                 # Extract id from URL
1659                 mobj = re.match(self._VALID_URL, url)
1660                 if mobj is None:
1661                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1662                         return
1663
1664                 # At this point we have a new video
1665                 self._downloader.increment_downloads()
1666                 video_id = mobj.group(1)
1667
1668                 video_extension = 'mp4'
1669
1670                 # Retrieve video webpage to extract further information
1671                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1672                 try:
1673                         self.report_download_webpage(video_id)
1674                         webpage = urllib2.urlopen(request).read()
1675                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1676                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1677                         return
1678
1679                 # Extract URL, uploader, and title from webpage
1680                 self.report_extraction(video_id)
1681                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1682                 if mobj is None:
1683                         video_extension = 'flv'
1684                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1685                 if mobj is None:
1686                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1687                         return
1688                 mediaURL = urllib.unquote(mobj.group(1))
1689                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1690                 mediaURL = mediaURL.replace('\\x26', '\x26')
1691
1692                 video_url = mediaURL
1693
1694                 mobj = re.search(r'<title>(.*)</title>', webpage)
1695                 if mobj is None:
1696                         self._downloader.trouble(u'ERROR: unable to extract title')
1697                         return
1698                 video_title = mobj.group(1).decode('utf-8')
1699                 video_title = sanitize_title(video_title)
1700                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1701
1702                 # Extract video description
1703                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1704                 if mobj is None:
1705                         self._downloader.trouble(u'ERROR: unable to extract video description')
1706                         return
1707                 video_description = mobj.group(1).decode('utf-8')
1708                 if not video_description:
1709                         video_description = 'No description available.'
1710
1711                 # Extract video thumbnail
1712                 if self._downloader.params.get('forcethumbnail', False):
1713                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1714                         try:
1715                                 webpage = urllib2.urlopen(request).read()
1716                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1717                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1718                                 return
1719                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1720                         if mobj is None:
1721                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1722                                 return
1723                         video_thumbnail = mobj.group(1)
1724                 else:   # we need something to pass to process_info
1725                         video_thumbnail = ''
1726
1727                 try:
1728                         # Process video information
1729                         self._downloader.process_info({
1730                                 'id':           video_id.decode('utf-8'),
1731                                 'url':          video_url.decode('utf-8'),
1732                                 'uploader':     u'NA',
1733                                 'upload_date':  u'NA',
1734                                 'title':        video_title,
1735                                 'stitle':       simple_title,
1736                                 'ext':          video_extension.decode('utf-8'),
1737                                 'format':       u'NA',
1738                                 'player_url':   None,
1739                         })
1740                 except UnavailableVideoError:
1741                         self._downloader.trouble(u'\nERROR: unable to download video')
1742
1743
1744 class PhotobucketIE(InfoExtractor):
1745         """Information extractor for photobucket.com."""
1746
1747         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1748         IE_NAME = u'photobucket'
1749
1750         def __init__(self, downloader=None):
1751                 InfoExtractor.__init__(self, downloader)
1752
1753         def report_download_webpage(self, video_id):
1754                 """Report webpage download."""
1755                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1756
1757         def report_extraction(self, video_id):
1758                 """Report information extraction."""
1759                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1760
1761         def _real_initialize(self):
1762                 return
1763
1764         def _real_extract(self, url):
1765                 # Extract id from URL
1766                 mobj = re.match(self._VALID_URL, url)
1767                 if mobj is None:
1768                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1769                         return
1770
1771                 # At this point we have a new video
1772                 self._downloader.increment_downloads()
1773                 video_id = mobj.group(1)
1774
1775                 video_extension = 'flv'
1776
1777                 # Retrieve video webpage to extract further information
1778                 request = urllib2.Request(url)
1779                 try:
1780                         self.report_download_webpage(video_id)
1781                         webpage = urllib2.urlopen(request).read()
1782                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1783                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1784                         return
1785
1786                 # Extract URL, uploader, and title from webpage
1787                 self.report_extraction(video_id)
1788                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1789                 if mobj is None:
1790                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1791                         return
1792                 mediaURL = urllib.unquote(mobj.group(1))
1793
1794                 video_url = mediaURL
1795
1796                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1797                 if mobj is None:
1798                         self._downloader.trouble(u'ERROR: unable to extract title')
1799                         return
1800                 video_title = mobj.group(1).decode('utf-8')
1801                 video_title = sanitize_title(video_title)
1802                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1803
1804                 video_uploader = mobj.group(2).decode('utf-8')
1805
1806                 try:
1807                         # Process video information
1808                         self._downloader.process_info({
1809                                 'id':           video_id.decode('utf-8'),
1810                                 'url':          video_url.decode('utf-8'),
1811                                 'uploader':     video_uploader,
1812                                 'upload_date':  u'NA',
1813                                 'title':        video_title,
1814                                 'stitle':       simple_title,
1815                                 'ext':          video_extension.decode('utf-8'),
1816                                 'format':       u'NA',
1817                                 'player_url':   None,
1818                         })
1819                 except UnavailableVideoError:
1820                         self._downloader.trouble(u'\nERROR: unable to download video')
1821
1822
1823 class YahooIE(InfoExtractor):
1824         """Information extractor for video.yahoo.com."""
1825
1826         # _VALID_URL matches all Yahoo! Video URLs
1827         # _VPAGE_URL matches only the extractable '/watch/' URLs
1828         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1829         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1830         IE_NAME = u'video.yahoo'
1831
1832         def __init__(self, downloader=None):
1833                 InfoExtractor.__init__(self, downloader)
1834
1835         def report_download_webpage(self, video_id):
1836                 """Report webpage download."""
1837                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1838
1839         def report_extraction(self, video_id):
1840                 """Report information extraction."""
1841                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1842
1843         def _real_initialize(self):
1844                 return
1845
1846         def _real_extract(self, url, new_video=True):
1847                 # Extract ID from URL
1848                 mobj = re.match(self._VALID_URL, url)
1849                 if mobj is None:
1850                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1851                         return
1852
1853                 # At this point we have a new video
1854                 self._downloader.increment_downloads()
1855                 video_id = mobj.group(2)
1856                 video_extension = 'flv'
1857
1858                 # Rewrite valid but non-extractable URLs as
1859                 # extractable English language /watch/ URLs
1860                 if re.match(self._VPAGE_URL, url) is None:
1861                         request = urllib2.Request(url)
1862                         try:
1863                                 webpage = urllib2.urlopen(request).read()
1864                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1865                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1866                                 return
1867
1868                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1869                         if mobj is None:
1870                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1871                                 return
1872                         yahoo_id = mobj.group(1)
1873
1874                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1875                         if mobj is None:
1876                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1877                                 return
1878                         yahoo_vid = mobj.group(1)
1879
1880                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1881                         return self._real_extract(url, new_video=False)
1882
1883                 # Retrieve video webpage to extract further information
1884                 request = urllib2.Request(url)
1885                 try:
1886                         self.report_download_webpage(video_id)
1887                         webpage = urllib2.urlopen(request).read()
1888                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1889                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1890                         return
1891
1892                 # Extract uploader and title from webpage
1893                 self.report_extraction(video_id)
1894                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1895                 if mobj is None:
1896                         self._downloader.trouble(u'ERROR: unable to extract video title')
1897                         return
1898                 video_title = mobj.group(1).decode('utf-8')
1899                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1900
1901                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1902                 if mobj is None:
1903                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1904                         return
1905                 video_uploader = mobj.group(1).decode('utf-8')
1906
1907                 # Extract video thumbnail
1908                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1909                 if mobj is None:
1910                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1911                         return
1912                 video_thumbnail = mobj.group(1).decode('utf-8')
1913
1914                 # Extract video description
1915                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1916                 if mobj is None:
1917                         self._downloader.trouble(u'ERROR: unable to extract video description')
1918                         return
1919                 video_description = mobj.group(1).decode('utf-8')
1920                 if not video_description:
1921                         video_description = 'No description available.'
1922
1923                 # Extract video height and width
1924                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1925                 if mobj is None:
1926                         self._downloader.trouble(u'ERROR: unable to extract video height')
1927                         return
1928                 yv_video_height = mobj.group(1)
1929
1930                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1931                 if mobj is None:
1932                         self._downloader.trouble(u'ERROR: unable to extract video width')
1933                         return
1934                 yv_video_width = mobj.group(1)
1935
1936                 # Retrieve video playlist to extract media URL
1937                 # I'm not completely sure what all these options are, but we
1938                 # seem to need most of them, otherwise the server sends a 401.
1939                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1940                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1941                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1942                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1943                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1944                 try:
1945                         self.report_download_webpage(video_id)
1946                         webpage = urllib2.urlopen(request).read()
1947                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1948                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1949                         return
1950
1951                 # Extract media URL from playlist XML
1952                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1953                 if mobj is None:
1954                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1955                         return
1956                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1957                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1958
1959                 try:
1960                         # Process video information
1961                         self._downloader.process_info({
1962                                 'id':           video_id.decode('utf-8'),
1963                                 'url':          video_url,
1964                                 'uploader':     video_uploader,
1965                                 'upload_date':  u'NA',
1966                                 'title':        video_title,
1967                                 'stitle':       simple_title,
1968                                 'ext':          video_extension.decode('utf-8'),
1969                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1970                                 'description':  video_description,
1971                                 'thumbnail':    video_thumbnail,
1972                                 'player_url':   None,
1973                         })
1974                 except UnavailableVideoError:
1975                         self._downloader.trouble(u'\nERROR: unable to download video')
1976
1977
1978 class VimeoIE(InfoExtractor):
1979         """Information extractor for vimeo.com."""
1980
1981         # _VALID_URL matches Vimeo URLs
1982         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1983         IE_NAME = u'vimeo'
1984
1985         def __init__(self, downloader=None):
1986                 InfoExtractor.__init__(self, downloader)
1987
1988         def report_download_webpage(self, video_id):
1989                 """Report webpage download."""
1990                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1991
1992         def report_extraction(self, video_id):
1993                 """Report information extraction."""
1994                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1995
1996         def _real_initialize(self):
1997                 return
1998
1999         def _real_extract(self, url, new_video=True):
2000                 # Extract ID from URL
2001                 mobj = re.match(self._VALID_URL, url)
2002                 if mobj is None:
2003                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2004                         return
2005
2006                 # At this point we have a new video
2007                 self._downloader.increment_downloads()
2008                 video_id = mobj.group(1)
2009
2010                 # Retrieve video webpage to extract further information
2011                 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
2012                 try:
2013                         self.report_download_webpage(video_id)
2014                         webpage = urllib2.urlopen(request).read()
2015                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2016                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2017                         return
2018
2019                 # Now we begin extracting as much information as we can from what we
2020                 # retrieved. First we extract the information common to all extractors,
2021                 # and latter we extract those that are Vimeo specific.
2022                 self.report_extraction(video_id)
2023
2024                 # Extract title
2025                 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2026                 if mobj is None:
2027                         self._downloader.trouble(u'ERROR: unable to extract video title')
2028                         return
2029                 video_title = mobj.group(1).decode('utf-8')
2030                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2031
2032                 # Extract uploader
2033                 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2034                 if mobj is None:
2035                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
2036                         return
2037                 video_uploader = mobj.group(1).decode('utf-8')
2038
2039                 # Extract video thumbnail
2040                 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2041                 if mobj is None:
2042                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2043                         return
2044                 video_thumbnail = mobj.group(1).decode('utf-8')
2045
2046                 # # Extract video description
2047                 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2048                 # if mobj is None:
2049                 #       self._downloader.trouble(u'ERROR: unable to extract video description')
2050                 #       return
2051                 # video_description = mobj.group(1).decode('utf-8')
2052                 # if not video_description: video_description = 'No description available.'
2053                 video_description = 'Foo.'
2054
2055                 # Vimeo specific: extract request signature
2056                 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2057                 if mobj is None:
2058                         self._downloader.trouble(u'ERROR: unable to extract request signature')
2059                         return
2060                 sig = mobj.group(1).decode('utf-8')
2061
2062                 # Vimeo specific: extract video quality information
2063                 mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)
2064                 if mobj is None:
2065                         self._downloader.trouble(u'ERROR: unable to extract video quality information')
2066                         return
2067                 quality = mobj.group(1).decode('utf-8')
2068
2069                 if int(quality) == 1:
2070                         quality = 'hd'
2071                 else:
2072                         quality = 'sd'
2073
2074                 # Vimeo specific: Extract request signature expiration
2075                 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2076                 if mobj is None:
2077                         self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2078                         return
2079                 sig_exp = mobj.group(1).decode('utf-8')
2080
2081                 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)
2082
2083                 try:
2084                         # Process video information
2085                         self._downloader.process_info({
2086                                 'id':           video_id.decode('utf-8'),
2087                                 'url':          video_url,
2088                                 'uploader':     video_uploader,
2089                                 'upload_date':  u'NA',
2090                                 'title':        video_title,
2091                                 'stitle':       simple_title,
2092                                 'ext':          u'mp4',
2093                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2094                                 'description':  video_description,
2095                                 'thumbnail':    video_thumbnail,
2096                                 'description':  video_description,
2097                                 'player_url':   None,
2098                         })
2099                 except UnavailableVideoError:
2100                         self._downloader.trouble(u'ERROR: unable to download video')
2101
2102
2103 class GenericIE(InfoExtractor):
2104         """Generic last-resort information extractor."""
2105
2106         _VALID_URL = r'.*'
2107         IE_NAME = u'generic'
2108
2109         def __init__(self, downloader=None):
2110                 InfoExtractor.__init__(self, downloader)
2111
2112         def report_download_webpage(self, video_id):
2113                 """Report webpage download."""
2114                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2115                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2116
2117         def report_extraction(self, video_id):
2118                 """Report information extraction."""
2119                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2120
2121         def _real_initialize(self):
2122                 return
2123
2124         def _real_extract(self, url):
2125                 # At this point we have a new video
2126                 self._downloader.increment_downloads()
2127
2128                 video_id = url.split('/')[-1]
2129                 request = urllib2.Request(url)
2130                 try:
2131                         self.report_download_webpage(video_id)
2132                         webpage = urllib2.urlopen(request).read()
2133                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2134                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2135                         return
2136                 except ValueError, err:
2137                         # since this is the last-resort InfoExtractor, if
2138                         # this error is thrown, it'll be thrown here
2139                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2140                         return
2141
2142                 self.report_extraction(video_id)
2143                 # Start with something easy: JW Player in SWFObject
2144                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2145                 if mobj is None:
2146                         # Broaden the search a little bit
2147                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2148                 if mobj is None:
2149                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2150                         return
2151
2152                 # It's possible that one of the regexes
2153                 # matched, but returned an empty group:
2154                 if mobj.group(1) is None:
2155                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2156                         return
2157
2158                 video_url = urllib.unquote(mobj.group(1))
2159                 video_id = os.path.basename(video_url)
2160
2161                 # here's a fun little line of code for you:
2162                 video_extension = os.path.splitext(video_id)[1][1:]
2163                 video_id = os.path.splitext(video_id)[0]
2164
2165                 # it's tempting to parse this further, but you would
2166                 # have to take into account all the variations like
2167                 #   Video Title - Site Name
2168                 #   Site Name | Video Title
2169                 #   Video Title - Tagline | Site Name
2170                 # and so on and so forth; it's just not practical
2171                 mobj = re.search(r'<title>(.*)</title>', webpage)
2172                 if mobj is None:
2173                         self._downloader.trouble(u'ERROR: unable to extract title')
2174                         return
2175                 video_title = mobj.group(1).decode('utf-8')
2176                 video_title = sanitize_title(video_title)
2177                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2178
2179                 # video uploader is domain name
2180                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2181                 if mobj is None:
2182                         self._downloader.trouble(u'ERROR: unable to extract title')
2183                         return
2184                 video_uploader = mobj.group(1).decode('utf-8')
2185
2186                 try:
2187                         # Process video information
2188                         self._downloader.process_info({
2189                                 'id':           video_id.decode('utf-8'),
2190                                 'url':          video_url.decode('utf-8'),
2191                                 'uploader':     video_uploader,
2192                                 'upload_date':  u'NA',
2193                                 'title':        video_title,
2194                                 'stitle':       simple_title,
2195                                 'ext':          video_extension.decode('utf-8'),
2196                                 'format':       u'NA',
2197                                 'player_url':   None,
2198                         })
2199                 except UnavailableVideoError, err:
2200                         self._downloader.trouble(u'\nERROR: unable to download video')
2201
2202
2203 class YoutubeSearchIE(InfoExtractor):
2204         """Information Extractor for YouTube search queries."""
2205         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2206         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2207         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2208         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2209         _youtube_ie = None
2210         _max_youtube_results = 1000
2211         IE_NAME = u'youtube:search'
2212
2213         def __init__(self, youtube_ie, downloader=None):
2214                 InfoExtractor.__init__(self, downloader)
2215                 self._youtube_ie = youtube_ie
2216
2217         def report_download_page(self, query, pagenum):
2218                 """Report attempt to download playlist page with given number."""
2219                 query = query.decode(preferredencoding())
2220                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2221
2222         def _real_initialize(self):
2223                 self._youtube_ie.initialize()
2224
2225         def _real_extract(self, query):
2226                 mobj = re.match(self._VALID_URL, query)
2227                 if mobj is None:
2228                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2229                         return
2230
2231                 prefix, query = query.split(':')
2232                 prefix = prefix[8:]
2233                 query = query.encode('utf-8')
2234                 if prefix == '':
2235                         self._download_n_results(query, 1)
2236                         return
2237                 elif prefix == 'all':
2238                         self._download_n_results(query, self._max_youtube_results)
2239                         return
2240                 else:
2241                         try:
2242                                 n = long(prefix)
2243                                 if n <= 0:
2244                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2245                                         return
2246                                 elif n > self._max_youtube_results:
2247                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2248                                         n = self._max_youtube_results
2249                                 self._download_n_results(query, n)
2250                                 return
2251                         except ValueError: # parsing prefix as integer fails
2252                                 self._download_n_results(query, 1)
2253                                 return
2254
2255         def _download_n_results(self, query, n):
2256                 """Downloads a specified number of results for a query"""
2257
2258                 video_ids = []
2259                 already_seen = set()
2260                 pagenum = 1
2261
2262                 while True:
2263                         self.report_download_page(query, pagenum)
2264                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2265                         request = urllib2.Request(result_url)
2266                         try:
2267                                 page = urllib2.urlopen(request).read()
2268                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2269                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2270                                 return
2271
2272                         # Extract video identifiers
2273                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2274                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2275                                 if video_id not in already_seen:
2276                                         video_ids.append(video_id)
2277                                         already_seen.add(video_id)
2278                                         if len(video_ids) == n:
2279                                                 # Specified n videos reached
2280                                                 for id in video_ids:
2281                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2282                                                 return
2283
2284                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2285                                 for id in video_ids:
2286                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2287                                 return
2288
2289                         pagenum = pagenum + 1
2290
2291
2292 class GoogleSearchIE(InfoExtractor):
2293         """Information Extractor for Google Video search queries."""
2294         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2295         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2296         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2297         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2298         _google_ie = None
2299         _max_google_results = 1000
2300         IE_NAME = u'video.google:search'
2301
2302         def __init__(self, google_ie, downloader=None):
2303                 InfoExtractor.__init__(self, downloader)
2304                 self._google_ie = google_ie
2305
2306         def report_download_page(self, query, pagenum):
2307                 """Report attempt to download playlist page with given number."""
2308                 query = query.decode(preferredencoding())
2309                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2310
2311         def _real_initialize(self):
2312                 self._google_ie.initialize()
2313
2314         def _real_extract(self, query):
2315                 mobj = re.match(self._VALID_URL, query)
2316                 if mobj is None:
2317                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2318                         return
2319
2320                 prefix, query = query.split(':')
2321                 prefix = prefix[8:]
2322                 query = query.encode('utf-8')
2323                 if prefix == '':
2324                         self._download_n_results(query, 1)
2325                         return
2326                 elif prefix == 'all':
2327                         self._download_n_results(query, self._max_google_results)
2328                         return
2329                 else:
2330                         try:
2331                                 n = long(prefix)
2332                                 if n <= 0:
2333                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2334                                         return
2335                                 elif n > self._max_google_results:
2336                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2337                                         n = self._max_google_results
2338                                 self._download_n_results(query, n)
2339                                 return
2340                         except ValueError: # parsing prefix as integer fails
2341                                 self._download_n_results(query, 1)
2342                                 return
2343
2344         def _download_n_results(self, query, n):
2345                 """Downloads a specified number of results for a query"""
2346
2347                 video_ids = []
2348                 already_seen = set()
2349                 pagenum = 1
2350
2351                 while True:
2352                         self.report_download_page(query, pagenum)
2353                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2354                         request = urllib2.Request(result_url)
2355                         try:
2356                                 page = urllib2.urlopen(request).read()
2357                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2358                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2359                                 return
2360
2361                         # Extract video identifiers
2362                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2363                                 video_id = mobj.group(1)
2364                                 if video_id not in already_seen:
2365                                         video_ids.append(video_id)
2366                                         already_seen.add(video_id)
2367                                         if len(video_ids) == n:
2368                                                 # Specified n videos reached
2369                                                 for id in video_ids:
2370                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2371                                                 return
2372
2373                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2374                                 for id in video_ids:
2375                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2376                                 return
2377
2378                         pagenum = pagenum + 1
2379
2380
2381 class YahooSearchIE(InfoExtractor):
2382         """Information Extractor for Yahoo! Video search queries."""
2383         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2384         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2385         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2386         _MORE_PAGES_INDICATOR = r'\s*Next'
2387         _yahoo_ie = None
2388         _max_yahoo_results = 1000
2389         IE_NAME = u'video.yahoo:search'
2390
2391         def __init__(self, yahoo_ie, downloader=None):
2392                 InfoExtractor.__init__(self, downloader)
2393                 self._yahoo_ie = yahoo_ie
2394
2395         def report_download_page(self, query, pagenum):
2396                 """Report attempt to download playlist page with given number."""
2397                 query = query.decode(preferredencoding())
2398                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2399
2400         def _real_initialize(self):
2401                 self._yahoo_ie.initialize()
2402
2403         def _real_extract(self, query):
2404                 mobj = re.match(self._VALID_URL, query)
2405                 if mobj is None:
2406                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2407                         return
2408
2409                 prefix, query = query.split(':')
2410                 prefix = prefix[8:]
2411                 query = query.encode('utf-8')
2412                 if prefix == '':
2413                         self._download_n_results(query, 1)
2414                         return
2415                 elif prefix == 'all':
2416                         self._download_n_results(query, self._max_yahoo_results)
2417                         return
2418                 else:
2419                         try:
2420                                 n = long(prefix)
2421                                 if n <= 0:
2422                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2423                                         return
2424                                 elif n > self._max_yahoo_results:
2425                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2426                                         n = self._max_yahoo_results
2427                                 self._download_n_results(query, n)
2428                                 return
2429                         except ValueError: # parsing prefix as integer fails
2430                                 self._download_n_results(query, 1)
2431                                 return
2432
2433         def _download_n_results(self, query, n):
2434                 """Downloads a specified number of results for a query"""
2435
2436                 video_ids = []
2437                 already_seen = set()
2438                 pagenum = 1
2439
2440                 while True:
2441                         self.report_download_page(query, pagenum)
2442                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2443                         request = urllib2.Request(result_url)
2444                         try:
2445                                 page = urllib2.urlopen(request).read()
2446                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2447                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2448                                 return
2449
2450                         # Extract video identifiers
2451                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2452                                 video_id = mobj.group(1)
2453                                 if video_id not in already_seen:
2454                                         video_ids.append(video_id)
2455                                         already_seen.add(video_id)
2456                                         if len(video_ids) == n:
2457                                                 # Specified n videos reached
2458                                                 for id in video_ids:
2459                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2460                                                 return
2461
2462                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2463                                 for id in video_ids:
2464                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2465                                 return
2466
2467                         pagenum = pagenum + 1
2468
2469
2470 class YoutubePlaylistIE(InfoExtractor):
2471         """Information Extractor for YouTube playlists."""
2472
2473         _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2474         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2475         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2476         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2477         _youtube_ie = None
2478         IE_NAME = u'youtube:playlist'
2479
2480         def __init__(self, youtube_ie, downloader=None):
2481                 InfoExtractor.__init__(self, downloader)
2482                 self._youtube_ie = youtube_ie
2483
2484         def report_download_page(self, playlist_id, pagenum):
2485                 """Report attempt to download playlist page with given number."""
2486                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2487
2488         def _real_initialize(self):
2489                 self._youtube_ie.initialize()
2490
2491         def _real_extract(self, url):
2492                 # Extract playlist id
2493                 mobj = re.match(self._VALID_URL, url)
2494                 if mobj is None:
2495                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2496                         return
2497
2498                 # Single video case
2499                 if mobj.group(3) is not None:
2500                         self._youtube_ie.extract(mobj.group(3))
2501                         return
2502
2503                 # Download playlist pages
2504                 # prefix is 'p' as default for playlists but there are other types that need extra care
2505                 playlist_prefix = mobj.group(1)
2506                 if playlist_prefix == 'a':
2507                         playlist_access = 'artist'
2508                 else:
2509                         playlist_prefix = 'p'
2510                         playlist_access = 'view_play_list'
2511                 playlist_id = mobj.group(2)
2512                 video_ids = []
2513                 pagenum = 1
2514
2515                 while True:
2516                         self.report_download_page(playlist_id, pagenum)
2517                         url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2518                         request = urllib2.Request(url)
2519                         try:
2520                                 page = urllib2.urlopen(request).read()
2521                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2522                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2523                                 return
2524
2525                         # Extract video identifiers
2526                         ids_in_page = []
2527                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2528                                 if mobj.group(1) not in ids_in_page:
2529                                         ids_in_page.append(mobj.group(1))
2530                         video_ids.extend(ids_in_page)
2531
2532                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2533                                 break
2534                         pagenum = pagenum + 1
2535
2536                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2537                 playlistend = self._downloader.params.get('playlistend', -1)
2538                 video_ids = video_ids[playliststart:playlistend]
2539
2540                 for id in video_ids:
2541                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2542                 return
2543
2544
2545 class YoutubeUserIE(InfoExtractor):
2546         """Information Extractor for YouTube users."""
2547
2548         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2549         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2550         _GDATA_PAGE_SIZE = 50
2551         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2552         _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2553         _youtube_ie = None
2554         IE_NAME = u'youtube:user'
2555
2556         def __init__(self, youtube_ie, downloader=None):
2557                 InfoExtractor.__init__(self, downloader)
2558                 self._youtube_ie = youtube_ie
2559
2560         def report_download_page(self, username, start_index):
2561                 """Report attempt to download user page."""
2562                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2563                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2564
2565         def _real_initialize(self):
2566                 self._youtube_ie.initialize()
2567
2568         def _real_extract(self, url):
2569                 # Extract username
2570                 mobj = re.match(self._VALID_URL, url)
2571                 if mobj is None:
2572                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2573                         return
2574
2575                 username = mobj.group(1)
2576
2577                 # Download video ids using YouTube Data API. Result size per
2578                 # query is limited (currently to 50 videos) so we need to query
2579                 # page by page until there are no video ids - it means we got
2580                 # all of them.
2581
2582                 video_ids = []
2583                 pagenum = 0
2584
2585                 while True:
2586                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2587                         self.report_download_page(username, start_index)
2588
2589                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2590
2591                         try:
2592                                 page = urllib2.urlopen(request).read()
2593                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2594                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2595                                 return
2596
2597                         # Extract video identifiers
2598                         ids_in_page = []
2599
2600                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2601                                 if mobj.group(1) not in ids_in_page:
2602                                         ids_in_page.append(mobj.group(1))
2603
2604                         video_ids.extend(ids_in_page)
2605
2606                         # A little optimization - if current page is not
2607                         # "full", ie. does not contain PAGE_SIZE video ids then
2608                         # we can assume that this page is the last one - there
2609                         # are no more ids on further pages - no need to query
2610                         # again.
2611
2612                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2613                                 break
2614
2615                         pagenum += 1
2616
2617                 all_ids_count = len(video_ids)
2618                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2619                 playlistend = self._downloader.params.get('playlistend', -1)
2620
2621                 if playlistend == -1:
2622                         video_ids = video_ids[playliststart:]
2623                 else:
2624                         video_ids = video_ids[playliststart:playlistend]
2625
2626                 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2627                                 (username, all_ids_count, len(video_ids)))
2628
2629                 for video_id in video_ids:
2630                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2631
2632
2633 class DepositFilesIE(InfoExtractor):
2634         """Information extractor for depositfiles.com"""
2635
2636         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2637         IE_NAME = u'DepositFiles'
2638
2639         def __init__(self, downloader=None):
2640                 InfoExtractor.__init__(self, downloader)
2641
2642         def report_download_webpage(self, file_id):
2643                 """Report webpage download."""
2644                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2645
2646         def report_extraction(self, file_id):
2647                 """Report information extraction."""
2648                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2649
2650         def _real_initialize(self):
2651                 return
2652
2653         def _real_extract(self, url):
2654                 # At this point we have a new file
2655                 self._downloader.increment_downloads()
2656
2657                 file_id = url.split('/')[-1]
2658                 # Rebuild url in english locale
2659                 url = 'http://depositfiles.com/en/files/' + file_id
2660
2661                 # Retrieve file webpage with 'Free download' button pressed
2662                 free_download_indication = { 'gateway_result' : '1' }
2663                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2664                 try:
2665                         self.report_download_webpage(file_id)
2666                         webpage = urllib2.urlopen(request).read()
2667                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2668                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2669                         return
2670
2671                 # Search for the real file URL
2672                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2673                 if (mobj is None) or (mobj.group(1) is None):
2674                         # Try to figure out reason of the error.
2675                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2676                         if (mobj is not None) and (mobj.group(1) is not None):
2677                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2678                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2679                         else:
2680                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2681                         return
2682
2683                 file_url = mobj.group(1)
2684                 file_extension = os.path.splitext(file_url)[1][1:]
2685
2686                 # Search for file title
2687                 mobj = re.search(r'<b title="(.*?)">', webpage)
2688                 if mobj is None:
2689                         self._downloader.trouble(u'ERROR: unable to extract title')
2690                         return
2691                 file_title = mobj.group(1).decode('utf-8')
2692
2693                 try:
2694                         # Process file information
2695                         self._downloader.process_info({
2696                                 'id':           file_id.decode('utf-8'),
2697                                 'url':          file_url.decode('utf-8'),
2698                                 'uploader':     u'NA',
2699                                 'upload_date':  u'NA',
2700                                 'title':        file_title,
2701                                 'stitle':       file_title,
2702                                 'ext':          file_extension.decode('utf-8'),
2703                                 'format':       u'NA',
2704                                 'player_url':   None,
2705                         })
2706                 except UnavailableVideoError, err:
2707                         self._downloader.trouble(u'ERROR: unable to download file')
2708
2709
class FacebookIE(InfoExtractor):
	"""Information Extractor for Facebook"""

	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
	_LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
	_NETRC_MACHINE = 'facebook'
	# Format identifiers as they appear in the page's JS, best quality first.
	_available_formats = ['video', 'highqual', 'lowqual']
	_video_extensions = {
		'video': 'mp4',
		'highqual': 'mp4',
		'lowqual': 'mp4',
	}
	IE_NAME = u'facebook'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def _reporter(self, message):
		"""Add header and report message."""
		self._downloader.to_screen(u'[facebook] %s' % message)

	def report_login(self):
		"""Report attempt to log in."""
		self._reporter(u'Logging in')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._reporter(u'%s: Downloading video webpage' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._reporter(u'%s: Extracting video information' % video_id)

	def _parse_page(self, video_webpage):
		"""Extract video information from page.

		Scrapes title/description/owner/thumbnail plus per-format video URLs
		out of JavaScript fragments embedded in the page.  Returns a dict;
		scalar keys are present only when their regex matched, and
		'video_urls' maps format name -> URL (possibly empty).
		"""
		# General data
		data = {'title': r'\("video_title", "(.*?)"\)',
			'description': r'<div class="datawrap">(.*?)</div>',
			'owner': r'\("video_owner_name", "(.*?)"\)',
			'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
			}
		video_info = {}
		for piece in data.keys():
			mobj = re.search(data[piece], video_webpage)
			if mobj is not None:
				# Values sit in JS string literals with \uXXXX escapes;
				# decode those before URL-unquoting.
				video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

		# Video urls
		video_urls = {}
		for fmt in self._available_formats:
			mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
			if mobj is not None:
				# URL is in a Javascript segment inside an escaped Unicode format within
				# the generally utf-8 page
				video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
		video_info['video_urls'] = video_urls

		return video_info

	def _real_initialize(self):
		"""Log in to Facebook with --username/--password or .netrc credentials.

		Logging in is best-effort: any failure only emits a WARNING and
		extraction proceeds unauthenticated.
		"""
		if self._downloader is None:
			return

		useremail = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			useremail = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					useremail = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# No credentials available: skip login entirely.
		if useremail is None:
			return

		# Log in
		login_form = {
			'email': useremail,
			'pass': password,
			'login': 'Log+In'
			}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# A login <form> in the response means we are still on the
			# login page, i.e. authentication did not succeed.
			if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Download the video page, parse it and process each selected format."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group('ID')

		# Get video webpage
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
		try:
			page = urllib2.urlopen(request)
			video_webpage = page.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# Start extracting information
		self.report_information_extraction(video_id)

		# Extract information
		video_info = self._parse_page(video_webpage)

		# uploader
		if 'owner' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = video_info['owner']

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = video_info['title']
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		# simplified title
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
		simple_title = simple_title.strip(ur'_')

		# thumbnail image (optional; extraction continues without it)
		if 'thumbnail' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:
			video_thumbnail = video_info['thumbnail']

		# upload date
		# NOTE(review): _parse_page never sets an 'upload_date' key, so this
		# branch is inert as written; kept for when the parser learns to
		# extract it.
		upload_date = u'NA'
		if 'upload_date' in video_info:
			upload_time = video_info['upload_date']
			timetuple = email.utils.parsedate_tz(upload_time)
			if timetuple is not None:
				try:
					# parsedate_tz returns a 10-tuple; strftime wants 9 fields.
					upload_date = time.strftime('%Y%m%d', timetuple[0:9])
				except:
					pass

		# description
		video_description = video_info.get('description', 'No description available.')

		url_map = video_info['video_urls']
		if len(url_map.keys()) > 0:
			# Decide which formats to download
			req_format = self._downloader.params.get('format', None)
			format_limit = self._downloader.params.get('format_limit', None)

			if format_limit is not None and format_limit in self._available_formats:
				format_list = self._available_formats[self._available_formats.index(format_limit):]
			else:
				format_list = self._available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
				return
			if req_format is None:
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif req_format == 'worst':
				video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
			elif req_format == '-1':
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
			else:
				# Specific format
				if req_format not in url_map:
					self._downloader.trouble(u'ERROR: requested format not available')
					return
				video_url_list = [(req_format, url_map[req_format])] # Specific format

		# NOTE(review): if url_map was empty, video_url_list is never bound
		# and the loop below raises NameError -- confirm intended handling.
		for format_param, video_real_url in video_url_list:

			# At this point we have a new video
			self._downloader.increment_downloads()

			# Extension
			video_extension = self._video_extensions.get(format_param, 'mp4')

			try:
				# Process video information
				self._downloader.process_info({
					'id':           video_id.decode('utf-8'),
					'url':          video_real_url.decode('utf-8'),
					'uploader':     video_uploader.decode('utf-8'),
					'upload_date':  upload_date,
					'title':        video_title,
					'stitle':       simple_title,
					'ext':          video_extension.decode('utf-8'),
					'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
					'thumbnail':    video_thumbnail.decode('utf-8'),
					'description':  video_description.decode('utf-8'),
					'player_url':   None,
				})
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
2927
2928 class BlipTVIE(InfoExtractor):
2929         """Information extractor for blip.tv"""
2930
2931         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2932         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2933         IE_NAME = u'blip.tv'
2934
2935         def report_extraction(self, file_id):
2936                 """Report information extraction."""
2937                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2938
2939         def report_direct_download(self, title):
2940                 """Report information extraction."""
2941                 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2942
2943         def _simplify_title(self, title):
2944                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2945                 res = res.strip(ur'_')
2946                 return res
2947
	def _real_extract(self, url):
		"""Extract video information from *url* and hand it to the downloader.

		Two server behaviors are handled: if the URL already serves the
		media file directly (Content-Type: video/*), it is downloaded
		as-is; otherwise the site is asked for JSON metadata by appending
		'skin=json&version=2&no_wrap=1' and the info dict is built from
		the parsed response.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Append the JSON-API query string with the correct separator.
		if '?' in url:
			cchar = '&'
		else:
			cchar = '?'
		json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
		request = urllib2.Request(json_url)
		self.report_extraction(mobj.group(1))
		info = None
		try:
			urlh = urllib2.urlopen(request)
			if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
				# The URL itself points at the media file: derive id,
				# title and extension from the last path component.
				basename = url.split('/')[-1]
				title,ext = os.path.splitext(basename)
				ext = ext.replace('.', '')
				self.report_direct_download(title)
				info = {
					'id': title,
					'url': url,
					'title': title,
					'stitle': self._simplify_title(title),
					'ext': ext,
					'urlhandle': urlh  # reuse the already-open handle
				}
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
			return
		if info is None: # Regular URL
			try:
				json_code = urlh.read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
				return

			try:
				json_data = json.loads(json_code)
				# Some responses wrap the payload in a 'Post' envelope.
				if 'Post' in json_data:
					data = json_data['Post']
				else:
					data = json_data

				# Normalize the site's datestamp to YYYYMMDD.
				upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
				video_url = data['media']['url']
				# Derive the file extension from the media URL.
				umobj = re.match(self._URL_EXT, video_url)
				if umobj is None:
					raise ValueError('Can not determine filename extension')
				ext = umobj.group(1)

				info = {
					'id': data['item_id'],
					'url': video_url,
					'uploader': data['display_name'],
					'upload_date': upload_date,
					'title': data['title'],
					'stitle': self._simplify_title(data['title']),
					'ext': ext,
					'format': data['media']['mimeType'],
					'thumbnail': data['thumbnailUrl'],
					'description': data['description'],
					'player_url': data['embedUrl']
				}
			except (ValueError,KeyError), err:
				# Covers both malformed JSON and missing metadata keys.
				self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
				return

		self._downloader.increment_downloads()

		try:
			self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download video')
3024
3025
3026 class MyVideoIE(InfoExtractor):
3027         """Information Extractor for myvideo.de."""
3028
3029         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3030         IE_NAME = u'myvideo'
3031
3032         def __init__(self, downloader=None):
3033                 InfoExtractor.__init__(self, downloader)
3034         
3035         def report_download_webpage(self, video_id):
3036                 """Report webpage download."""
3037                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3038
3039         def report_extraction(self, video_id):
3040                 """Report information extraction."""
3041                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3042
3043         def _real_initialize(self):
3044                 return
3045
3046         def _real_extract(self,url):
3047                 mobj = re.match(self._VALID_URL, url)
3048                 if mobj is None:
3049                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
3050                         return
3051
3052                 video_id = mobj.group(1)
3053                 simple_title = mobj.group(2).decode('utf-8')
3054                 # should actually not be necessary
3055                 simple_title = sanitize_title(simple_title)
3056                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title)
3057
3058                 # Get video webpage
3059                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3060                 try:
3061                         self.report_download_webpage(video_id)
3062                         webpage = urllib2.urlopen(request).read()
3063                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3064                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3065                         return
3066
3067                 self.report_extraction(video_id)
3068                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3069                                  webpage)
3070                 if mobj is None:
3071                         self._downloader.trouble(u'ERROR: unable to extract media URL')
3072                         return
3073                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3074
3075                 mobj = re.search('<title>([^<]+)</title>', webpage)
3076                 if mobj is None:
3077                         self._downloader.trouble(u'ERROR: unable to extract title')
3078                         return
3079
3080                 video_title = mobj.group(1)
3081                 video_title = sanitize_title(video_title)
3082
3083                 try:
3084                         self._downloader.process_info({
3085                                 'id':           video_id,
3086                                 'url':          video_url,
3087                                 'uploader':     u'NA',
3088                                 'upload_date':  u'NA',
3089                                 'title':        video_title,
3090                                 'stitle':       simple_title,
3091                                 'ext':          u'flv',
3092                                 'format':       u'NA',
3093                                 'player_url':   None,
3094                         })
3095                 except UnavailableVideoError:
3096                         self._downloader.trouble(u'\nERROR: Unable to download video')
3097
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Accepts either a ':tds'/':colbert'-style shortname or a
	# full-episodes URL on thedailyshow.com / colbertnation.com.
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	def report_extraction(self, episode_id):
		"""Report that information extraction has started."""
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

	def report_config_download(self, episode_id):
		"""Report the download of the per-media configuration XML."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		"""Report the download of the episode's MRSS index feed."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		"""Report the resolution of the Flash player URL."""
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _simplify_title(self, title):
		"""Collapse the title to the restricted character set, '_'-separated."""
		res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
		res = res.strip(ur'_')
		return res

	def _real_extract(self, url):
		"""Resolve an episode page into one or more downloadable media items.

		Flow: rewrite shortnames to the show's full-episodes URL, follow
		the redirect to a specific episode when needed, find the Flash
		movie params in the page, fetch the MRSS index feed, then fetch a
		configuration XML per item and download the best rendition.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# A shortname like ':tds' means "newest full episode of that show".
		if mobj.group('shortname'):
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = 'http://www.thedailyshow.com/full-episodes/'
			else:
				url = 'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# No explicit episode in the URL -> the site will redirect us to
		# the newest one; remember to re-parse after following it.
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			return
		if dlNewest:
			# geturl() reflects any redirect; it must now name an episode.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		# Each match is (full player URL, mgid-style URI after the host).
		mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
			return

		# Resolve the player URL through its redirect chain.
		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
			return

		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
			return

		# One <item> per media segment of the episode.
		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
						urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
				return

			# Collect (bitrate, url) pairs for every available rendition.
			cdoc = xml.etree.ElementTree.fromstring(configXml)
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			# For now, just pick the highest bitrate
			format,video_url = turls[-1]

			self._downloader.increment_downloads()

			effTitle = showId + '-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'stitle': self._simplify_title(effTitle),
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': playerUrl
			}

			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
				continue
3237
3238
3239 class EscapistIE(InfoExtractor):
3240         """Information extractor for The Escapist """
3241
3242         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3243         IE_NAME = u'escapist'
3244
3245         def report_extraction(self, showName):
3246                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3247
3248         def report_config_download(self, showName):
3249                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3250
3251         def _simplify_title(self, title):
3252                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3253                 res = res.strip(ur'_')
3254                 return res
3255
3256         def _real_extract(self, url):
3257                 htmlParser = HTMLParser.HTMLParser()
3258
3259                 mobj = re.match(self._VALID_URL, url)
3260                 if mobj is None:
3261                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3262                         return
3263                 showName = mobj.group('showname')
3264                 videoId = mobj.group('episode')
3265
3266                 self.report_extraction(showName)
3267                 try:
3268                         webPage = urllib2.urlopen(url).read()
3269                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3270                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3271                         return
3272
3273                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3274                 description = htmlParser.unescape(descMatch.group(1))
3275                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3276                 imgUrl = htmlParser.unescape(imgMatch.group(1))
3277                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3278                 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3279                 configUrlMatch = re.search('config=(.*)$', playerUrl)
3280                 configUrl = urllib2.unquote(configUrlMatch.group(1))
3281
3282                 self.report_config_download(showName)
3283                 try:
3284                         configJSON = urllib2.urlopen(configUrl).read()
3285                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3286                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3287                         return
3288
3289                 # Technically, it's JavaScript, not JSON
3290                 configJSON = configJSON.replace("'", '"')
3291
3292                 try:
3293                         config = json.loads(configJSON)
3294                 except (ValueError,), err:
3295                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3296                         return
3297
3298                 playlist = config['playlist']
3299                 videoUrl = playlist[1]['url']
3300
3301                 self._downloader.increment_downloads()
3302                 info = {
3303                         'id': videoId,
3304                         'url': videoUrl,
3305                         'uploader': showName,
3306                         'upload_date': None,
3307                         'title': showName,
3308                         'stitle': self._simplify_title(showName),
3309                         'ext': 'flv',
3310                         'format': 'flv',
3311                         'thumbnail': imgUrl,
3312                         'description': description,
3313                         'player_url': playerUrl,
3314                 }
3315
3316                 try:
3317                         self._downloader.process_info(info)
3318                 except UnavailableVideoError, err:
3319                         self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3320
3321
3322 class CollegeHumorIE(InfoExtractor):
3323         """Information extractor for collegehumor.com"""
3324
3325         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3326         IE_NAME = u'collegehumor'
3327
3328         def report_webpage(self, video_id):
3329                 """Report information extraction."""
3330                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3331
3332         def report_extraction(self, video_id):
3333                 """Report information extraction."""
3334                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3335
3336         def _simplify_title(self, title):
3337                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3338                 res = res.strip(ur'_')
3339                 return res
3340
3341         def _real_extract(self, url):
3342                 htmlParser = HTMLParser.HTMLParser()
3343
3344                 mobj = re.match(self._VALID_URL, url)
3345                 if mobj is None:
3346                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3347                         return
3348                 video_id = mobj.group('videoid')
3349
3350                 self.report_webpage(video_id)
3351                 request = urllib2.Request(url)
3352                 try:
3353                         webpage = urllib2.urlopen(request).read()
3354                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3355                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3356                         return
3357
3358                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3359                 if m is None:
3360                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3361                         return
3362                 internal_video_id = m.group('internalvideoid')
3363
3364                 info = {
3365                         'id': video_id,
3366                         'internal_id': internal_video_id,
3367                 }
3368
3369                 self.report_extraction(video_id)
3370                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3371                 try:
3372                         metaXml = urllib2.urlopen(xmlUrl).read()
3373                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3374                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3375                         return
3376
3377                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3378                 try:
3379                         videoNode = mdoc.findall('./video')[0]
3380                         info['description'] = videoNode.findall('./description')[0].text
3381                         info['title'] = videoNode.findall('./caption')[0].text
3382                         info['stitle'] = self._simplify_title(info['title'])
3383                         info['url'] = videoNode.findall('./file')[0].text
3384                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3385                         info['ext'] = info['url'].rpartition('.')[2]
3386                         info['format'] = info['ext']
3387                 except IndexError:
3388                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3389                         return
3390
3391                 self._downloader.increment_downloads()
3392
3393                 try:
3394                         self._downloader.process_info(info)
3395                 except UnavailableVideoError, err:
3396                         self._downloader.trouble(u'\nERROR: unable to download video')
3397
3398
3399 class XVideosIE(InfoExtractor):
3400         """Information extractor for xvideos.com"""
3401
3402         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3403         IE_NAME = u'xvideos'
3404
3405         def report_webpage(self, video_id):
3406                 """Report information extraction."""
3407                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3408
3409         def report_extraction(self, video_id):
3410                 """Report information extraction."""
3411                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3412
3413         def _simplify_title(self, title):
3414                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3415                 res = res.strip(ur'_')
3416                 return res
3417
3418         def _real_extract(self, url):
3419                 htmlParser = HTMLParser.HTMLParser()
3420
3421                 mobj = re.match(self._VALID_URL, url)
3422                 if mobj is None:
3423                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3424                         return
3425                 video_id = mobj.group(1).decode('utf-8')
3426
3427                 self.report_webpage(video_id)
3428
3429                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3430                 try:
3431                         webpage = urllib2.urlopen(request).read()
3432                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3433                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3434                         return
3435
3436                 self.report_extraction(video_id)
3437
3438
3439                 # Extract video URL
3440                 mobj = re.search(r'flv_url=(.+?)&', webpage)
3441                 if mobj is None:
3442                         self._downloader.trouble(u'ERROR: unable to extract video url')
3443                         return
3444                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3445
3446
3447                 # Extract title
3448                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3449                 if mobj is None:
3450                         self._downloader.trouble(u'ERROR: unable to extract video title')
3451                         return
3452                 video_title = mobj.group(1).decode('utf-8')
3453
3454
3455                 # Extract video thumbnail
3456                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3457                 if mobj is None:
3458                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3459                         return
3460                 video_thumbnail = mobj.group(1).decode('utf-8')
3461
3462
3463
3464                 self._downloader.increment_downloads()
3465                 info = {
3466                         'id': video_id,
3467                         'url': video_url,
3468                         'uploader': None,
3469                         'upload_date': None,
3470                         'title': video_title,
3471                         'stitle': self._simplify_title(video_title),
3472                         'ext': 'flv',
3473                         'format': 'flv',
3474                         'thumbnail': video_thumbnail,
3475                         'description': None,
3476                         'player_url': None,
3477                 }
3478
3479                 try:
3480                         self._downloader.process_info(info)
3481                 except UnavailableVideoError, err:
3482                         self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3483
3484
3485 class SoundcloudIE(InfoExtractor):
3486         """Information extractor for soundcloud.com
3487            To access the media, the uid of the song and a stream token
3488            must be extracted from the page source and the script must make
3489            a request to media.soundcloud.com/crossdomain.xml. Then
3490            the media can be grabbed by requesting from an url composed
3491            of the stream token and uid
3492          """
3493
3494         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3495         IE_NAME = u'soundcloud'
3496
3497         def __init__(self, downloader=None):
3498                 InfoExtractor.__init__(self, downloader)
3499
3500         def report_webpage(self, video_id):
3501                 """Report information extraction."""
3502                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3503
3504         def report_extraction(self, video_id):
3505                 """Report information extraction."""
3506                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3507
3508         def _real_initialize(self):
3509                 return
3510
3511         def _real_extract(self, url):
3512                 htmlParser = HTMLParser.HTMLParser()
3513
3514                 mobj = re.match(self._VALID_URL, url)
3515                 if mobj is None:
3516                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3517                         return
3518
3519                 # extract uploader (which is in the url)
3520                 uploader = mobj.group(1).decode('utf-8')
3521                 # extract simple title (uploader + slug of song title)
3522                 slug_title =  mobj.group(2).decode('utf-8')
3523                 simple_title = uploader + '-' + slug_title
3524
3525                 self.report_webpage('%s/%s' % (uploader, slug_title))
3526
3527                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3528                 try:
3529                         webpage = urllib2.urlopen(request).read()
3530                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3531                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3532                         return
3533
3534                 self.report_extraction('%s/%s' % (uploader, slug_title))
3535
3536                 # extract uid and stream token that soundcloud hands out for access
3537                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)   
3538                 if mobj:
3539                         video_id = mobj.group(1)
3540                         stream_token = mobj.group(2)
3541
3542                 # extract unsimplified title
3543                 mobj = re.search('"title":"(.*?)",', webpage)
3544                 if mobj:
3545                         title = mobj.group(1)
3546
3547                 # construct media url (with uid/token)
3548                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3549                 mediaURL = mediaURL % (video_id, stream_token)
3550
3551                 # description
3552                 description = u'No description available'
3553                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3554                 if mobj:
3555                         description = mobj.group(1)
3556                 
3557                 # upload date
3558                 upload_date = None
3559                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3560                 if mobj:
3561                         try:
3562                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3563                         except Exception as e:
3564                                 print str(e)
3565
3566                 # for soundcloud, a request to a cross domain is required for cookies
3567                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3568
3569                 try:
3570                         self._downloader.process_info({
3571                                 'id':           video_id.decode('utf-8'),
3572                                 'url':          mediaURL,
3573                                 'uploader':     uploader.decode('utf-8'),
3574                                 'upload_date':  upload_date,
3575                                 'title':        simple_title.decode('utf-8'),
3576                                 'stitle':       simple_title.decode('utf-8'),
3577                                 'ext':          u'mp3',
3578                                 'format':       u'NA',
3579                                 'player_url':   None,
3580                                 'description': description.decode('utf-8')
3581                         })
3582                 except UnavailableVideoError:
3583                         self._downloader.trouble(u'\nERROR: unable to download video')
3584
3585
class PostProcessor(object):
	"""Base class for post-download processing steps.

	Instances are registered on a downloader via its
	add_post_processor() method. After each successful download the
	downloader walks its chain of PostProcessors, calling run() first
	with an initial info dictionary and then with whatever the previous
	processor returned. A None return value stops the chain; so does
	reaching its end.

	Like InfoExtractor, this class takes part in a "mutual
	registration" handshake with its downloader.
	"""

	# The downloader this processor is attached to (may be set later).
	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach *downloader* as the owner of this post processor."""
		self._downloader = downloader

	def run(self, information):
		"""Process one finished download.

		*information* is an InfoExtractor-style dictionary extended
		with a 'filepath' key naming the downloaded file. Return a
		(possibly modified) dictionary to keep the chain going, or
		None to stop it. Implementations may raise
		PostProcessingError to signal failure to the downloader.
		"""
		return information # the base class simply passes data through
3631
3632
class FFmpegExtractAudioPP(PostProcessor):
	"""Post-processor that extracts the audio track of a downloaded video
	into a standalone audio file, using ffmpeg and ffprobe.
	"""

	def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
		"""
		preferredcodec: 'best', 'aac', 'mp3' or 'vorbis'; None means 'best'.
		preferredquality: ffmpeg audio bitrate specification (e.g. '128K'), or None.
		keepvideo: if False, the source video file is removed after extraction.
		"""
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec
		self._preferredquality = preferredquality
		self._keepvideo = keepvideo

	@staticmethod
	def get_audio_codec(path):
		"""Return the codec name of the audio stream in `path`, or None
		if it cannot be determined (ffprobe missing or failing)."""
		try:
			cmd = ['ffprobe', '-show_streams', '--', path]
			# open()/os.devnull instead of the deprecated file() builtin and
			# os.path.devnull; close the handle instead of leaking it.
			devnull = open(os.devnull, 'w')
			try:
				handle = subprocess.Popen(cmd, stderr=devnull, stdout=subprocess.PIPE)
				output = handle.communicate()[0]
			finally:
				devnull.close()
			if handle.wait() != 0:
				return None
		except (IOError, OSError):
			return None
		# ffprobe prints one key=value per line; the codec_name of a stream
		# appears before its codec_type, so remember the last one seen.
		audio_codec = None
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Run ffmpeg to copy/transcode the audio of `path` into `out_path`.
		Returns True on success, False otherwise."""
		try:
			cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
			devnull = open(os.devnull, 'w')
			try:
				ret = subprocess.call(cmd, stdout=devnull, stderr=subprocess.STDOUT)
			finally:
				devnull.close()
			return (ret == 0)
		except (IOError, OSError):
			return False

	def run(self, information):
		"""Extract audio from information['filepath'].

		Returns the information dictionary with 'filepath' pointing at the
		new audio file, or None to stop the post-processing chain on error.
		"""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
			if filecodec in ['aac', 'mp3', 'vorbis']:
				# Lossless stream copy if the source codec is already usable.
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					# Raw AAC needs an ADTS container to be playable.
					more_opts = ['-f', 'adts']
				if filecodec == 'vorbis':
					extension = 'ogg'
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = []
				if self._preferredquality is not None:
					more_opts += ['-ab', self._preferredquality]
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'vorbis': 'libvorbis'}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = []
			if self._preferredquality is not None:
				more_opts += ['-ab', self._preferredquality]
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']
			if self._preferredcodec == 'vorbis':
				extension = 'ogg'

		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		status = self.run_ffmpeg(path, new_path, acodec, more_opts)

		if not status:
			self._downloader.to_stderr(u'WARNING: error running ffmpeg')
			return None

		# Try to update the date time for extracted audio file.
		if information.get('filetime') is not None:
			try:
				os.utime(new_path, (time.time(), information['filetime']))
			except Exception:
				# Narrowed from a bare except: stay best-effort, but never
				# swallow KeyboardInterrupt/SystemExit.
				self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

		if not self._keepvideo:
			try:
				os.remove(path)
			except (IOError, OSError):
				self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
				return None

		information['filepath'] = new_path
		return information
3732
3733
def updateSelf(downloader, filename):
	'''Update the program file with the latest version from the repository.'''
	# Note: downloader only used for options
	if not os.access(filename, os.W_OK):
		sys.exit('ERROR: no write permissions on %s' % filename)

	downloader.to_screen('Updating to latest version...')

	urlh = None
	try:
		try:
			urlh = urllib.urlopen(UPDATE_URL)
			newcontent = urlh.read()

			vmatch = re.search("__version__ = '([^']+)'", newcontent)
			if vmatch is not None and vmatch.group(1) == __version__:
				downloader.to_screen('youtube-dl is up-to-date (' + __version__ + ')')
				return
		finally:
			# Guard the close: if urlopen itself failed, urlh was never
			# bound and the unguarded close() raised a NameError here
			# instead of reaching the clean error message below.
			if urlh is not None:
				urlh.close()
	except (IOError, OSError):
		sys.exit('ERROR: unable to download latest version')

	try:
		outf = open(filename, 'wb')
		try:
			outf.write(newcontent)
		finally:
			outf.close()
	except (IOError, OSError):
		sys.exit('ERROR: unable to overwrite current version')

	downloader.to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.')
3766
def parseOpts():
	"""Build the optparse option parser and parse the command line.

	Returns (parser, opts, args) so that callers can both read the parsed
	options and use parser.error() for their own validation.
	"""
	# Deferred imports
	import getpass
	import optparse

	def _format_option_string(option):
		''' ('-o', '--option') -> -o, --format METAVAR'''

		opts = []

		if option._short_opts: opts.append(option._short_opts[0])
		if option._long_opts: opts.append(option._long_opts[0])
		if len(opts) > 1: opts.insert(1, ', ')

		if option.takes_value(): opts.append(' %s' % option.metavar)

		return "".join(opts)

	def _find_term_columns():
		'''Best-effort detection of the terminal width; returns None if unknown.'''
		columns = os.environ.get('COLUMNS', None)
		if columns:
			try:
				return int(columns)
			except ValueError:
				pass # malformed COLUMNS value: fall back to stty

		try:
			sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			out,err = sp.communicate()
			return int(out.split()[1])
		except Exception:
			# Best-effort only (no stty, unparsable output, ...); narrowed
			# from a bare except so KeyboardInterrupt/SystemExit propagate.
			pass
		return None

	max_width = 80
	max_help_position = 80

	# No need to wrap help messages if we're on a wide console
	columns = _find_term_columns()
	if columns: max_width = columns

	fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
	fmt.format_option_strings = _format_option_string

	kw = {
		'version'   : __version__,
		'formatter' : fmt,
		'usage' : '%prog [options] url [url...]',
		'conflict_handler' : 'resolve',
	}

	parser = optparse.OptionParser(**kw)

	# option groups
	general        = optparse.OptionGroup(parser, 'General Options')
	selection      = optparse.OptionGroup(parser, 'Video Selection')
	authentication = optparse.OptionGroup(parser, 'Authentication Options')
	video_format   = optparse.OptionGroup(parser, 'Video Format Options')
	postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
	filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
	verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

	general.add_option('-h', '--help',
			action='help', help='print this help text and exit')
	general.add_option('-v', '--version',
			action='version', help='print program version and exit')
	general.add_option('-U', '--update',
			action='store_true', dest='update_self', help='update this program to latest version')
	general.add_option('-i', '--ignore-errors',
			action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
	general.add_option('-r', '--rate-limit',
			dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
	general.add_option('-R', '--retries',
			dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
	general.add_option('--dump-user-agent',
			action='store_true', dest='dump_user_agent',
			help='display the current browser identification', default=False)
	general.add_option('--list-extractors',
			action='store_true', dest='list_extractors',
			help='List all supported extractors and the URLs they would handle', default=False)

	selection.add_option('--playlist-start',
			dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
	selection.add_option('--playlist-end',
			dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
	selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
	selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')

	authentication.add_option('-u', '--username',
			dest='username', metavar='USERNAME', help='account username')
	authentication.add_option('-p', '--password',
			dest='password', metavar='PASSWORD', help='account password')
	authentication.add_option('-n', '--netrc',
			action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)


	video_format.add_option('-f', '--format',
			action='store', dest='format', metavar='FORMAT', help='video format code')
	video_format.add_option('--all-formats',
			action='store_const', dest='format', help='download all available video formats', const='all')
	video_format.add_option('--max-quality',
			action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
	video_format.add_option('-F', '--list-formats',
			action='store_true', dest='listformats', help='list all available formats (currently youtube only)')


	verbosity.add_option('-q', '--quiet',
			action='store_true', dest='quiet', help='activates quiet mode', default=False)
	verbosity.add_option('-s', '--simulate',
			action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
	verbosity.add_option('--skip-download',
			action='store_true', dest='skip_download', help='do not download the video', default=False)
	verbosity.add_option('-g', '--get-url',
			action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
	verbosity.add_option('-e', '--get-title',
			action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
	verbosity.add_option('--get-thumbnail',
			action='store_true', dest='getthumbnail',
			help='simulate, quiet but print thumbnail URL', default=False)
	verbosity.add_option('--get-description',
			action='store_true', dest='getdescription',
			help='simulate, quiet but print video description', default=False)
	verbosity.add_option('--get-filename',
			action='store_true', dest='getfilename',
			help='simulate, quiet but print output filename', default=False)
	verbosity.add_option('--get-format',
			action='store_true', dest='getformat',
			help='simulate, quiet but print output format', default=False)
	verbosity.add_option('--no-progress',
			action='store_true', dest='noprogress', help='do not print progress bar', default=False)
	verbosity.add_option('--console-title',
			action='store_true', dest='consoletitle',
			help='display progress in console titlebar', default=False)


	filesystem.add_option('-t', '--title',
			action='store_true', dest='usetitle', help='use title in file name', default=False)
	filesystem.add_option('-l', '--literal',
			action='store_true', dest='useliteral', help='use literal title in file name', default=False)
	filesystem.add_option('-A', '--auto-number',
			action='store_true', dest='autonumber',
			help='number downloaded files starting from 00000', default=False)
	filesystem.add_option('-o', '--output',
			dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, and %% for a literal percent')
	filesystem.add_option('-a', '--batch-file',
			dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
	filesystem.add_option('-w', '--no-overwrites',
			action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
	filesystem.add_option('-c', '--continue',
			action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
	filesystem.add_option('--no-continue',
			action='store_false', dest='continue_dl',
			help='do not resume partially downloaded files (restart from beginning)')
	filesystem.add_option('--cookies',
			dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
	filesystem.add_option('--no-part',
			action='store_true', dest='nopart', help='do not use .part files', default=False)
	filesystem.add_option('--no-mtime',
			action='store_false', dest='updatetime',
			help='do not use the Last-modified header to set the file modification time', default=True)
	filesystem.add_option('--write-description',
			action='store_true', dest='writedescription',
			help='write video description to a .description file', default=False)
	filesystem.add_option('--write-info-json',
			action='store_true', dest='writeinfojson',
			help='write video metadata to a .info.json file', default=False)


	postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
			help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
	postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
			help='"best", "aac", "vorbis" or "mp3"; best by default')
	postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
			help='ffmpeg audio bitrate specification, 128k by default')
	postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
			help='keeps the video file on disk after the post-processing; the video is erased by default')


	parser.add_option_group(general)
	parser.add_option_group(selection)
	parser.add_option_group(filesystem)
	parser.add_option_group(verbosity)
	parser.add_option_group(video_format)
	parser.add_option_group(authentication)
	parser.add_option_group(postproc)

	opts, args = parser.parse_args()

	return parser, opts, args
3953
def gen_extractors():
	"""Return a list containing an instance of every supported extractor.

	Order matters: the first extractor whose suitable() accepts a URL is
	the one that handles it, so the generic fallback must remain last.
	"""
	youtube = YoutubeIE()
	google = GoogleIE()
	yahoo = YahooIE()
	extractors = [
		YoutubePlaylistIE(youtube),
		YoutubeUserIE(youtube),
		YoutubeSearchIE(youtube),
		youtube,
		MetacafeIE(youtube),
		DailymotionIE(),
		google,
		GoogleSearchIE(google),
		PhotobucketIE(),
		yahoo,
		YahooSearchIE(yahoo),
		DepositFilesIE(),
		FacebookIE(),
		BlipTVIE(),
		VimeoIE(),
		MyVideoIE(),
		ComedyCentralIE(),
		EscapistIE(),
		CollegeHumorIE(),
		XVideosIE(),
		SoundcloudIE(),
		# Catch-all extractor: keep it at the very end.
		GenericIE(),
	]
	return extractors
3986
def main():
	"""Command-line entry point: parse and validate options, configure
	urllib2/cookies, build the FileDownloader and run it on every URL."""
	parser, opts, args = parseOpts()

	# Open appropriate CookieJar
	if opts.cookiefile is None:
		jar = cookielib.CookieJar()
	else:
		try:
			jar = cookielib.MozillaCookieJar(opts.cookiefile)
			# Only load the file if it already exists and is readable;
			# a brand-new cookie file is created on save instead.
			if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
				jar.load()
		except (IOError, OSError), err:
			sys.exit(u'ERROR: unable to open cookie file')

	# Dump user agent
	if opts.dump_user_agent:
		print std_headers['User-Agent']
		sys.exit(0)

	# Batch file verification
	batchurls = []
	if opts.batchfile is not None:
		try:
			if opts.batchfile == '-':
				batchfd = sys.stdin
			else:
				batchfd = open(opts.batchfile, 'r')
			batchurls = batchfd.readlines()
			batchurls = [x.strip() for x in batchurls]
			# Drop blank lines and comment lines starting with '#', '/' or ';'
			batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
		except IOError:
			sys.exit(u'ERROR: batch file could not be read')
	# Batch-file URLs are processed before the command-line ones.
	all_urls = batchurls + args

	# General configuration
	cookie_processor = urllib2.HTTPCookieProcessor(jar)
	opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
	urllib2.install_opener(opener)
	socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

	extractors = gen_extractors()

	if opts.list_extractors:
		# For each extractor, list its name and which of the given URLs
		# it would handle; each URL is claimed by at most one extractor.
		for ie in extractors:
			print(ie.IE_NAME)
			matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
			all_urls = filter(lambda url: url not in matchedUrls, all_urls)
			for mu in matchedUrls:
				print(u'  ' + mu)
		sys.exit(0)

	# Conflicting, missing and erroneous options
	if opts.usenetrc and (opts.username is not None or opts.password is not None):
		parser.error(u'using .netrc conflicts with giving username/password')
	if opts.password is not None and opts.username is None:
		parser.error(u'account username missing')
	if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
		parser.error(u'using output template conflicts with using title, literal title or auto number')
	if opts.usetitle and opts.useliteral:
		parser.error(u'using title conflicts with using literal title')
	if opts.username is not None and opts.password is None:
		# Username given without password: prompt interactively.
		opts.password = getpass.getpass(u'Type account password and press return:')
	if opts.ratelimit is not None:
		numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
		if numeric_limit is None:
			parser.error(u'invalid rate limit specified')
		opts.ratelimit = numeric_limit
	if opts.retries is not None:
		try:
			opts.retries = long(opts.retries)
		except (TypeError, ValueError), err:
			parser.error(u'invalid retry count specified')
	try:
		opts.playliststart = int(opts.playliststart)
		if opts.playliststart <= 0:
			raise ValueError(u'Playlist start must be positive')
	except (TypeError, ValueError), err:
		parser.error(u'invalid playlist start number specified')
	try:
		opts.playlistend = int(opts.playlistend)
		# -1 is the sentinel for "until the end of the playlist"
		if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
			raise ValueError(u'Playlist end must be greater than playlist start')
	except (TypeError, ValueError), err:
		parser.error(u'invalid playlist end number specified')
	if opts.extractaudio:
		if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis']:
			parser.error(u'invalid audio format specified')

	# File downloader
	fd = FileDownloader({
		'usenetrc': opts.usenetrc,
		'username': opts.username,
		'password': opts.password,
		'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
		'forceurl': opts.geturl,
		'forcetitle': opts.gettitle,
		'forcethumbnail': opts.getthumbnail,
		'forcedescription': opts.getdescription,
		'forcefilename': opts.getfilename,
		'forceformat': opts.getformat,
		'simulate': opts.simulate,
		'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
		'format': opts.format,
		'format_limit': opts.format_limit,
		'listformats': opts.listformats,
		# Template cascade: an explicit -o wins; otherwise the first
		# template matching the title/literal/autonumber flags is used.
		'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
			or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
			or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
			or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
			or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
			or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
			or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
			or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
			or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
			or u'%(id)s.%(ext)s'),
		'ignoreerrors': opts.ignoreerrors,
		'ratelimit': opts.ratelimit,
		'nooverwrites': opts.nooverwrites,
		'retries': opts.retries,
		'continuedl': opts.continue_dl,
		'noprogress': opts.noprogress,
		'playliststart': opts.playliststart,
		'playlistend': opts.playlistend,
		'logtostderr': opts.outtmpl == '-',
		'consoletitle': opts.consoletitle,
		'nopart': opts.nopart,
		'updatetime': opts.updatetime,
		'writedescription': opts.writedescription,
		'writeinfojson': opts.writeinfojson,
		'matchtitle': opts.matchtitle,
		'rejecttitle': opts.rejecttitle,
		})
	for extractor in extractors:
		fd.add_info_extractor(extractor)

	# PostProcessors
	if opts.extractaudio:
		fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))

	# Update version
	if opts.update_self:
		updateSelf(fd, sys.argv[0])

	# Maybe do nothing
	if len(all_urls) < 1:
		if not opts.update_self:
			parser.error(u'you must provide at least one URL')
		else:
			sys.exit()
	retcode = fd.download(all_urls)

	# Dump cookie jar if requested
	if opts.cookiefile is not None:
		try:
			jar.save()
		except (IOError, OSError), err:
			sys.exit(u'ERROR: unable to save cookie jar')

	sys.exit(retcode)
4146
4147
# Script entry point: translate the known fatal exceptions into clean exit
# codes/messages instead of letting tracebacks reach the user.
if __name__ == '__main__':
	try:
		main()
	except DownloadError:
		# Already reported by the downloader; just signal failure.
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')
4157
4158 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: