6dac70b68b0d486bf0d54fa5d630ba8e26452dde
[youtube-dl.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 __author__  = (
5         'Ricardo Garcia Gonzalez',
6         'Danny Colligan',
7         'Benjamin Johnson',
8         'Vasyl\' Vavrychuk',
9         'Witold Baryluk',
10         'Paweł Paprota',
11         'Gergely Imreh',
12         'Rogério Brito',
13         'Philipp Hagemeister',
14         'Sören Schulze',
15         'Kevin Ngo',
16         'Ori Avtalion',
17         'shizeeg',
18         )
19
20 __license__ = 'Public Domain'
21 __version__ = '2011.11.23'
22
23 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
24
25 import cookielib
26 import datetime
27 import gzip
28 import htmlentitydefs
29 import HTMLParser
30 import httplib
31 import locale
32 import math
33 import netrc
34 import os
35 import os.path
36 import re
37 import socket
38 import string
39 import subprocess
40 import sys
41 import time
42 import urllib
43 import urllib2
44 import warnings
45 import zlib
46
47 if os.name == 'nt':
48         import ctypes
49
50 try:
51         import email.utils
52 except ImportError: # Python 2.4
53         import email.Utils
54 try:
55         import cStringIO as StringIO
56 except ImportError:
57         import StringIO
58
59 # parse_qs was moved from the cgi module to the urlparse module recently.
60 try:
61         from urlparse import parse_qs
62 except ImportError:
63         from cgi import parse_qs
64
65 try:
66         import lxml.etree
67 except ImportError:
68         pass # Handled below
69
70 try:
71         import xml.etree.ElementTree
72 except ImportError: # Python<2.5: Not officially supported, but let it slip
73         warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
74
# Default HTTP headers sent with every request. They mimic a mainstream
# browser; YoutubeDLHandler.http_request installs them on each request,
# overriding any header of the same name.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	'Accept-Encoding': 'gzip, deflate',
	'Accept-Language': 'en-us,en;q=0.5',
}
82
try:
	import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
	import re
	class json(object):
		# Minimal stand-in for the stdlib json module; only loads() is
		# implemented, which is all this program needs.
		@staticmethod
		def loads(s):
			"""Decode a JSON document given as a UTF-8 byte string."""
			s = s.decode('UTF-8')
			def raiseError(msg, i):
				# Report a parse error with positional context.
				raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
			def skipSpace(i, expectMore=True):
				# Advance i past whitespace; optionally fail on end of input.
				while i < len(s) and s[i] in ' \t\r\n':
					i += 1
				if expectMore:
					if i >= len(s):
						raiseError('Premature end', i)
				return i
			def decodeEscape(match):
				# Translate one backslash escape (including \uXXXX and
				# UTF-16 surrogate pairs) into the character it denotes.
				esc = match.group(1)
				_STATIC = {
					'"': '"',
					'\\': '\\',
					'/': '/',
					'b': unichr(0x8),
					'f': unichr(0xc),
					'n': '\n',
					'r': '\r',
					't': '\t',
				}
				if esc in _STATIC:
					return _STATIC[esc]
				if esc[0] == 'u':
					if len(esc) == 1+4:
						return unichr(int(esc[1:5], 16))
					if len(esc) == 5+6 and esc[5:7] == '\\u':
						# Surrogate pair: combine high and low halves.
						hi = int(esc[1:5], 16)
						low = int(esc[7:11], 16)
						return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
				raise ValueError('Unknown escape ' + str(esc))
			def parseString(i):
				# Parse the string literal whose opening quote is at s[i];
				# returns (index_after_closing_quote, decoded_text).
				i += 1
				e = i
				while True:
					e = s.index('"', e)
					bslashes = 0
					# Count backslashes immediately before the quote; an
					# odd count means the quote itself is escaped.
					while s[e-bslashes-1] == '\\':
						bslashes += 1
					if bslashes % 2 == 1:
						e += 1
						continue
					break
				rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
				stri = rexp.sub(decodeEscape, s[i:e])
				return (e+1,stri)
			def parseObj(i):
				# Parse the object whose '{' is at s[i].
				i += 1
				res = {}
				i = skipSpace(i)
				if s[i] == '}': # Empty dictionary
					return (i+1,res)
				while True:
					if s[i] != '"':
						raiseError('Expected a string object key', i)
					i,key = parseString(i)
					i = skipSpace(i)
					if i >= len(s) or s[i] != ':':
						raiseError('Expected a colon', i)
					i,val = parse(i+1)
					res[key] = val
					i = skipSpace(i)
					if s[i] == '}':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected comma or closing curly brace', i)
					i = skipSpace(i+1)
			def parseArray(i):
				# Parse the array whose '[' is at s[i].
				res = []
				i = skipSpace(i+1)
				if s[i] == ']': # Empty array
					return (i+1,res)
				while True:
					i,val = parse(i)
					res.append(val)
					i = skipSpace(i) # Raise exception if premature end
					if s[i] == ']':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected a comma or closing bracket', i)
					i = skipSpace(i+1)
			def parseDiscrete(i):
				# Parse the bare literals true / false / null.
				for k,v in {'true': True, 'false': False, 'null': None}.items():
					if s.startswith(k, i):
						return (i+len(k), v)
				raiseError('Not a boolean (or null)', i)
			def parseNumber(i):
				# Parse a JSON number; floats are recognized by '.' or an
				# exponent marker, everything else becomes an int.
				mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
				if mobj is None:
					raiseError('Not a number', i)
				nums = mobj.group(1)
				if '.' in nums or 'e' in nums or 'E' in nums:
					return (i+len(nums), float(nums))
				return (i+len(nums), int(nums))
			CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
			def parse(i):
				# Dispatch on the first significant character; numbers are
				# the fallback when no other parser matches.
				i = skipSpace(i)
				i,res = CHARMAP.get(s[i], parseNumber)(i)
				i = skipSpace(i, False)
				return (i,res)
			i,res = parse(0)
			if i < len(s):
				raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
			return res
195
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	try:
		pref = locale.getpreferredencoding()
		# Verify the reported name actually works; some platforms
		# return bogus or unsupported encoding names.
		u'TEST'.encode(pref)
	except Exception:
		# Fall back to UTF-8 instead of crashing on a broken locale.
		# (The previous code used a bare except, which also swallowed
		# KeyboardInterrupt, and a needless one-shot generator.)
		pref = 'UTF-8'
	return pref
211
212
213 def htmlentity_transform(matchobj):
214         """Transforms an HTML entity to a Unicode character.
215
216         This function receives a match object and is intended to be used with
217         the re.sub() function.
218         """
219         entity = matchobj.group(1)
220
221         # Known non-numeric HTML entity
222         if entity in htmlentitydefs.name2codepoint:
223                 return unichr(htmlentitydefs.name2codepoint[entity])
224
225         # Unicode character
226         mobj = re.match(ur'(?u)#(x?\d+)', entity)
227         if mobj is not None:
228                 numstr = mobj.group(1)
229                 if numstr.startswith(u'x'):
230                         base = 16
231                         numstr = u'0%s' % numstr
232                 else:
233                         base = 10
234                 return unichr(long(numstr, base))
235
236         # Unknown entity in name, return its literal representation
237         return (u'&%s;' % entity)
238
239
240 def sanitize_title(utitle):
241         """Sanitizes a video title so it could be used as part of a filename."""
242         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
243         return utitle.replace(unicode(os.sep), u'%')
244
245
def sanitize_open(filename, open_mode):
	"""Try to open the given filename, and slightly tweak it if this fails.

	Attempts to open the given filename. If this fails, it tries to change
	the filename slightly, step by step, until it's either able to open it
	or it fails and raises a final exception, like the standard open()
	function.

	It returns the tuple (stream, definitive_file_name).
	"""
	try:
		if filename == u'-':
			if sys.platform == 'win32':
				import msvcrt
				# Put stdout into binary mode so video bytes are not
				# mangled by newline translation on Windows.
				msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
			return (sys.stdout, filename)
		stream = open(filename, open_mode)
		return (stream, filename)
	except (IOError, OSError), err:
		# In case of error, try to remove win32 forbidden chars
		filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

		# An exception here should be caught in the caller
		stream = open(filename, open_mode)
		return (stream, filename)
271
272
def timeconvert(timestr):
	"""Convert RFC 2822 defined time string into system timestamp"""
	# parsedate_tz returns None for strings it cannot interpret.
	parsed = email.utils.parsedate_tz(timestr)
	if parsed is None:
		return None
	return email.utils.mktime_tz(parsed)
280
281 def _simplify_title(title):
282         expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
283         return expr.sub(u'_', title).strip(u'_')
284
class DownloadError(Exception):
	"""Raised by FileDownloader objects when a download problem occurs.

	Only thrown when the downloader is not configured to continue on
	errors; carries the appropriate error message.
	"""
293
294
class SameFileError(Exception):
	"""Raised when distinct downloads map to one output file.

	FileDownloader objects throw this when they detect that multiple
	files would have to be written to the same file on disk.
	"""
302
303
class PostProcessingError(Exception):
	"""Raised by a PostProcessor's .run() method.

	Signals that an error occurred during the postprocessing task.
	"""
311
312
class UnavailableVideoError(Exception):
	"""Raised when a video is requested in a format it is not available in."""
320
321
class ContentTooShortError(Exception):
	"""Raised when a downloaded file is smaller than the server announced.

	This usually means the connection was interrupted mid-transfer.
	"""
	# Byte counts; overwritten per instance in __init__.
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		# Record both sizes so callers can report the shortfall.
		self.downloaded = downloaded
		self.expected = expected
336
337
class YoutubeDLHandler(urllib2.HTTPHandler):
	"""Handler for HTTP requests and responses.

	This class, when installed with an OpenerDirector, automatically adds
	the standard headers to every HTTP request and handles gzipped and
	deflated responses from web servers. If compression is to be avoided in
	a particular request, the original request in the program code only has
	to include the HTTP header "Youtubedl-No-Compression", which will be
	removed before making the real request.

	Part of this code was copied from:

	http://techknack.net/python-urllib2-handlers/

	Andrew Rowls, the author of that code, agreed to release it to the
	public domain.
	"""

	@staticmethod
	def deflate(data):
		# Servers send either a raw deflate stream or a zlib-wrapped
		# one; try the raw form first, then fall back to zlib-wrapped.
		try:
			return zlib.decompress(data, -zlib.MAX_WBITS)
		except zlib.error:
			return zlib.decompress(data)

	@staticmethod
	def addinfourl_wrapper(stream, headers, url, code):
		# Build an addinfourl that carries the HTTP status code even on
		# older Pythons whose addinfourl() lacks the 'code' argument.
		if hasattr(urllib2.addinfourl, 'getcode'):
			return urllib2.addinfourl(stream, headers, url, code)
		ret = urllib2.addinfourl(stream, headers, url)
		ret.code = code
		return ret

	def http_request(self, req):
		# Force the std_headers values, replacing any header of the
		# same name that was already set on the request.
		for h in std_headers:
			if h in req.headers:
				del req.headers[h]
			req.add_header(h, std_headers[h])
		# Internal marker header: disable compression for this request
		# and strip the marker before it goes on the wire.
		if 'Youtubedl-no-compression' in req.headers:
			if 'Accept-encoding' in req.headers:
				del req.headers['Accept-encoding']
			del req.headers['Youtubedl-no-compression']
		return req

	def http_response(self, req, resp):
		old_resp = resp
		# gzip: wrap the compressed body in a decompressing file object.
		if resp.headers.get('Content-encoding', '') == 'gzip':
			gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		# deflate: decompress eagerly and serve from memory.
		if resp.headers.get('Content-encoding', '') == 'deflate':
			gz = StringIO.StringIO(self.deflate(resp.read()))
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		return resp
395
396
397 class FileDownloader(object):
398         """File Downloader class.
399
400         File downloader objects are the ones responsible of downloading the
401         actual video file and writing it to disk if the user has requested
402         it, among some other tasks. In most cases there should be one per
403         program. As, given a video URL, the downloader doesn't know how to
404         extract all the needed information, task that InfoExtractors do, it
405         has to pass the URL to one of them.
406
407         For this, file downloader objects have a method that allows
408         InfoExtractors to be registered in a given order. When it is passed
409         a URL, the file downloader handles it to the first InfoExtractor it
410         finds that reports being able to handle it. The InfoExtractor extracts
411         all the information about the video or videos the URL refers to, and
412         asks the FileDownloader to process the video information, possibly
413         downloading the video.
414
415         File downloaders accept a lot of parameters. In order not to saturate
416         the object constructor with arguments, it receives a dictionary of
417         options instead. These options are available through the params
418         attribute for the InfoExtractors to use. The FileDownloader also
419         registers itself as the downloader in charge for the InfoExtractors
420         that are added to it, so this is a "mutual registration".
421
422         Available options:
423
424         username:         Username for authentication purposes.
425         password:         Password for authentication purposes.
426         usenetrc:         Use netrc for authentication instead.
427         quiet:            Do not print messages to stdout.
428         forceurl:         Force printing final URL.
429         forcetitle:       Force printing title.
430         forcethumbnail:   Force printing thumbnail URL.
431         forcedescription: Force printing description.
432         forcefilename:    Force printing final filename.
433         simulate:         Do not download the video files.
434         format:           Video format code.
435         format_limit:     Highest quality format to try.
436         outtmpl:          Template for output names.
437         ignoreerrors:     Do not stop on download errors.
438         ratelimit:        Download speed limit, in bytes/sec.
439         nooverwrites:     Prevent overwriting files.
440         retries:          Number of times to retry for HTTP error 5xx
441         continuedl:       Try to continue downloads if possible.
442         noprogress:       Do not print the progress bar.
443         playliststart:    Playlist item to start at.
444         playlistend:      Playlist item to end at.
445         matchtitle:       Download only matching titles.
446         rejecttitle:      Reject downloads for matching titles.
447         logtostderr:      Log messages to stderr instead of stdout.
448         consoletitle:     Display progress in console window's titlebar.
449         nopart:           Do not use temporary .part files.
450         updatetime:       Use the Last-modified header to set output file timestamps.
451         writedescription: Write the video description to a .description file
452         writeinfojson:    Write the video description to a .info.json file
453         """
454
	# Class-level declarations documenting the attribute set; every one
	# of these is overwritten per instance in __init__.
	params = None			# dictionary of downloader options (see class docstring)
	_ies = []			# registered InfoExtractor objects
	_pps = []			# registered PostProcessor objects
	_download_retcode = None	# set to 1 by trouble() when an error was ignored
	_num_downloads = None		# ordinal count of files downloaded so far
	_screen_file = None		# stream for screen output (stdout or stderr)
461
462         def __init__(self, params):
463                 """Create a FileDownloader object with the given options."""
464                 self._ies = []
465                 self._pps = []
466                 self._download_retcode = 0
467                 self._num_downloads = 0
468                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
469                 self.params = params
470
471         @staticmethod
472         def format_bytes(bytes):
473                 if bytes is None:
474                         return 'N/A'
475                 if type(bytes) is str:
476                         bytes = float(bytes)
477                 if bytes == 0.0:
478                         exponent = 0
479                 else:
480                         exponent = long(math.log(bytes, 1024.0))
481                 suffix = 'bkMGTPEZY'[exponent]
482                 converted = float(bytes) / float(1024 ** exponent)
483                 return '%.2f%s' % (converted, suffix)
484
485         @staticmethod
486         def calc_percent(byte_counter, data_len):
487                 if data_len is None:
488                         return '---.-%'
489                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
490
491         @staticmethod
492         def calc_eta(start, now, total, current):
493                 if total is None:
494                         return '--:--'
495                 dif = now - start
496                 if current == 0 or dif < 0.001: # One millisecond
497                         return '--:--'
498                 rate = float(current) / dif
499                 eta = long((float(total) - float(current)) / rate)
500                 (eta_mins, eta_secs) = divmod(eta, 60)
501                 if eta_mins > 99:
502                         return '--:--'
503                 return '%02d:%02d' % (eta_mins, eta_secs)
504
505         @staticmethod
506         def calc_speed(start, now, bytes):
507                 dif = now - start
508                 if bytes == 0 or dif < 0.001: # One millisecond
509                         return '%10s' % '---b/s'
510                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
511
512         @staticmethod
513         def best_block_size(elapsed_time, bytes):
514                 new_min = max(bytes / 2.0, 1.0)
515                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
516                 if elapsed_time < 0.001:
517                         return long(new_max)
518                 rate = bytes / elapsed_time
519                 if rate > new_max:
520                         return long(new_max)
521                 if rate < new_min:
522                         return long(new_min)
523                 return long(rate)
524
525         @staticmethod
526         def parse_bytes(bytestr):
527                 """Parse a string indicating a byte quantity into a long integer."""
528                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
529                 if matchobj is None:
530                         return None
531                 number = float(matchobj.group(1))
532                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
533                 return long(round(number * multiplier))
534
	def add_info_extractor(self, ie):
		"""Add an InfoExtractor object to the end of the list."""
		self._ies.append(ie)
		# Mutual registration: the IE needs a reference back to us.
		ie.set_downloader(self)
539
	def add_post_processor(self, pp):
		"""Add a PostProcessor object to the end of the chain."""
		self._pps.append(pp)
		# Mutual registration: the PP needs a reference back to us.
		pp.set_downloader(self)
544
	def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode."""
		try:
			if not self.params.get('quiet', False):
				# The trailing comma suppresses print's own newline; the
				# terminator is appended explicitly so skip_eol can keep
				# the cursor on the same line (progress display).
				terminator = [u'\n', u''][skip_eol]
				print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
			self._screen_file.flush()
		except (UnicodeEncodeError), err:
			if not ignore_encoding_errors:
				raise
555
	def to_stderr(self, message):
		"""Print message to stderr."""
		# Encode explicitly: the message is unicode and stderr may not be.
		print >>sys.stderr, message.encode(preferredencoding())
559
	def to_cons_title(self, message):
		"""Set console/terminal window title to message."""
		if not self.params.get('consoletitle', False):
			return
		if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
			# c_wchar_p() might not be necessary if `message` is
			# already of type unicode()
			ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
		elif 'TERM' in os.environ:
			# xterm-compatible escape sequence to set the window title.
			sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
570
	def fixed_template(self):
		"""Checks if the output template is fixed."""
		# Fixed means no '%(field)s' placeholders at all, so every
		# download would be written to the very same filename.
		return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
574
575         def trouble(self, message=None):
576                 """Determine action to take when a download problem appears.
577
578                 Depending on if the downloader has been configured to ignore
579                 download errors or not, this method may throw an exception or
580                 not when errors are found, after printing the message.
581                 """
582                 if message is not None:
583                         self.to_stderr(message)
584                 if not self.params.get('ignoreerrors', False):
585                         raise DownloadError(message)
586                 self._download_retcode = 1
587
588         def slow_down(self, start_time, byte_counter):
589                 """Sleep if the download speed is over the rate limit."""
590                 rate_limit = self.params.get('ratelimit', None)
591                 if rate_limit is None or byte_counter == 0:
592                         return
593                 now = time.time()
594                 elapsed = now - start_time
595                 if elapsed <= 0.0:
596                         return
597                 speed = float(byte_counter) / elapsed
598                 if speed > rate_limit:
599                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
600
601         def temp_name(self, filename):
602                 """Returns a temporary filename for the given filename."""
603                 if self.params.get('nopart', False) or filename == u'-' or \
604                                 (os.path.exists(filename) and not os.path.isfile(filename)):
605                         return filename
606                 return filename + u'.part'
607
608         def undo_temp_name(self, filename):
609                 if filename.endswith(u'.part'):
610                         return filename[:-len(u'.part')]
611                 return filename
612
	def try_rename(self, old_filename, new_filename):
		# Rename the temporary .part file to its final name; a failure is
		# reported through trouble() rather than raised directly.
		try:
			if old_filename == new_filename:
				return
			os.rename(old_filename, new_filename)
		except (IOError, OSError), err:
			self.trouble(u'ERROR: unable to rename file')
620
621         def try_utime(self, filename, last_modified_hdr):
622                 """Try to set the last-modified time of the given file."""
623                 if last_modified_hdr is None:
624                         return
625                 if not os.path.isfile(filename):
626                         return
627                 timestr = last_modified_hdr
628                 if timestr is None:
629                         return
630                 filetime = timeconvert(timestr)
631                 if filetime is None:
632                         return filetime
633                 try:
634                         os.utime(filename, (time.time(), filetime))
635                 except:
636                         pass
637                 return filetime
638
	def report_writedescription(self, descfn):
		""" Report that the description file is being written """
		# Encoding errors are ignored: this is a purely informative line.
		self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
642
	def report_writeinfojson(self, infofn):
		""" Report that the metadata file has been written """
		# Encoding errors are ignored: this is a purely informative line.
		self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
646
	def report_destination(self, filename):
		"""Report destination filename."""
		# Encoding errors are ignored: the filename may not be encodable
		# in the console charset, and this line is informative only.
		self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
650
	def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
		"""Report download progress."""
		if self.params.get('noprogress', False):
			return
		# Leading \r plus skip_eol redraws the same console line in place.
		self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
				(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
		# Mirror the progress in the terminal/console title, if enabled.
		self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
				(percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
659
	def report_resuming_byte(self, resume_len):
		"""Report attempt to resume at given byte."""
		self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
663
	def report_retry(self, count, retries):
		"""Report retry in case of HTTP error 5xx"""
		self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
667
	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		try:
			self.to_screen(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			# Fall back to a filename-free message for unencodable names.
			self.to_screen(u'[download] The file has already been downloaded')
674
	def report_unable_to_resume(self):
		"""Report it was impossible to resume download."""
		self.to_screen(u'[download] Unable to resume')
678
679         def report_finish(self):
680                 """Report download finished."""
681                 if self.params.get('noprogress', False):
682                         self.to_screen(u'[download] Download completed')
683                 else:
684                         self.to_screen(u'')
685
686         def increment_downloads(self):
687                 """Increment the ordinal that assigns a number to each file."""
688                 self._num_downloads += 1
689
690         def prepare_filename(self, info_dict):
691                 """Generate the output filename."""
692                 try:
693                         template_dict = dict(info_dict)
694                         template_dict['epoch'] = unicode(long(time.time()))
695                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
696                         filename = self.params['outtmpl'] % template_dict
697                         return filename
698                 except (ValueError, KeyError), err:
699                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
700                         return None
701
702         def process_info(self, info_dict):
703                 """Process a single dictionary returned by an InfoExtractor."""
704
705                 max_downloads = int(self.params.get('max_downloads'))
706                 if max_downloads is not None:
707                         if self._num_downloads > max_downloads:
708                                 self.to_screen(u'[download] Maximum number of downloads reached. Skipping ' + info_dict['title'])
709                                 return
710                 
711                 filename = self.prepare_filename(info_dict)
712                 
713                 # Forced printings
714                 if self.params.get('forcetitle', False):
715                         print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
716                 if self.params.get('forceurl', False):
717                         print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
718                 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
719                         print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
720                 if self.params.get('forcedescription', False) and 'description' in info_dict:
721                         print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
722                 if self.params.get('forcefilename', False) and filename is not None:
723                         print filename.encode(preferredencoding(), 'xmlcharrefreplace')
724                 if self.params.get('forceformat', False):
725                         print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
726
727                 # Do nothing else if in simulate mode
728                 if self.params.get('simulate', False):
729                         return
730
731                 if filename is None:
732                         return
733
734                 matchtitle=self.params.get('matchtitle',False)
735                 rejecttitle=self.params.get('rejecttitle',False)
736                 title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
737                 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
738                         self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
739                         return
740                 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
741                         self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
742                         return
743                         
744                 if self.params.get('nooverwrites', False) and os.path.exists(filename):
745                         self.to_stderr(u'WARNING: file exists and will be skipped')
746                         return
747
748                 try:
749                         dn = os.path.dirname(filename)
750                         if dn != '' and not os.path.exists(dn):
751                                 os.makedirs(dn)
752                 except (OSError, IOError), err:
753                         self.trouble(u'ERROR: unable to create directory ' + unicode(err))
754                         return
755
756                 if self.params.get('writedescription', False):
757                         try:
758                                 descfn = filename + '.description'
759                                 self.report_writedescription(descfn)
760                                 descfile = open(descfn, 'wb')
761                                 try:
762                                         descfile.write(info_dict['description'].encode('utf-8'))
763                                 finally:
764                                         descfile.close()
765                         except (OSError, IOError):
766                                 self.trouble(u'ERROR: Cannot write description file ' + descfn)
767                                 return
768
769                 if self.params.get('writeinfojson', False):
770                         infofn = filename + '.info.json'
771                         self.report_writeinfojson(infofn)
772                         try:
773                                 json.dump
774                         except (NameError,AttributeError):
775                                 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
776                                 return
777                         try:
778                                 infof = open(infofn, 'wb')
779                                 try:
780                                         json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
781                                         json.dump(json_info_dict, infof)
782                                 finally:
783                                         infof.close()
784                         except (OSError, IOError):
785                                 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
786                                 return
787
788                 if not self.params.get('skip_download', False):
789                         try:
790                                 success = self._do_download(filename, info_dict)
791                         except (OSError, IOError), err:
792                                 raise UnavailableVideoError
793                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
794                                 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
795                                 return
796                         except (ContentTooShortError, ), err:
797                                 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
798                                 return
799         
800                         if success:
801                                 try:
802                                         self.post_process(filename, info_dict)
803                                 except (PostProcessingError), err:
804                                         self.trouble(u'ERROR: postprocessing: %s' % str(err))
805                                         return
806
807         def download(self, url_list):
808                 """Download a given list of URLs."""
809                 if len(url_list) > 1 and self.fixed_template():
810                         raise SameFileError(self.params['outtmpl'])
811
812                 for url in url_list:
813                         suitable_found = False
814                         for ie in self._ies:
815                                 # Go to next InfoExtractor if not suitable
816                                 if not ie.suitable(url):
817                                         continue
818
819                                 # Suitable InfoExtractor found
820                                 suitable_found = True
821
822                                 # Extract information from URL and process it
823                                 ie.extract(url)
824
825                                 # Suitable InfoExtractor had been found; go to next URL
826                                 break
827
828                         if not suitable_found:
829                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
830
831                 return self._download_retcode
832
833         def post_process(self, filename, ie_info):
834                 """Run the postprocessing chain on the given file."""
835                 info = dict(ie_info)
836                 info['filepath'] = filename
837                 for pp in self._pps:
838                         info = pp.run(info)
839                         if info is None:
840                                 break
841
842         def _download_with_rtmpdump(self, filename, url, player_url):
843                 self.report_destination(filename)
844                 tmpfilename = self.temp_name(filename)
845
846                 # Check for rtmpdump first
847                 try:
848                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
849                 except (OSError, IOError):
850                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
851                         return False
852
853                 # Download using rtmpdump. rtmpdump returns exit code 2 when
854                 # the connection was interrumpted and resuming appears to be
855                 # possible. This is part of rtmpdump's normal usage, AFAIK.
856                 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
857                 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
858                 while retval == 2 or retval == 1:
859                         prevsize = os.path.getsize(tmpfilename)
860                         self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
861                         time.sleep(5.0) # This seems to be needed
862                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
863                         cursize = os.path.getsize(tmpfilename)
864                         if prevsize == cursize and retval == 1:
865                                 break
866                          # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
867                         if prevsize == cursize and retval == 2 and cursize > 1024:
868                                 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
869                                 retval = 0
870                                 break
871                 if retval == 0:
872                         self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
873                         self.try_rename(tmpfilename, filename)
874                         return True
875                 else:
876                         self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
877                         return False
878
	def _do_download(self, filename, info_dict):
		"""Download info_dict['url'] to filename; return True on success.

		RTMP URLs are delegated to rtmpdump; everything else is fetched
		over HTTP with support for resuming, retries, adaptive block
		size and rate limiting.
		"""
		url = info_dict['url']
		player_url = info_dict.get('player_url', None)

		# Check file already present
		if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
			self.report_file_already_downloaded(filename)
			return True

		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url, player_url)

		tmpfilename = self.temp_name(filename)
		stream = None

		# Do not include the Accept-Encoding header
		headers = {'Youtubedl-no-compression': 'True'}
		# basic_request stays without the Range header so the full length
		# can be re-probed if a resume attempt fails with HTTP 416.
		basic_request = urllib2.Request(url, None, headers)
		request = urllib2.Request(url, None, headers)

		# Establish possible resume length
		if os.path.isfile(tmpfilename):
			resume_len = os.path.getsize(tmpfilename)
		else:
			resume_len = 0

		open_mode = 'wb'
		if resume_len != 0:
			if self.params.get('continuedl', False):
				self.report_resuming_byte(resume_len)
				request.add_header('Range','bytes=%d-' % resume_len)
				open_mode = 'ab'
			else:
				# Not continuing: ignore the partial file and start over.
				resume_len = 0

		count = 0
		retries = self.params.get('retries', 0)
		while count <= retries:
			# Establish connection
			try:
				# NOTE(review): this assignment is immediately overwritten
				# by the urlopen() below, so a pre-opened 'urlhandle' is
				# never actually reused — looks like a missing 'else';
				# confirm the intent before changing it.
				if count == 0 and 'urlhandle' in info_dict:
					data = info_dict['urlhandle']
				data = urllib2.urlopen(request)
				break
			except (urllib2.HTTPError, ), err:
				if (err.code < 500 or err.code >= 600) and err.code != 416:
					# Unexpected HTTP error
					raise
				elif err.code == 416:
					# Unable to resume (requested range not satisfiable)
					try:
						# Open the connection again without the range header
						data = urllib2.urlopen(basic_request)
						content_length = data.info()['Content-Length']
					except (urllib2.HTTPError, ), err:
						if err.code < 500 or err.code >= 600:
							raise
					else:
						# Examine the reported length
						if (content_length is not None and
								(resume_len - 100 < long(content_length) < resume_len + 100)):
							# The file had already been fully downloaded.
							# Explanation to the above condition: in issue #175 it was revealed that
							# YouTube sometimes adds or removes a few bytes from the end of the file,
							# changing the file size slightly and causing problems for some users. So
							# I decided to implement a suggested change and consider the file
							# completely downloaded if the file size differs less than 100 bytes from
							# the one in the hard drive.
							self.report_file_already_downloaded(filename)
							self.try_rename(tmpfilename, filename)
							return True
						else:
							# The length does not match, we start the download over
							self.report_unable_to_resume()
							open_mode = 'wb'
							break
			# Retry
			count += 1
			if count <= retries:
				self.report_retry(count, retries)

		if count > retries:
			self.trouble(u'ERROR: giving up after %s retries' % retries)
			return False

		data_len = data.info().get('Content-length', None)
		if data_len is not None:
			# The server reports the remaining length; add the bytes we
			# already have to get the full expected size.
			data_len = long(data_len) + resume_len
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0 + resume_len
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			if len(data_block) == 0:
				break
			byte_counter += len(data_block)

			# Open file just in time
			if stream is None:
				try:
					(stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
					assert stream is not None
					filename = self.undo_temp_name(tmpfilename)
					self.report_destination(filename)
				except (OSError, IOError), err:
					self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
					return False
			try:
				stream.write(data_block)
			except (IOError, OSError), err:
				self.trouble(u'\nERROR: unable to write data: %s' % str(err))
				return False
			# Adapt the block size to the measured throughput.
			block_size = self.best_block_size(after - before, len(data_block))

			# Progress message
			speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
			if data_len is None:
				self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
			else:
				percent_str = self.calc_percent(byte_counter, data_len)
				eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
				self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter - resume_len)

		if stream is None:
			self.trouble(u'\nERROR: Did not get any data blocks')
			return False
		stream.close()
		self.report_finish()
		if data_len is not None and byte_counter != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		self.try_rename(tmpfilename, filename)

		# Update file modification time
		if self.params.get('updatetime', True):
			info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))

		return True
1024
1025
class InfoExtractor(object):
	"""Base class for all information extractors.

	An information extractor takes a URL and produces a dictionary with
	the data needed to download the video (or videos) it refers to: the
	real video URL, the title and simplified title, the author, and so
	on. The dictionary is handed to the FileDownloader, which processes
	it, possibly downloading the video to the file system among other
	outcomes. Required keys:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	Optional keys, mainly useful when youtube-dl serves as the backend
	of a video search function such as the one in youtube2mp3; they are
	only consulted by the forced printing options:

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Subclasses should override _real_initialize() and _real_extract()
	and define a _VALID_URL regexp, and should normally be added to the
	list of extractors.
	"""

	_ready = False
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Return True if this extractor can handle the given URL."""
		return re.match(self._VALID_URL, url) is not None

	def initialize(self):
		"""Perform one-time initialization (authentication, etc)."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Initialize if needed, then extract information from the URL."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Attach the FileDownloader this extractor reports to."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
1094
1095
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Matches watch pages, youtu.be short links, embed URLs and bare
	# video ids; group 2 captures the video id (see _real_extract).
	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	# Machine name looked up in ~/.netrc for stored credentials.
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
	# Maps format code -> container/file extension.
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'42': 'webm',
		'43': 'webm',
		'44': 'webm',
		'45': 'webm',
	}
	# Maps format code -> dimensions shown by _print_formats
	# (apparently height x width — TODO confirm ordering).
	_video_dimensions = {
		'5': '240x400',
		'6': '???',
		'13': '???',
		'17': '144x176',
		'18': '360x640',
		'22': '720x1280',
		'34': '360x640',
		'35': '480x854',
		'37': '1080x1920',
		'38': '3072x4096',
		'43': '360x640',
		'44': '480x854',
		'45': '720x1280',
	}
	IE_NAME = u'youtube'
1133
1134         def report_lang(self):
1135                 """Report attempt to set language."""
1136                 self._downloader.to_screen(u'[youtube] Setting language')
1137
1138         def report_login(self):
1139                 """Report attempt to log in."""
1140                 self._downloader.to_screen(u'[youtube] Logging in')
1141
1142         def report_age_confirmation(self):
1143                 """Report attempt to confirm age."""
1144                 self._downloader.to_screen(u'[youtube] Confirming age')
1145
1146         def report_video_webpage_download(self, video_id):
1147                 """Report attempt to download video webpage."""
1148                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1149
1150         def report_video_info_webpage_download(self, video_id):
1151                 """Report attempt to download video info webpage."""
1152                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1153
1154         def report_information_extraction(self, video_id):
1155                 """Report attempt to extract video information."""
1156                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1157
1158         def report_unavailable_format(self, video_id, format):
1159                 """Report extracted video URL."""
1160                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1161
1162         def report_rtmp_download(self):
1163                 """Indicate the download will use the RTMP protocol."""
1164                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1165
1166         def _print_formats(self, formats):
1167                 print 'Available formats:'
1168                 for x in formats:
1169                         print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1170
	def _real_initialize(self):
		"""Set the interface language, then optionally log in and confirm age.

		Credentials come from the downloader params or, with --netrc,
		from the ~/.netrc entry for the 'youtube' machine. All failures
		are reported as warnings (or trouble, for age confirmation) and
		abort the remaining steps.
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':         '/',
				'action_login': 'Log In',
				'username':     username,
				'password':     password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the login form is still present, the credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':             '/',
				'action_confirm':       'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return
1239
1240         def _real_extract(self, url):
1241                 # Extract video id from URL
1242                 mobj = re.match(self._VALID_URL, url)
1243                 if mobj is None:
1244                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1245                         return
1246                 video_id = mobj.group(2)
1247
1248                 # Get video webpage
1249                 self.report_video_webpage_download(video_id)
1250                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1251                 try:
1252                         video_webpage = urllib2.urlopen(request).read()
1253                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1254                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1255                         return
1256
1257                 # Attempt to extract SWF player URL
1258                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1259                 if mobj is not None:
1260                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1261                 else:
1262                         player_url = None
1263
1264                 # Get video info
1265                 self.report_video_info_webpage_download(video_id)
1266                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1267                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1268                                         % (video_id, el_type))
1269                         request = urllib2.Request(video_info_url)
1270                         try:
1271                                 video_info_webpage = urllib2.urlopen(request).read()
1272                                 video_info = parse_qs(video_info_webpage)
1273                                 if 'token' in video_info:
1274                                         break
1275                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1276                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1277                                 return
1278                 if 'token' not in video_info:
1279                         if 'reason' in video_info:
1280                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1281                         else:
1282                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1283                         return
1284
1285                 # Start extracting information
1286                 self.report_information_extraction(video_id)
1287
1288                 # uploader
1289                 if 'author' not in video_info:
1290                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1291                         return
1292                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1293
1294                 # title
1295                 if 'title' not in video_info:
1296                         self._downloader.trouble(u'ERROR: unable to extract video title')
1297                         return
1298                 video_title = urllib.unquote_plus(video_info['title'][0])
1299                 video_title = video_title.decode('utf-8')
1300                 video_title = sanitize_title(video_title)
1301
1302                 # simplified title
1303                 simple_title = _simplify_title(video_title)
1304
1305                 # thumbnail image
1306                 if 'thumbnail_url' not in video_info:
1307                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1308                         video_thumbnail = ''
1309                 else:   # don't panic if we can't find it
1310                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1311
1312                 # upload date
1313                 upload_date = u'NA'
1314                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1315                 if mobj is not None:
1316                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1317                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1318                         for expression in format_expressions:
1319                                 try:
1320                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1321                                 except:
1322                                         pass
1323
1324                 # description
1325                 try:
1326                         lxml.etree
1327                 except NameError:
1328                         video_description = u'No description available.'
1329                         if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1330                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1331                                 if mobj is not None:
1332                                         video_description = mobj.group(1).decode('utf-8')
1333                 else:
1334                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1335                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1336                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1337                         # TODO use another parser
1338
1339                 # token
1340                 video_token = urllib.unquote_plus(video_info['token'][0])
1341
1342                 # Decide which formats to download
1343                 req_format = self._downloader.params.get('format', None)
1344
1345                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1346                         self.report_rtmp_download()
1347                         video_url_list = [(None, video_info['conn'][0])]
1348                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1349                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1350                         url_data = [parse_qs(uds) for uds in url_data_strs]
1351                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1352                         url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1353
1354                         format_limit = self._downloader.params.get('format_limit', None)
1355                         if format_limit is not None and format_limit in self._available_formats:
1356                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1357                         else:
1358                                 format_list = self._available_formats
1359                         existing_formats = [x for x in format_list if x in url_map]
1360                         if len(existing_formats) == 0:
1361                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1362                                 return
1363                         if self._downloader.params.get('listformats', None):
1364                                 self._print_formats(existing_formats)
1365                                 return
1366                         if req_format is None or req_format == 'best':
1367                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1368                         elif req_format == 'worst':
1369                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1370                         elif req_format in ('-1', 'all'):
1371                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1372                         else:
1373                                 # Specific formats. We pick the first in a slash-delimeted sequence.
1374                                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1375                                 req_formats = req_format.split('/')
1376                                 video_url_list = None
1377                                 for rf in req_formats:
1378                                         if rf in url_map:
1379                                                 video_url_list = [(rf, url_map[rf])]
1380                                                 break
1381                                 if video_url_list is None:
1382                                         self._downloader.trouble(u'ERROR: requested format not available')
1383                                         return
1384                 else:
1385                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1386                         return
1387
1388                 for format_param, video_real_url in video_url_list:
1389                         # At this point we have a new video
1390                         self._downloader.increment_downloads()
1391
1392                         # Extension
1393                         video_extension = self._video_extensions.get(format_param, 'flv')
1394
1395                         try:
1396                                 # Process video information
1397                                 self._downloader.process_info({
1398                                         'id':           video_id.decode('utf-8'),
1399                                         'url':          video_real_url.decode('utf-8'),
1400                                         'uploader':     video_uploader.decode('utf-8'),
1401                                         'upload_date':  upload_date,
1402                                         'title':        video_title,
1403                                         'stitle':       simple_title,
1404                                         'ext':          video_extension.decode('utf-8'),
1405                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1406                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1407                                         'description':  video_description,
1408                                         'player_url':   player_url,
1409                                 })
1410                         except UnavailableVideoError, err:
1411                                 self._downloader.trouble(u'\nERROR: unable to download video')
1412
1413
1414 class MetacafeIE(InfoExtractor):
1415         """Information Extractor for metacafe.com."""
1416
1417         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1418         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1419         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1420         _youtube_ie = None
1421         IE_NAME = u'metacafe'
1422
1423         def __init__(self, youtube_ie, downloader=None):
1424                 InfoExtractor.__init__(self, downloader)
1425                 self._youtube_ie = youtube_ie
1426
1427         def report_disclaimer(self):
1428                 """Report disclaimer retrieval."""
1429                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1430
1431         def report_age_confirmation(self):
1432                 """Report attempt to confirm age."""
1433                 self._downloader.to_screen(u'[metacafe] Confirming age')
1434
1435         def report_download_webpage(self, video_id):
1436                 """Report webpage download."""
1437                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1438
1439         def report_extraction(self, video_id):
1440                 """Report information extraction."""
1441                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1442
1443         def _real_initialize(self):
1444                 # Retrieve disclaimer
1445                 request = urllib2.Request(self._DISCLAIMER)
1446                 try:
1447                         self.report_disclaimer()
1448                         disclaimer = urllib2.urlopen(request).read()
1449                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1450                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1451                         return
1452
1453                 # Confirm age
1454                 disclaimer_form = {
1455                         'filters': '0',
1456                         'submit': "Continue - I'm over 18",
1457                         }
1458                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1459                 try:
1460                         self.report_age_confirmation()
1461                         disclaimer = urllib2.urlopen(request).read()
1462                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1463                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1464                         return
1465
1466         def _real_extract(self, url):
1467                 # Extract id and simplified title from URL
1468                 mobj = re.match(self._VALID_URL, url)
1469                 if mobj is None:
1470                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1471                         return
1472
1473                 video_id = mobj.group(1)
1474
1475                 # Check if video comes from YouTube
1476                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1477                 if mobj2 is not None:
1478                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1479                         return
1480
1481                 # At this point we have a new video
1482                 self._downloader.increment_downloads()
1483
1484                 simple_title = mobj.group(2).decode('utf-8')
1485
1486                 # Retrieve video webpage to extract further information
1487                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1488                 try:
1489                         self.report_download_webpage(video_id)
1490                         webpage = urllib2.urlopen(request).read()
1491                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1492                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1493                         return
1494
1495                 # Extract URL, uploader and title from webpage
1496                 self.report_extraction(video_id)
1497                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1498                 if mobj is not None:
1499                         mediaURL = urllib.unquote(mobj.group(1))
1500                         video_extension = mediaURL[-3:]
1501
1502                         # Extract gdaKey if available
1503                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1504                         if mobj is None:
1505                                 video_url = mediaURL
1506                         else:
1507                                 gdaKey = mobj.group(1)
1508                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1509                 else:
1510                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1511                         if mobj is None:
1512                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1513                                 return
1514                         vardict = parse_qs(mobj.group(1))
1515                         if 'mediaData' not in vardict:
1516                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1517                                 return
1518                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1519                         if mobj is None:
1520                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1521                                 return
1522                         mediaURL = mobj.group(1).replace('\\/', '/')
1523                         video_extension = mediaURL[-3:]
1524                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1525
1526                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1527                 if mobj is None:
1528                         self._downloader.trouble(u'ERROR: unable to extract title')
1529                         return
1530                 video_title = mobj.group(1).decode('utf-8')
1531                 video_title = sanitize_title(video_title)
1532
1533                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1534                 if mobj is None:
1535                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1536                         return
1537                 video_uploader = mobj.group(1)
1538
1539                 try:
1540                         # Process video information
1541                         self._downloader.process_info({
1542                                 'id':           video_id.decode('utf-8'),
1543                                 'url':          video_url.decode('utf-8'),
1544                                 'uploader':     video_uploader.decode('utf-8'),
1545                                 'upload_date':  u'NA',
1546                                 'title':        video_title,
1547                                 'stitle':       simple_title,
1548                                 'ext':          video_extension.decode('utf-8'),
1549                                 'format':       u'NA',
1550                                 'player_url':   None,
1551                         })
1552                 except UnavailableVideoError:
1553                         self._downloader.trouble(u'\nERROR: unable to download video')
1554
1555
1556 class DailymotionIE(InfoExtractor):
1557         """Information Extractor for Dailymotion"""
1558
1559         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1560         IE_NAME = u'dailymotion'
1561
1562         def __init__(self, downloader=None):
1563                 InfoExtractor.__init__(self, downloader)
1564
1565         def report_download_webpage(self, video_id):
1566                 """Report webpage download."""
1567                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1568
1569         def report_extraction(self, video_id):
1570                 """Report information extraction."""
1571                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1572
1573         def _real_extract(self, url):
1574                 # Extract id and simplified title from URL
1575                 mobj = re.match(self._VALID_URL, url)
1576                 if mobj is None:
1577                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1578                         return
1579
1580                 # At this point we have a new video
1581                 self._downloader.increment_downloads()
1582                 video_id = mobj.group(1)
1583
1584                 simple_title = mobj.group(2).decode('utf-8')
1585                 video_extension = 'flv'
1586
1587                 # Retrieve video webpage to extract further information
1588                 request = urllib2.Request(url)
1589                 request.add_header('Cookie', 'family_filter=off')
1590                 try:
1591                         self.report_download_webpage(video_id)
1592                         webpage = urllib2.urlopen(request).read()
1593                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1594                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1595                         return
1596
1597                 # Extract URL, uploader and title from webpage
1598                 self.report_extraction(video_id)
1599                 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1600                 if mobj is None:
1601                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1602                         return
1603                 sequence = urllib.unquote(mobj.group(1))
1604                 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1605                 if mobj is None:
1606                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1607                         return
1608                 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1609
1610                 # if needed add http://www.dailymotion.com/ if relative URL
1611
1612                 video_url = mediaURL
1613
1614                 mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
1615                 if mobj is None:
1616                         self._downloader.trouble(u'ERROR: unable to extract title')
1617                         return
1618                 video_title = mobj.group(1).decode('utf-8')
1619                 video_title = sanitize_title(video_title)
1620
1621                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1622                 if mobj is None:
1623                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1624                         return
1625                 video_uploader = mobj.group(1)
1626
1627                 try:
1628                         # Process video information
1629                         self._downloader.process_info({
1630                                 'id':           video_id.decode('utf-8'),
1631                                 'url':          video_url.decode('utf-8'),
1632                                 'uploader':     video_uploader.decode('utf-8'),
1633                                 'upload_date':  u'NA',
1634                                 'title':        video_title,
1635                                 'stitle':       simple_title,
1636                                 'ext':          video_extension.decode('utf-8'),
1637                                 'format':       u'NA',
1638                                 'player_url':   None,
1639                         })
1640                 except UnavailableVideoError:
1641                         self._downloader.trouble(u'\nERROR: unable to download video')
1642
1643
1644 class GoogleIE(InfoExtractor):
1645         """Information extractor for video.google.com."""
1646
1647         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1648         IE_NAME = u'video.google'
1649
1650         def __init__(self, downloader=None):
1651                 InfoExtractor.__init__(self, downloader)
1652
1653         def report_download_webpage(self, video_id):
1654                 """Report webpage download."""
1655                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1656
1657         def report_extraction(self, video_id):
1658                 """Report information extraction."""
1659                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1660
1661         def _real_extract(self, url):
1662                 # Extract id from URL
1663                 mobj = re.match(self._VALID_URL, url)
1664                 if mobj is None:
1665                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1666                         return
1667
1668                 # At this point we have a new video
1669                 self._downloader.increment_downloads()
1670                 video_id = mobj.group(1)
1671
1672                 video_extension = 'mp4'
1673
1674                 # Retrieve video webpage to extract further information
1675                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1676                 try:
1677                         self.report_download_webpage(video_id)
1678                         webpage = urllib2.urlopen(request).read()
1679                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1680                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1681                         return
1682
1683                 # Extract URL, uploader, and title from webpage
1684                 self.report_extraction(video_id)
1685                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1686                 if mobj is None:
1687                         video_extension = 'flv'
1688                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1689                 if mobj is None:
1690                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1691                         return
1692                 mediaURL = urllib.unquote(mobj.group(1))
1693                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1694                 mediaURL = mediaURL.replace('\\x26', '\x26')
1695
1696                 video_url = mediaURL
1697
1698                 mobj = re.search(r'<title>(.*)</title>', webpage)
1699                 if mobj is None:
1700                         self._downloader.trouble(u'ERROR: unable to extract title')
1701                         return
1702                 video_title = mobj.group(1).decode('utf-8')
1703                 video_title = sanitize_title(video_title)
1704                 simple_title = _simplify_title(video_title)
1705
1706                 # Extract video description
1707                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1708                 if mobj is None:
1709                         self._downloader.trouble(u'ERROR: unable to extract video description')
1710                         return
1711                 video_description = mobj.group(1).decode('utf-8')
1712                 if not video_description:
1713                         video_description = 'No description available.'
1714
1715                 # Extract video thumbnail
1716                 if self._downloader.params.get('forcethumbnail', False):
1717                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1718                         try:
1719                                 webpage = urllib2.urlopen(request).read()
1720                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1721                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1722                                 return
1723                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1724                         if mobj is None:
1725                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1726                                 return
1727                         video_thumbnail = mobj.group(1)
1728                 else:   # we need something to pass to process_info
1729                         video_thumbnail = ''
1730
1731                 try:
1732                         # Process video information
1733                         self._downloader.process_info({
1734                                 'id':           video_id.decode('utf-8'),
1735                                 'url':          video_url.decode('utf-8'),
1736                                 'uploader':     u'NA',
1737                                 'upload_date':  u'NA',
1738                                 'title':        video_title,
1739                                 'stitle':       simple_title,
1740                                 'ext':          video_extension.decode('utf-8'),
1741                                 'format':       u'NA',
1742                                 'player_url':   None,
1743                         })
1744                 except UnavailableVideoError:
1745                         self._downloader.trouble(u'\nERROR: unable to download video')
1746
1747
1748 class PhotobucketIE(InfoExtractor):
1749         """Information extractor for photobucket.com."""
1750
1751         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1752         IE_NAME = u'photobucket'
1753
1754         def __init__(self, downloader=None):
1755                 InfoExtractor.__init__(self, downloader)
1756
1757         def report_download_webpage(self, video_id):
1758                 """Report webpage download."""
1759                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1760
1761         def report_extraction(self, video_id):
1762                 """Report information extraction."""
1763                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1764
1765         def _real_extract(self, url):
1766                 # Extract id from URL
1767                 mobj = re.match(self._VALID_URL, url)
1768                 if mobj is None:
1769                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1770                         return
1771
1772                 # At this point we have a new video
1773                 self._downloader.increment_downloads()
1774                 video_id = mobj.group(1)
1775
1776                 video_extension = 'flv'
1777
1778                 # Retrieve video webpage to extract further information
1779                 request = urllib2.Request(url)
1780                 try:
1781                         self.report_download_webpage(video_id)
1782                         webpage = urllib2.urlopen(request).read()
1783                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1784                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1785                         return
1786
1787                 # Extract URL, uploader, and title from webpage
1788                 self.report_extraction(video_id)
1789                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1790                 if mobj is None:
1791                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1792                         return
1793                 mediaURL = urllib.unquote(mobj.group(1))
1794
1795                 video_url = mediaURL
1796
1797                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1798                 if mobj is None:
1799                         self._downloader.trouble(u'ERROR: unable to extract title')
1800                         return
1801                 video_title = mobj.group(1).decode('utf-8')
1802                 video_title = sanitize_title(video_title)
1803                 simple_title = _simplify_title(vide_title)
1804
1805                 video_uploader = mobj.group(2).decode('utf-8')
1806
1807                 try:
1808                         # Process video information
1809                         self._downloader.process_info({
1810                                 'id':           video_id.decode('utf-8'),
1811                                 'url':          video_url.decode('utf-8'),
1812                                 'uploader':     video_uploader,
1813                                 'upload_date':  u'NA',
1814                                 'title':        video_title,
1815                                 'stitle':       simple_title,
1816                                 'ext':          video_extension.decode('utf-8'),
1817                                 'format':       u'NA',
1818                                 'player_url':   None,
1819                         })
1820                 except UnavailableVideoError:
1821                         self._downloader.trouble(u'\nERROR: unable to download video')
1822
1823
1824 class YahooIE(InfoExtractor):
1825         """Information extractor for video.yahoo.com."""
1826
1827         # _VALID_URL matches all Yahoo! Video URLs
1828         # _VPAGE_URL matches only the extractable '/watch/' URLs
1829         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1830         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1831         IE_NAME = u'video.yahoo'
1832
1833         def __init__(self, downloader=None):
1834                 InfoExtractor.__init__(self, downloader)
1835
1836         def report_download_webpage(self, video_id):
1837                 """Report webpage download."""
1838                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1839
1840         def report_extraction(self, video_id):
1841                 """Report information extraction."""
1842                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1843
1844         def _real_extract(self, url, new_video=True):
1845                 # Extract ID from URL
1846                 mobj = re.match(self._VALID_URL, url)
1847                 if mobj is None:
1848                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1849                         return
1850
1851                 # At this point we have a new video
1852                 self._downloader.increment_downloads()
1853                 video_id = mobj.group(2)
1854                 video_extension = 'flv'
1855
1856                 # Rewrite valid but non-extractable URLs as
1857                 # extractable English language /watch/ URLs
1858                 if re.match(self._VPAGE_URL, url) is None:
1859                         request = urllib2.Request(url)
1860                         try:
1861                                 webpage = urllib2.urlopen(request).read()
1862                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1863                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1864                                 return
1865
1866                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1867                         if mobj is None:
1868                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1869                                 return
1870                         yahoo_id = mobj.group(1)
1871
1872                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1873                         if mobj is None:
1874                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1875                                 return
1876                         yahoo_vid = mobj.group(1)
1877
1878                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1879                         return self._real_extract(url, new_video=False)
1880
1881                 # Retrieve video webpage to extract further information
1882                 request = urllib2.Request(url)
1883                 try:
1884                         self.report_download_webpage(video_id)
1885                         webpage = urllib2.urlopen(request).read()
1886                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1887                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1888                         return
1889
1890                 # Extract uploader and title from webpage
1891                 self.report_extraction(video_id)
1892                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1893                 if mobj is None:
1894                         self._downloader.trouble(u'ERROR: unable to extract video title')
1895                         return
1896                 video_title = mobj.group(1).decode('utf-8')
1897                 simple_title = _simplify_title(video_title)
1898
1899                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1900                 if mobj is None:
1901                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1902                         return
1903                 video_uploader = mobj.group(1).decode('utf-8')
1904
1905                 # Extract video thumbnail
1906                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1907                 if mobj is None:
1908                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1909                         return
1910                 video_thumbnail = mobj.group(1).decode('utf-8')
1911
1912                 # Extract video description
1913                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1914                 if mobj is None:
1915                         self._downloader.trouble(u'ERROR: unable to extract video description')
1916                         return
1917                 video_description = mobj.group(1).decode('utf-8')
1918                 if not video_description:
1919                         video_description = 'No description available.'
1920
1921                 # Extract video height and width
1922                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1923                 if mobj is None:
1924                         self._downloader.trouble(u'ERROR: unable to extract video height')
1925                         return
1926                 yv_video_height = mobj.group(1)
1927
1928                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1929                 if mobj is None:
1930                         self._downloader.trouble(u'ERROR: unable to extract video width')
1931                         return
1932                 yv_video_width = mobj.group(1)
1933
1934                 # Retrieve video playlist to extract media URL
1935                 # I'm not completely sure what all these options are, but we
1936                 # seem to need most of them, otherwise the server sends a 401.
1937                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1938                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1939                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1940                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1941                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1942                 try:
1943                         self.report_download_webpage(video_id)
1944                         webpage = urllib2.urlopen(request).read()
1945                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1946                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1947                         return
1948
1949                 # Extract media URL from playlist XML
1950                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1951                 if mobj is None:
1952                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1953                         return
1954                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1955                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1956
1957                 try:
1958                         # Process video information
1959                         self._downloader.process_info({
1960                                 'id':           video_id.decode('utf-8'),
1961                                 'url':          video_url,
1962                                 'uploader':     video_uploader,
1963                                 'upload_date':  u'NA',
1964                                 'title':        video_title,
1965                                 'stitle':       simple_title,
1966                                 'ext':          video_extension.decode('utf-8'),
1967                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1968                                 'description':  video_description,
1969                                 'thumbnail':    video_thumbnail,
1970                                 'player_url':   None,
1971                         })
1972                 except UnavailableVideoError:
1973                         self._downloader.trouble(u'\nERROR: unable to download video')
1974
1975
1976 class VimeoIE(InfoExtractor):
1977         """Information extractor for vimeo.com."""
1978
1979         # _VALID_URL matches Vimeo URLs
1980         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1981         IE_NAME = u'vimeo'
1982
1983         def __init__(self, downloader=None):
1984                 InfoExtractor.__init__(self, downloader)
1985
1986         def report_download_webpage(self, video_id):
1987                 """Report webpage download."""
1988                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1989
1990         def report_extraction(self, video_id):
1991                 """Report information extraction."""
1992                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1993
1994         def _real_extract(self, url, new_video=True):
1995                 # Extract ID from URL
1996                 mobj = re.match(self._VALID_URL, url)
1997                 if mobj is None:
1998                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1999                         return
2000
2001                 # At this point we have a new video
2002                 self._downloader.increment_downloads()
2003                 video_id = mobj.group(1)
2004
2005                 # Retrieve video webpage to extract further information
2006                 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
2007                 try:
2008                         self.report_download_webpage(video_id)
2009                         webpage = urllib2.urlopen(request).read()
2010                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2011                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2012                         return
2013
2014                 # Now we begin extracting as much information as we can from what we
2015                 # retrieved. First we extract the information common to all extractors,
2016                 # and latter we extract those that are Vimeo specific.
2017                 self.report_extraction(video_id)
2018
2019                 # Extract title
2020                 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2021                 if mobj is None:
2022                         self._downloader.trouble(u'ERROR: unable to extract video title')
2023                         return
2024                 video_title = mobj.group(1).decode('utf-8')
2025                 simple_title = _simplify_title(video_title)
2026
2027                 # Extract uploader
2028                 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2029                 if mobj is None:
2030                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
2031                         return
2032                 video_uploader = mobj.group(1).decode('utf-8')
2033
2034                 # Extract video thumbnail
2035                 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2036                 if mobj is None:
2037                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2038                         return
2039                 video_thumbnail = mobj.group(1).decode('utf-8')
2040
2041                 # # Extract video description
2042                 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2043                 # if mobj is None:
2044                 #       self._downloader.trouble(u'ERROR: unable to extract video description')
2045                 #       return
2046                 # video_description = mobj.group(1).decode('utf-8')
2047                 # if not video_description: video_description = 'No description available.'
2048                 video_description = 'Foo.'
2049
2050                 # Vimeo specific: extract request signature
2051                 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2052                 if mobj is None:
2053                         self._downloader.trouble(u'ERROR: unable to extract request signature')
2054                         return
2055                 sig = mobj.group(1).decode('utf-8')
2056
2057                 # Vimeo specific: extract video quality information
2058                 mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)
2059                 if mobj is None:
2060                         self._downloader.trouble(u'ERROR: unable to extract video quality information')
2061                         return
2062                 quality = mobj.group(1).decode('utf-8')
2063
2064                 if int(quality) == 1:
2065                         quality = 'hd'
2066                 else:
2067                         quality = 'sd'
2068
2069                 # Vimeo specific: Extract request signature expiration
2070                 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2071                 if mobj is None:
2072                         self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2073                         return
2074                 sig_exp = mobj.group(1).decode('utf-8')
2075
2076                 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)
2077
2078                 try:
2079                         # Process video information
2080                         self._downloader.process_info({
2081                                 'id':           video_id.decode('utf-8'),
2082                                 'url':          video_url,
2083                                 'uploader':     video_uploader,
2084                                 'upload_date':  u'NA',
2085                                 'title':        video_title,
2086                                 'stitle':       simple_title,
2087                                 'ext':          u'mp4',
2088                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2089                                 'description':  video_description,
2090                                 'thumbnail':    video_thumbnail,
2091                                 'description':  video_description,
2092                                 'player_url':   None,
2093                         })
2094                 except UnavailableVideoError:
2095                         self._downloader.trouble(u'ERROR: unable to download video')
2096
2097
2098 class GenericIE(InfoExtractor):
2099         """Generic last-resort information extractor."""
2100
2101         _VALID_URL = r'.*'
2102         IE_NAME = u'generic'
2103
2104         def __init__(self, downloader=None):
2105                 InfoExtractor.__init__(self, downloader)
2106
2107         def report_download_webpage(self, video_id):
2108                 """Report webpage download."""
2109                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2110                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2111
2112         def report_extraction(self, video_id):
2113                 """Report information extraction."""
2114                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2115
2116         def _real_extract(self, url):
2117                 # At this point we have a new video
2118                 self._downloader.increment_downloads()
2119
2120                 video_id = url.split('/')[-1]
2121                 request = urllib2.Request(url)
2122                 try:
2123                         self.report_download_webpage(video_id)
2124                         webpage = urllib2.urlopen(request).read()
2125                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2126                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2127                         return
2128                 except ValueError, err:
2129                         # since this is the last-resort InfoExtractor, if
2130                         # this error is thrown, it'll be thrown here
2131                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2132                         return
2133
2134                 self.report_extraction(video_id)
2135                 # Start with something easy: JW Player in SWFObject
2136                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2137                 if mobj is None:
2138                         # Broaden the search a little bit
2139                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2140                 if mobj is None:
2141                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2142                         return
2143
2144                 # It's possible that one of the regexes
2145                 # matched, but returned an empty group:
2146                 if mobj.group(1) is None:
2147                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2148                         return
2149
2150                 video_url = urllib.unquote(mobj.group(1))
2151                 video_id = os.path.basename(video_url)
2152
2153                 # here's a fun little line of code for you:
2154                 video_extension = os.path.splitext(video_id)[1][1:]
2155                 video_id = os.path.splitext(video_id)[0]
2156
2157                 # it's tempting to parse this further, but you would
2158                 # have to take into account all the variations like
2159                 #   Video Title - Site Name
2160                 #   Site Name | Video Title
2161                 #   Video Title - Tagline | Site Name
2162                 # and so on and so forth; it's just not practical
2163                 mobj = re.search(r'<title>(.*)</title>', webpage)
2164                 if mobj is None:
2165                         self._downloader.trouble(u'ERROR: unable to extract title')
2166                         return
2167                 video_title = mobj.group(1).decode('utf-8')
2168                 video_title = sanitize_title(video_title)
2169                 simple_title = _simplify_title(video_title)
2170
2171                 # video uploader is domain name
2172                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2173                 if mobj is None:
2174                         self._downloader.trouble(u'ERROR: unable to extract title')
2175                         return
2176                 video_uploader = mobj.group(1).decode('utf-8')
2177
2178                 try:
2179                         # Process video information
2180                         self._downloader.process_info({
2181                                 'id':           video_id.decode('utf-8'),
2182                                 'url':          video_url.decode('utf-8'),
2183                                 'uploader':     video_uploader,
2184                                 'upload_date':  u'NA',
2185                                 'title':        video_title,
2186                                 'stitle':       simple_title,
2187                                 'ext':          video_extension.decode('utf-8'),
2188                                 'format':       u'NA',
2189                                 'player_url':   None,
2190                         })
2191                 except UnavailableVideoError, err:
2192                         self._downloader.trouble(u'\nERROR: unable to download video')
2193
2194
2195 class YoutubeSearchIE(InfoExtractor):
2196         """Information Extractor for YouTube search queries."""
2197         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2198         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2199         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2200         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2201         _youtube_ie = None
2202         _max_youtube_results = 1000
2203         IE_NAME = u'youtube:search'
2204
2205         def __init__(self, youtube_ie, downloader=None):
2206                 InfoExtractor.__init__(self, downloader)
2207                 self._youtube_ie = youtube_ie
2208
2209         def report_download_page(self, query, pagenum):
2210                 """Report attempt to download playlist page with given number."""
2211                 query = query.decode(preferredencoding())
2212                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2213
2214         def _real_initialize(self):
2215                 self._youtube_ie.initialize()
2216
2217         def _real_extract(self, query):
2218                 mobj = re.match(self._VALID_URL, query)
2219                 if mobj is None:
2220                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2221                         return
2222
2223                 prefix, query = query.split(':')
2224                 prefix = prefix[8:]
2225                 query = query.encode('utf-8')
2226                 if prefix == '':
2227                         self._download_n_results(query, 1)
2228                         return
2229                 elif prefix == 'all':
2230                         self._download_n_results(query, self._max_youtube_results)
2231                         return
2232                 else:
2233                         try:
2234                                 n = long(prefix)
2235                                 if n <= 0:
2236                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2237                                         return
2238                                 elif n > self._max_youtube_results:
2239                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2240                                         n = self._max_youtube_results
2241                                 self._download_n_results(query, n)
2242                                 return
2243                         except ValueError: # parsing prefix as integer fails
2244                                 self._download_n_results(query, 1)
2245                                 return
2246
2247         def _download_n_results(self, query, n):
2248                 """Downloads a specified number of results for a query"""
2249
2250                 video_ids = []
2251                 already_seen = set()
2252                 pagenum = 1
2253
2254                 while True:
2255                         self.report_download_page(query, pagenum)
2256                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2257                         request = urllib2.Request(result_url)
2258                         try:
2259                                 page = urllib2.urlopen(request).read()
2260                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2261                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2262                                 return
2263
2264                         # Extract video identifiers
2265                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2266                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2267                                 if video_id not in already_seen:
2268                                         video_ids.append(video_id)
2269                                         already_seen.add(video_id)
2270                                         if len(video_ids) == n:
2271                                                 # Specified n videos reached
2272                                                 for id in video_ids:
2273                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2274                                                 return
2275
2276                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2277                                 for id in video_ids:
2278                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2279                                 return
2280
2281                         pagenum = pagenum + 1
2282
2283
2284 class GoogleSearchIE(InfoExtractor):
2285         """Information Extractor for Google Video search queries."""
2286         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2287         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2288         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2289         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2290         _google_ie = None
2291         _max_google_results = 1000
2292         IE_NAME = u'video.google:search'
2293
2294         def __init__(self, google_ie, downloader=None):
2295                 InfoExtractor.__init__(self, downloader)
2296                 self._google_ie = google_ie
2297
2298         def report_download_page(self, query, pagenum):
2299                 """Report attempt to download playlist page with given number."""
2300                 query = query.decode(preferredencoding())
2301                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2302
2303         def _real_initialize(self):
2304                 self._google_ie.initialize()
2305
2306         def _real_extract(self, query):
2307                 mobj = re.match(self._VALID_URL, query)
2308                 if mobj is None:
2309                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2310                         return
2311
2312                 prefix, query = query.split(':')
2313                 prefix = prefix[8:]
2314                 query = query.encode('utf-8')
2315                 if prefix == '':
2316                         self._download_n_results(query, 1)
2317                         return
2318                 elif prefix == 'all':
2319                         self._download_n_results(query, self._max_google_results)
2320                         return
2321                 else:
2322                         try:
2323                                 n = long(prefix)
2324                                 if n <= 0:
2325                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2326                                         return
2327                                 elif n > self._max_google_results:
2328                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2329                                         n = self._max_google_results
2330                                 self._download_n_results(query, n)
2331                                 return
2332                         except ValueError: # parsing prefix as integer fails
2333                                 self._download_n_results(query, 1)
2334                                 return
2335
2336         def _download_n_results(self, query, n):
2337                 """Downloads a specified number of results for a query"""
2338
2339                 video_ids = []
2340                 already_seen = set()
2341                 pagenum = 1
2342
2343                 while True:
2344                         self.report_download_page(query, pagenum)
2345                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2346                         request = urllib2.Request(result_url)
2347                         try:
2348                                 page = urllib2.urlopen(request).read()
2349                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2350                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2351                                 return
2352
2353                         # Extract video identifiers
2354                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2355                                 video_id = mobj.group(1)
2356                                 if video_id not in already_seen:
2357                                         video_ids.append(video_id)
2358                                         already_seen.add(video_id)
2359                                         if len(video_ids) == n:
2360                                                 # Specified n videos reached
2361                                                 for id in video_ids:
2362                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2363                                                 return
2364
2365                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2366                                 for id in video_ids:
2367                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2368                                 return
2369
2370                         pagenum = pagenum + 1
2371
2372
2373 class YahooSearchIE(InfoExtractor):
2374         """Information Extractor for Yahoo! Video search queries."""
2375         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2376         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2377         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2378         _MORE_PAGES_INDICATOR = r'\s*Next'
2379         _yahoo_ie = None
2380         _max_yahoo_results = 1000
2381         IE_NAME = u'video.yahoo:search'
2382
2383         def __init__(self, yahoo_ie, downloader=None):
2384                 InfoExtractor.__init__(self, downloader)
2385                 self._yahoo_ie = yahoo_ie
2386
2387         def report_download_page(self, query, pagenum):
2388                 """Report attempt to download playlist page with given number."""
2389                 query = query.decode(preferredencoding())
2390                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2391
2392         def _real_initialize(self):
2393                 self._yahoo_ie.initialize()
2394
2395         def _real_extract(self, query):
2396                 mobj = re.match(self._VALID_URL, query)
2397                 if mobj is None:
2398                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2399                         return
2400
2401                 prefix, query = query.split(':')
2402                 prefix = prefix[8:]
2403                 query = query.encode('utf-8')
2404                 if prefix == '':
2405                         self._download_n_results(query, 1)
2406                         return
2407                 elif prefix == 'all':
2408                         self._download_n_results(query, self._max_yahoo_results)
2409                         return
2410                 else:
2411                         try:
2412                                 n = long(prefix)
2413                                 if n <= 0:
2414                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2415                                         return
2416                                 elif n > self._max_yahoo_results:
2417                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2418                                         n = self._max_yahoo_results
2419                                 self._download_n_results(query, n)
2420                                 return
2421                         except ValueError: # parsing prefix as integer fails
2422                                 self._download_n_results(query, 1)
2423                                 return
2424
2425         def _download_n_results(self, query, n):
2426                 """Downloads a specified number of results for a query"""
2427
2428                 video_ids = []
2429                 already_seen = set()
2430                 pagenum = 1
2431
2432                 while True:
2433                         self.report_download_page(query, pagenum)
2434                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2435                         request = urllib2.Request(result_url)
2436                         try:
2437                                 page = urllib2.urlopen(request).read()
2438                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2439                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2440                                 return
2441
2442                         # Extract video identifiers
2443                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2444                                 video_id = mobj.group(1)
2445                                 if video_id not in already_seen:
2446                                         video_ids.append(video_id)
2447                                         already_seen.add(video_id)
2448                                         if len(video_ids) == n:
2449                                                 # Specified n videos reached
2450                                                 for id in video_ids:
2451                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2452                                                 return
2453
2454                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2455                                 for id in video_ids:
2456                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2457                                 return
2458
2459                         pagenum = pagenum + 1
2460
2461
2462 class YoutubePlaylistIE(InfoExtractor):
2463         """Information Extractor for YouTube playlists."""
2464
2465         _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2466         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2467         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2468         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2469         _youtube_ie = None
2470         IE_NAME = u'youtube:playlist'
2471
2472         def __init__(self, youtube_ie, downloader=None):
2473                 InfoExtractor.__init__(self, downloader)
2474                 self._youtube_ie = youtube_ie
2475
2476         def report_download_page(self, playlist_id, pagenum):
2477                 """Report attempt to download playlist page with given number."""
2478                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2479
2480         def _real_initialize(self):
2481                 self._youtube_ie.initialize()
2482
2483         def _real_extract(self, url):
2484                 # Extract playlist id
2485                 mobj = re.match(self._VALID_URL, url)
2486                 if mobj is None:
2487                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2488                         return
2489
2490                 # Single video case
2491                 if mobj.group(3) is not None:
2492                         self._youtube_ie.extract(mobj.group(3))
2493                         return
2494
2495                 # Download playlist pages
2496                 # prefix is 'p' as default for playlists but there are other types that need extra care
2497                 playlist_prefix = mobj.group(1)
2498                 if playlist_prefix == 'a':
2499                         playlist_access = 'artist'
2500                 else:
2501                         playlist_prefix = 'p'
2502                         playlist_access = 'view_play_list'
2503                 playlist_id = mobj.group(2)
2504                 video_ids = []
2505                 pagenum = 1
2506
2507                 while True:
2508                         self.report_download_page(playlist_id, pagenum)
2509                         url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2510                         request = urllib2.Request(url)
2511                         try:
2512                                 page = urllib2.urlopen(request).read()
2513                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2514                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2515                                 return
2516
2517                         # Extract video identifiers
2518                         ids_in_page = []
2519                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2520                                 if mobj.group(1) not in ids_in_page:
2521                                         ids_in_page.append(mobj.group(1))
2522                         video_ids.extend(ids_in_page)
2523
2524                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2525                                 break
2526                         pagenum = pagenum + 1
2527
2528                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2529                 playlistend = self._downloader.params.get('playlistend', -1)
2530                 video_ids = video_ids[playliststart:playlistend]
2531
2532                 for id in video_ids:
2533                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2534                 return
2535
2536
2537 class YoutubeUserIE(InfoExtractor):
2538         """Information Extractor for YouTube users."""
2539
2540         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2541         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2542         _GDATA_PAGE_SIZE = 50
2543         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2544         _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2545         _youtube_ie = None
2546         IE_NAME = u'youtube:user'
2547
2548         def __init__(self, youtube_ie, downloader=None):
2549                 InfoExtractor.__init__(self, downloader)
2550                 self._youtube_ie = youtube_ie
2551
2552         def report_download_page(self, username, start_index):
2553                 """Report attempt to download user page."""
2554                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2555                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2556
2557         def _real_initialize(self):
2558                 self._youtube_ie.initialize()
2559
2560         def _real_extract(self, url):
2561                 # Extract username
2562                 mobj = re.match(self._VALID_URL, url)
2563                 if mobj is None:
2564                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2565                         return
2566
2567                 username = mobj.group(1)
2568
2569                 # Download video ids using YouTube Data API. Result size per
2570                 # query is limited (currently to 50 videos) so we need to query
2571                 # page by page until there are no video ids - it means we got
2572                 # all of them.
2573
2574                 video_ids = []
2575                 pagenum = 0
2576
2577                 while True:
2578                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2579                         self.report_download_page(username, start_index)
2580
2581                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2582
2583                         try:
2584                                 page = urllib2.urlopen(request).read()
2585                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2586                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2587                                 return
2588
2589                         # Extract video identifiers
2590                         ids_in_page = []
2591
2592                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2593                                 if mobj.group(1) not in ids_in_page:
2594                                         ids_in_page.append(mobj.group(1))
2595
2596                         video_ids.extend(ids_in_page)
2597
2598                         # A little optimization - if current page is not
2599                         # "full", ie. does not contain PAGE_SIZE video ids then
2600                         # we can assume that this page is the last one - there
2601                         # are no more ids on further pages - no need to query
2602                         # again.
2603
2604                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2605                                 break
2606
2607                         pagenum += 1
2608
2609                 all_ids_count = len(video_ids)
2610                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2611                 playlistend = self._downloader.params.get('playlistend', -1)
2612
2613                 if playlistend == -1:
2614                         video_ids = video_ids[playliststart:]
2615                 else:
2616                         video_ids = video_ids[playliststart:playlistend]
2617
2618                 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2619                                 (username, all_ids_count, len(video_ids)))
2620
2621                 for video_id in video_ids:
2622                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2623
2624
class DepositFilesIE(InfoExtractor):
	"""Information extractor for depositfiles.com"""

	# Optional two-character locale path segment (e.g. /de/) is matched and
	# discarded; group(1) captures the file id.
	_VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
	IE_NAME = u'DepositFiles'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, file_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

	def _real_extract(self, url):
		"""Fetch the file page with the 'Free download' form submitted and
		extract the direct download URL and the file title."""
		# At this point we have a new file
		self._downloader.increment_downloads()

		file_id = url.split('/')[-1]
		# Rebuild url in english locale
		url = 'http://depositfiles.com/en/files/' + file_id

		# Retrieve file webpage with 'Free download' button pressed
		# (POSTing gateway_result=1 emulates clicking the button).
		free_download_indication = { 'gateway_result' : '1' }
		request = urllib2.Request(url, urllib.urlencode(free_download_indication))
		try:
			self.report_download_webpage(file_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
			return

		# Search for the real file URL
		mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
		if (mobj is None) or (mobj.group(1) is None):
			# Try to figure out reason of the error.
			mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
			if (mobj is not None) and (mobj.group(1) is not None):
				# The site served a human-readable restriction notice;
				# collapse its whitespace and surface it to the user.
				restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
				self._downloader.trouble(u'ERROR: %s' % restriction_message)
			else:
				self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
			return

		file_url = mobj.group(1)
		# Extension without the leading dot ('.zip' -> 'zip').
		file_extension = os.path.splitext(file_url)[1][1:]

		# Search for file title
		mobj = re.search(r'<b title="(.*?)">', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		file_title = mobj.group(1).decode('utf-8')

		try:
			# Process file information
			self._downloader.process_info({
				'id':		file_id.decode('utf-8'),
				'url':		file_url.decode('utf-8'),
				'uploader':	u'NA',
				'upload_date':	u'NA',
				'title':	file_title,
				'stitle':	file_title,
				'ext':		file_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError, err:
			self._downloader.trouble(u'ERROR: unable to download file')
2697
2698
2699 class FacebookIE(InfoExtractor):
2700         """Information Extractor for Facebook"""
2701
2702         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2703         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2704         _NETRC_MACHINE = 'facebook'
2705         _available_formats = ['video', 'highqual', 'lowqual']
2706         _video_extensions = {
2707                 'video': 'mp4',
2708                 'highqual': 'mp4',
2709                 'lowqual': 'mp4',
2710         }
2711         IE_NAME = u'facebook'
2712
2713         def __init__(self, downloader=None):
2714                 InfoExtractor.__init__(self, downloader)
2715
2716         def _reporter(self, message):
2717                 """Add header and report message."""
2718                 self._downloader.to_screen(u'[facebook] %s' % message)
2719
2720         def report_login(self):
2721                 """Report attempt to log in."""
2722                 self._reporter(u'Logging in')
2723
2724         def report_video_webpage_download(self, video_id):
2725                 """Report attempt to download video webpage."""
2726                 self._reporter(u'%s: Downloading video webpage' % video_id)
2727
2728         def report_information_extraction(self, video_id):
2729                 """Report attempt to extract video information."""
2730                 self._reporter(u'%s: Extracting video information' % video_id)
2731
2732         def _parse_page(self, video_webpage):
2733                 """Extract video information from page"""
2734                 # General data
2735                 data = {'title': r'\("video_title", "(.*?)"\)',
2736                         'description': r'<div class="datawrap">(.*?)</div>',
2737                         'owner': r'\("video_owner_name", "(.*?)"\)',
2738                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2739                         }
2740                 video_info = {}
2741                 for piece in data.keys():
2742                         mobj = re.search(data[piece], video_webpage)
2743                         if mobj is not None:
2744                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2745
2746                 # Video urls
2747                 video_urls = {}
2748                 for fmt in self._available_formats:
2749                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2750                         if mobj is not None:
2751                                 # URL is in a Javascript segment inside an escaped Unicode format within
2752                                 # the generally utf-8 page
2753                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2754                 video_info['video_urls'] = video_urls
2755
2756                 return video_info
2757
2758         def _real_initialize(self):
2759                 if self._downloader is None:
2760                         return
2761
2762                 useremail = None
2763                 password = None
2764                 downloader_params = self._downloader.params
2765
2766                 # Attempt to use provided username and password or .netrc data
2767                 if downloader_params.get('username', None) is not None:
2768                         useremail = downloader_params['username']
2769                         password = downloader_params['password']
2770                 elif downloader_params.get('usenetrc', False):
2771                         try:
2772                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2773                                 if info is not None:
2774                                         useremail = info[0]
2775                                         password = info[2]
2776                                 else:
2777                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2778                         except (IOError, netrc.NetrcParseError), err:
2779                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2780                                 return
2781
2782                 if useremail is None:
2783                         return
2784
2785                 # Log in
2786                 login_form = {
2787                         'email': useremail,
2788                         'pass': password,
2789                         'login': 'Log+In'
2790                         }
2791                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2792                 try:
2793                         self.report_login()
2794                         login_results = urllib2.urlopen(request).read()
2795                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2796                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2797                                 return
2798                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2799                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2800                         return
2801
2802         def _real_extract(self, url):
2803                 mobj = re.match(self._VALID_URL, url)
2804                 if mobj is None:
2805                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2806                         return
2807                 video_id = mobj.group('ID')
2808
2809                 # Get video webpage
2810                 self.report_video_webpage_download(video_id)
2811                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2812                 try:
2813                         page = urllib2.urlopen(request)
2814                         video_webpage = page.read()
2815                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2816                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2817                         return
2818
2819                 # Start extracting information
2820                 self.report_information_extraction(video_id)
2821
2822                 # Extract information
2823                 video_info = self._parse_page(video_webpage)
2824
2825                 # uploader
2826                 if 'owner' not in video_info:
2827                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2828                         return
2829                 video_uploader = video_info['owner']
2830
2831                 # title
2832                 if 'title' not in video_info:
2833                         self._downloader.trouble(u'ERROR: unable to extract video title')
2834                         return
2835                 video_title = video_info['title']
2836                 video_title = video_title.decode('utf-8')
2837                 video_title = sanitize_title(video_title)
2838
2839                 simple_title = _simplify_title(video_title)
2840
2841                 # thumbnail image
2842                 if 'thumbnail' not in video_info:
2843                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2844                         video_thumbnail = ''
2845                 else:
2846                         video_thumbnail = video_info['thumbnail']
2847
2848                 # upload date
2849                 upload_date = u'NA'
2850                 if 'upload_date' in video_info:
2851                         upload_time = video_info['upload_date']
2852                         timetuple = email.utils.parsedate_tz(upload_time)
2853                         if timetuple is not None:
2854                                 try:
2855                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2856                                 except:
2857                                         pass
2858
2859                 # description
2860                 video_description = video_info.get('description', 'No description available.')
2861
2862                 url_map = video_info['video_urls']
2863                 if len(url_map.keys()) > 0:
2864                         # Decide which formats to download
2865                         req_format = self._downloader.params.get('format', None)
2866                         format_limit = self._downloader.params.get('format_limit', None)
2867
2868                         if format_limit is not None and format_limit in self._available_formats:
2869                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2870                         else:
2871                                 format_list = self._available_formats
2872                         existing_formats = [x for x in format_list if x in url_map]
2873                         if len(existing_formats) == 0:
2874                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2875                                 return
2876                         if req_format is None:
2877                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2878                         elif req_format == 'worst':
2879                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2880                         elif req_format == '-1':
2881                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2882                         else:
2883                                 # Specific format
2884                                 if req_format not in url_map:
2885                                         self._downloader.trouble(u'ERROR: requested format not available')
2886                                         return
2887                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2888
2889                 for format_param, video_real_url in video_url_list:
2890
2891                         # At this point we have a new video
2892                         self._downloader.increment_downloads()
2893
2894                         # Extension
2895                         video_extension = self._video_extensions.get(format_param, 'mp4')
2896
2897                         try:
2898                                 # Process video information
2899                                 self._downloader.process_info({
2900                                         'id':           video_id.decode('utf-8'),
2901                                         'url':          video_real_url.decode('utf-8'),
2902                                         'uploader':     video_uploader.decode('utf-8'),
2903                                         'upload_date':  upload_date,
2904                                         'title':        video_title,
2905                                         'stitle':       simple_title,
2906                                         'ext':          video_extension.decode('utf-8'),
2907                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2908                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2909                                         'description':  video_description.decode('utf-8'),
2910                                         'player_url':   None,
2911                                 })
2912                         except UnavailableVideoError, err:
2913                                 self._downloader.trouble(u'\nERROR: unable to download video')
2914
2915 class BlipTVIE(InfoExtractor):
2916         """Information extractor for blip.tv"""
2917
2918         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2919         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2920         IE_NAME = u'blip.tv'
2921
2922         def report_extraction(self, file_id):
2923                 """Report information extraction."""
2924                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2925
2926         def report_direct_download(self, title):
2927                 """Report information extraction."""
2928                 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2929
2930         def _real_extract(self, url):
2931                 mobj = re.match(self._VALID_URL, url)
2932                 if mobj is None:
2933                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2934                         return
2935
2936                 if '?' in url:
2937                         cchar = '&'
2938                 else:
2939                         cchar = '?'
2940                 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2941                 request = urllib2.Request(json_url)
2942                 self.report_extraction(mobj.group(1))
2943                 info = None
2944                 try:
2945                         urlh = urllib2.urlopen(request)
2946                         if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2947                                 basename = url.split('/')[-1]
2948                                 title,ext = os.path.splitext(basename)
2949                                 title = title.decode('UTF-8')
2950                                 ext = ext.replace('.', '')
2951                                 self.report_direct_download(title)
2952                                 info = {
2953                                         'id': title,
2954                                         'url': url,
2955                                         'title': title,
2956                                         'stitle': _simplify_title(title),
2957                                         'ext': ext,
2958                                         'urlhandle': urlh
2959                                 }
2960                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2961                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2962                         return
2963                 if info is None: # Regular URL
2964                         try:
2965                                 json_code = urlh.read()
2966                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2967                                 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
2968                                 return
2969
2970                         try:
2971                                 json_data = json.loads(json_code)
2972                                 if 'Post' in json_data:
2973                                         data = json_data['Post']
2974                                 else:
2975                                         data = json_data
2976         
2977                                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2978                                 video_url = data['media']['url']
2979                                 umobj = re.match(self._URL_EXT, video_url)
2980                                 if umobj is None:
2981                                         raise ValueError('Can not determine filename extension')
2982                                 ext = umobj.group(1)
2983         
2984                                 info = {
2985                                         'id': data['item_id'],
2986                                         'url': video_url,
2987                                         'uploader': data['display_name'],
2988                                         'upload_date': upload_date,
2989                                         'title': data['title'],
2990                                         'stitle': _simplify_title(data['title']),
2991                                         'ext': ext,
2992                                         'format': data['media']['mimeType'],
2993                                         'thumbnail': data['thumbnailUrl'],
2994                                         'description': data['description'],
2995                                         'player_url': data['embedUrl']
2996                                 }
2997                         except (ValueError,KeyError), err:
2998                                 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2999                                 return
3000
3001                 self._downloader.increment_downloads()
3002
3003                 try:
3004                         self._downloader.process_info(info)
3005                 except UnavailableVideoError, err:
3006                         self._downloader.trouble(u'\nERROR: unable to download video')
3007
3008
3009 class MyVideoIE(InfoExtractor):
3010         """Information Extractor for myvideo.de."""
3011
3012         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3013         IE_NAME = u'myvideo'
3014
3015         def __init__(self, downloader=None):
3016                 InfoExtractor.__init__(self, downloader)
3017         
3018         def report_download_webpage(self, video_id):
3019                 """Report webpage download."""
3020                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3021
3022         def report_extraction(self, video_id):
3023                 """Report information extraction."""
3024                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3025
3026         def _real_extract(self,url):
3027                 mobj = re.match(self._VALID_URL, url)
3028                 if mobj is None:
3029                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
3030                         return
3031
3032                 video_id = mobj.group(1)
3033
3034                 # Get video webpage
3035                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3036                 try:
3037                         self.report_download_webpage(video_id)
3038                         webpage = urllib2.urlopen(request).read()
3039                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3040                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3041                         return
3042
3043                 self.report_extraction(video_id)
3044                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3045                                  webpage)
3046                 if mobj is None:
3047                         self._downloader.trouble(u'ERROR: unable to extract media URL')
3048                         return
3049                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3050
3051                 mobj = re.search('<title>([^<]+)</title>', webpage)
3052                 if mobj is None:
3053                         self._downloader.trouble(u'ERROR: unable to extract title')
3054                         return
3055
3056                 video_title = mobj.group(1)
3057                 video_title = sanitize_title(video_title)
3058
3059                 simple_title = _simplify_title(video_title)
3060
3061                 try:
3062                         self._downloader.process_info({
3063                                 'id':           video_id,
3064                                 'url':          video_url,
3065                                 'uploader':     u'NA',
3066                                 'upload_date':  u'NA',
3067                                 'title':        video_title,
3068                                 'stitle':       simple_title,
3069                                 'ext':          u'flv',
3070                                 'format':       u'NA',
3071                                 'player_url':   None,
3072                         })
3073                 except UnavailableVideoError:
3074                         self._downloader.trouble(u'\nERROR: Unable to download video')
3075
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Two alternatives: a shortname form (":tds", ":colbert", ...) that is
	# rewritten to the show's full-episodes page, or a direct episode URL.
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	def report_extraction(self, episode_id):
		"""Report that extraction of an episode has started."""
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

	def report_config_download(self, episode_id):
		"""Report download of the per-media player configuration XML."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		"""Report download of the MRSS episode index."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		"""Report resolution of the Flash player URL (followed redirects)."""
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _real_extract(self, url):
		"""Resolve a show/episode URL to its media items and download each.

		Flow: normalize shortname URLs -> fetch page (following redirects
		when no episode was specified) -> find the Flash movie param ->
		fetch the MRSS index -> for each <item>, fetch its config XML and
		download the highest-bitrate rendition.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		if mobj.group('shortname'):
			# Shortname form: rewrite to the canonical full-episodes URL
			# and re-match so the showname/episode groups are populated.
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = u'http://www.thedailyshow.com/full-episodes/'
			else:
				url = u'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# No explicit episode means "newest": the server redirect (below)
		# tells us which episode that actually is.
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			return
		if dlNewest:
			# geturl() reflects any redirect; the final URL must name a
			# concrete episode or we cannot proceed.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		# Each match is (full player URL, mtvnservices URI); the URI keys
		# both the MRSS index and the per-media config requests below.
		mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
			return

		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			# Follow redirects to obtain the effective player URL, which is
			# later passed through as 'player_url' for rtmpdump.
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
			return

		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
			return

		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			# guid looks like "...:<show>.com:...:<shortMediaId>".
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
						urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
				return

			cdoc = xml.etree.ElementTree.fromstring(configXml)
			# Collect (bitrate, src) pairs; renditions appear to be listed
			# in ascending bitrate order (the last one is picked below).
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				# Skip this item but keep processing the rest of the index.
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			# For now, just pick the highest bitrate
			format,video_url = turls[-1]

			self._downloader.increment_downloads()

			effTitle = showId + u'-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'stitle': _simplify_title(effTitle),
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': playerUrl
			}

			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
				continue
3210
3211
3212 class EscapistIE(InfoExtractor):
3213         """Information extractor for The Escapist """
3214
3215         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3216         IE_NAME = u'escapist'
3217
3218         def report_extraction(self, showName):
3219                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3220
3221         def report_config_download(self, showName):
3222                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3223
3224         def _real_extract(self, url):
3225                 htmlParser = HTMLParser.HTMLParser()
3226
3227                 mobj = re.match(self._VALID_URL, url)
3228                 if mobj is None:
3229                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3230                         return
3231                 showName = mobj.group('showname')
3232                 videoId = mobj.group('episode')
3233
3234                 self.report_extraction(showName)
3235                 try:
3236                         webPage = urllib2.urlopen(url).read()
3237                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3238                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3239                         return
3240
3241                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3242                 description = htmlParser.unescape(descMatch.group(1))
3243                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3244                 imgUrl = htmlParser.unescape(imgMatch.group(1))
3245                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3246                 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3247                 configUrlMatch = re.search('config=(.*)$', playerUrl)
3248                 configUrl = urllib2.unquote(configUrlMatch.group(1))
3249
3250                 self.report_config_download(showName)
3251                 try:
3252                         configJSON = urllib2.urlopen(configUrl).read()
3253                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3254                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3255                         return
3256
3257                 # Technically, it's JavaScript, not JSON
3258                 configJSON = configJSON.replace("'", '"')
3259
3260                 try:
3261                         config = json.loads(configJSON)
3262                 except (ValueError,), err:
3263                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3264                         return
3265
3266                 playlist = config['playlist']
3267                 videoUrl = playlist[1]['url']
3268
3269                 self._downloader.increment_downloads()
3270                 info = {
3271                         'id': videoId,
3272                         'url': videoUrl,
3273                         'uploader': showName,
3274                         'upload_date': None,
3275                         'title': showName,
3276                         'stitle': _simplify_title(showName),
3277                         'ext': 'flv',
3278                         'format': 'flv',
3279                         'thumbnail': imgUrl,
3280                         'description': description,
3281                         'player_url': playerUrl,
3282                 }
3283
3284                 try:
3285                         self._downloader.process_info(info)
3286                 except UnavailableVideoError, err:
3287                         self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3288
3289
3290 class CollegeHumorIE(InfoExtractor):
3291         """Information extractor for collegehumor.com"""
3292
3293         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3294         IE_NAME = u'collegehumor'
3295
3296         def report_webpage(self, video_id):
3297                 """Report information extraction."""
3298                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3299
3300         def report_extraction(self, video_id):
3301                 """Report information extraction."""
3302                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3303
3304         def _real_extract(self, url):
3305                 htmlParser = HTMLParser.HTMLParser()
3306
3307                 mobj = re.match(self._VALID_URL, url)
3308                 if mobj is None:
3309                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3310                         return
3311                 video_id = mobj.group('videoid')
3312
3313                 self.report_webpage(video_id)
3314                 request = urllib2.Request(url)
3315                 try:
3316                         webpage = urllib2.urlopen(request).read()
3317                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3318                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3319                         return
3320
3321                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3322                 if m is None:
3323                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3324                         return
3325                 internal_video_id = m.group('internalvideoid')
3326
3327                 info = {
3328                         'id': video_id,
3329                         'internal_id': internal_video_id,
3330                 }
3331
3332                 self.report_extraction(video_id)
3333                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3334                 try:
3335                         metaXml = urllib2.urlopen(xmlUrl).read()
3336                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3337                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3338                         return
3339
3340                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3341                 try:
3342                         videoNode = mdoc.findall('./video')[0]
3343                         info['description'] = videoNode.findall('./description')[0].text
3344                         info['title'] = videoNode.findall('./caption')[0].text
3345                         info['stitle'] = _simplify_title(info['title'])
3346                         info['url'] = videoNode.findall('./file')[0].text
3347                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3348                         info['ext'] = info['url'].rpartition('.')[2]
3349                         info['format'] = info['ext']
3350                 except IndexError:
3351                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3352                         return
3353
3354                 self._downloader.increment_downloads()
3355
3356                 try:
3357                         self._downloader.process_info(info)
3358                 except UnavailableVideoError, err:
3359                         self._downloader.trouble(u'\nERROR: unable to download video')
3360
3361
3362 class XVideosIE(InfoExtractor):
3363         """Information extractor for xvideos.com"""
3364
3365         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3366         IE_NAME = u'xvideos'
3367
3368         def report_webpage(self, video_id):
3369                 """Report information extraction."""
3370                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3371
3372         def report_extraction(self, video_id):
3373                 """Report information extraction."""
3374                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3375
3376         def _real_extract(self, url):
3377                 htmlParser = HTMLParser.HTMLParser()
3378
3379                 mobj = re.match(self._VALID_URL, url)
3380                 if mobj is None:
3381                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3382                         return
3383                 video_id = mobj.group(1).decode('utf-8')
3384
3385                 self.report_webpage(video_id)
3386
3387                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3388                 try:
3389                         webpage = urllib2.urlopen(request).read()
3390                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3391                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3392                         return
3393
3394                 self.report_extraction(video_id)
3395
3396
3397                 # Extract video URL
3398                 mobj = re.search(r'flv_url=(.+?)&', webpage)
3399                 if mobj is None:
3400                         self._downloader.trouble(u'ERROR: unable to extract video url')
3401                         return
3402                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3403
3404
3405                 # Extract title
3406                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3407                 if mobj is None:
3408                         self._downloader.trouble(u'ERROR: unable to extract video title')
3409                         return
3410                 video_title = mobj.group(1).decode('utf-8')
3411
3412
3413                 # Extract video thumbnail
3414                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3415                 if mobj is None:
3416                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3417                         return
3418                 video_thumbnail = mobj.group(1).decode('utf-8')
3419
3420
3421
3422                 self._downloader.increment_downloads()
3423                 info = {
3424                         'id': video_id,
3425                         'url': video_url,
3426                         'uploader': None,
3427                         'upload_date': None,
3428                         'title': video_title,
3429                         'stitle': _simplify_title(video_title),
3430                         'ext': 'flv',
3431                         'format': 'flv',
3432                         'thumbnail': video_thumbnail,
3433                         'description': None,
3434                         'player_url': None,
3435                 }
3436
3437                 try:
3438                         self._downloader.process_info(info)
3439                 except UnavailableVideoError, err:
3440                         self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3441
3442
3443 class SoundcloudIE(InfoExtractor):
3444         """Information extractor for soundcloud.com
3445            To access the media, the uid of the song and a stream token
3446            must be extracted from the page source and the script must make
3447            a request to media.soundcloud.com/crossdomain.xml. Then
3448            the media can be grabbed by requesting from an url composed
3449            of the stream token and uid
3450          """
3451
3452         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3453         IE_NAME = u'soundcloud'
3454
3455         def __init__(self, downloader=None):
3456                 InfoExtractor.__init__(self, downloader)
3457
3458         def report_webpage(self, video_id):
3459                 """Report information extraction."""
3460                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3461
3462         def report_extraction(self, video_id):
3463                 """Report information extraction."""
3464                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3465
3466         def _real_extract(self, url):
3467                 htmlParser = HTMLParser.HTMLParser()
3468
3469                 mobj = re.match(self._VALID_URL, url)
3470                 if mobj is None:
3471                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3472                         return
3473
3474                 # extract uploader (which is in the url)
3475                 uploader = mobj.group(1).decode('utf-8')
3476                 # extract simple title (uploader + slug of song title)
3477                 slug_title =  mobj.group(2).decode('utf-8')
3478                 simple_title = uploader + '-' + slug_title
3479
3480                 self.report_webpage('%s/%s' % (uploader, slug_title))
3481
3482                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3483                 try:
3484                         webpage = urllib2.urlopen(request).read()
3485                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3486                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3487                         return
3488
3489                 self.report_extraction('%s/%s' % (uploader, slug_title))
3490
3491                 # extract uid and stream token that soundcloud hands out for access
3492                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3493                 if mobj:
3494                         video_id = mobj.group(1)
3495                         stream_token = mobj.group(2)
3496
3497                 # extract unsimplified title
3498                 mobj = re.search('"title":"(.*?)",', webpage)
3499                 if mobj:
3500                         title = mobj.group(1)
3501
3502                 # construct media url (with uid/token)
3503                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3504                 mediaURL = mediaURL % (video_id, stream_token)
3505
3506                 # description
3507                 description = u'No description available'
3508                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3509                 if mobj:
3510                         description = mobj.group(1)
3511                 
3512                 # upload date
3513                 upload_date = None
3514                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3515                 if mobj:
3516                         try:
3517                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3518                         except Exception, e:
3519                                 print str(e)
3520
3521                 # for soundcloud, a request to a cross domain is required for cookies
3522                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3523
3524                 try:
3525                         self._downloader.process_info({
3526                                 'id':           video_id.decode('utf-8'),
3527                                 'url':          mediaURL,
3528                                 'uploader':     uploader.decode('utf-8'),
3529                                 'upload_date':  upload_date,
3530                                 'title':        simple_title.decode('utf-8'),
3531                                 'stitle':       simple_title.decode('utf-8'),
3532                                 'ext':          u'mp3',
3533                                 'format':       u'NA',
3534                                 'player_url':   None,
3535                                 'description': description.decode('utf-8')
3536                         })
3537                 except UnavailableVideoError:
3538                         self._downloader.trouble(u'\nERROR: unable to download video')
3539
3540
3541 class InfoQIE(InfoExtractor):
3542         """Information extractor for infoq.com"""
3543
3544         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3545         IE_NAME = u'infoq'
3546
3547         def report_webpage(self, video_id):
3548                 """Report information extraction."""
3549                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3550
3551         def report_extraction(self, video_id):
3552                 """Report information extraction."""
3553                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3554
3555         def _real_extract(self, url):
3556                 htmlParser = HTMLParser.HTMLParser()
3557
3558                 mobj = re.match(self._VALID_URL, url)
3559                 if mobj is None:
3560                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3561                         return
3562
3563                 self.report_webpage(url)
3564
3565                 request = urllib2.Request(url)
3566                 try:
3567                         webpage = urllib2.urlopen(request).read()
3568                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3569                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3570                         return
3571
3572                 self.report_extraction(url)
3573
3574
3575                 # Extract video URL
3576                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3577                 if mobj is None:
3578                         self._downloader.trouble(u'ERROR: unable to extract video url')
3579                         return
3580                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3581
3582
3583                 # Extract title
3584                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3585                 if mobj is None:
3586                         self._downloader.trouble(u'ERROR: unable to extract video title')
3587                         return
3588                 video_title = mobj.group(1).decode('utf-8')
3589
3590                 # Extract description
3591                 video_description = u'No description available.'
3592                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3593                 if mobj is not None:
3594                         video_description = mobj.group(1).decode('utf-8')
3595
3596                 video_filename = video_url.split('/')[-1]
3597                 video_id, extension = video_filename.split('.')
3598
3599                 self._downloader.increment_downloads()
3600                 info = {
3601                         'id': video_id,
3602                         'url': video_url,
3603                         'uploader': None,
3604                         'upload_date': None,
3605                         'title': video_title,
3606                         'stitle': _simplify_title(video_title),
3607                         'ext': extension,
3608                         'format': extension, # Extension is always(?) mp4, but seems to be flv
3609                         'thumbnail': None,
3610                         'description': video_description,
3611                         'player_url': None,
3612                 }
3613
3614                 try:
3615                         self._downloader.process_info(info)
3616                 except UnavailableVideoError, err:
3617                         self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3618
3619 class MixcloudIE(InfoExtractor):
3620         """Information extractor for www.mixcloud.com"""
3621         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3622         IE_NAME = u'mixcloud'
3623
3624         def __init__(self, downloader=None):
3625                 InfoExtractor.__init__(self, downloader)
3626
3627         def report_download_json(self, file_id):
3628                 """Report JSON download."""
3629                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3630
3631         def report_extraction(self, file_id):
3632                 """Report information extraction."""
3633                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3634
3635         def get_urls(self, jsonData, fmt, bitrate='best'):
3636                 """Get urls from 'audio_formats' section in json"""
3637                 file_url = None
3638                 try:
3639                         bitrate_list = jsonData[fmt]
3640                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3641                                 bitrate = max(bitrate_list) # select highest
3642
3643                         url_list = jsonData[fmt][bitrate]
3644                 except TypeError: # we have no bitrate info.
3645                         url_list = jsonData[fmt]
3646                                 
3647                 return url_list
3648
3649         def check_urls(self, url_list):
3650                 """Returns 1st active url from list"""
3651                 for url in url_list:
3652                         try:
3653                                 urllib2.urlopen(url)
3654                                 return url
3655                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3656                                 url = None
3657
3658                 return None
3659
3660         def _print_formats(self, formats):
3661                 print 'Available formats:'
3662                 for fmt in formats.keys():
3663                         for b in formats[fmt]:
3664                                 try:
3665                                         ext = formats[fmt][b][0]
3666                                         print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3667                                 except TypeError: # we have no bitrate info
3668                                         ext = formats[fmt][0]
3669                                         print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3670                                         break
3671
3672         def _real_extract(self, url):
3673                 mobj = re.match(self._VALID_URL, url)
3674                 if mobj is None:
3675                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3676                         return
3677                 # extract uploader & filename from url
3678                 uploader = mobj.group(1).decode('utf-8')
3679                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3680
3681                 # construct API request
3682                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3683                 # retrieve .json file with links to files
3684                 request = urllib2.Request(file_url)
3685                 try:
3686                         self.report_download_json(file_url)
3687                         jsonData = urllib2.urlopen(request).read()
3688                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3689                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3690                         return
3691
3692                 # parse JSON
3693                 json_data = json.loads(jsonData)
3694                 player_url = json_data['player_swf_url']
3695                 formats = dict(json_data['audio_formats'])
3696
3697                 req_format = self._downloader.params.get('format', None)
3698                 bitrate = None
3699
3700                 if self._downloader.params.get('listformats', None):
3701                         self._print_formats(formats)
3702                         return
3703
3704                 if req_format is None or req_format == 'best':
3705                         for format_param in formats.keys():
3706                                 url_list = self.get_urls(formats, format_param)
3707                                 # check urls
3708                                 file_url = self.check_urls(url_list)
3709                                 if file_url is not None:
3710                                         break # got it!
3711                 else:
3712                         if req_format not in formats.keys():
3713                                 self._downloader.trouble(u'ERROR: format is not available')
3714                                 return
3715
3716                         url_list = self.get_urls(formats, req_format)
3717                         file_url = self.check_urls(url_list)
3718                         format_param = req_format
3719
3720                 # We have audio
3721                 self._downloader.increment_downloads()
3722                 try:
3723                         # Process file information
3724                         self._downloader.process_info({
3725                                 'id':           file_id.decode('utf-8'),
3726                                 'url':          file_url.decode('utf-8'),
3727                                 'uploader':     uploader.decode('utf-8'),
3728                                 'upload_date':  u'NA',
3729                                 'title':        json_data['name'],
3730                                 'stitle':       _simplify_title(json_data['name']),
3731                                 'ext':          file_url.split('.')[-1].decode('utf-8'),
3732                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
3733                                 'thumbnail':    json_data['thumbnail_url'],
3734                                 'description':  json_data['description'],
3735                                 'player_url':   player_url.decode('utf-8'),
3736                         })
3737                 except UnavailableVideoError, err:
3738                         self._downloader.trouble(u'ERROR: unable to download file')
3739
3740
3741
class PostProcessor(object):
	"""Base class for all post processors.

	A PostProcessor is attached to a downloader via its
	add_post_processor() method. After every successful download the
	downloader walks its chain of PostProcessors, invoking run() on each
	one: the first call receives the download's info dictionary, and each
	subsequent call receives whatever the previous run() returned.

	A return value of None stops the chain; reaching the last processor
	ends it normally.

	Like InfoExtractor objects, PostProcessors participate in a "mutual
	registration" scheme with their downloader.
	"""

	# Downloader this PP is registered with (None until set).
	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor on one finished download.

		The "information" argument is an InfoExtractor-style dictionary
		with one extra key, "filepath", naming the downloaded file.

		Returning None halts the postprocessing chain; returning a
		dictionary (possibly the received one with some fields changed)
		passes it on to the next processor in the chain.

		Implementations may also raise PostProcessingError, which the
		calling downloader takes into account.
		"""
		# Default implementation: pass the information through untouched.
		return information
3787
3788
class FFmpegExtractAudioPP(PostProcessor):
	"""Post processor that extracts the audio track of a downloaded video
	by shelling out to the external ffprobe/ffmpeg binaries."""

	def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		# Target codec: 'best', 'mp3', 'aac' or 'vorbis' (see run()).
		self._preferredcodec = preferredcodec
		# ffmpeg '-ab' bitrate specification (e.g. '128K'), or None.
		self._preferredquality = preferredquality
		# If True, the original video file is kept after extraction.
		self._keepvideo = keepvideo

	@staticmethod
	def get_audio_codec(path):
		"""Return the codec name of the audio stream of the file at path,
		as reported by ffprobe, or None if it cannot be determined."""
		try:
			cmd = ['ffprobe', '-show_streams', '--', path]
			handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
			output = handle.communicate()[0]
			if handle.wait() != 0:
				return None
		except (IOError, OSError):
			# ffprobe missing or not executable
			return None
		audio_codec = None
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				# assumes codec_name appears before codec_type within each
				# stream section of ffprobe's output -- TODO confirm for
				# the ffprobe versions in use
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Run ffmpeg to extract the audio of path into out_path with the
		given audio codec and extra options. Return True on success."""
		try:
			cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
			ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
			return (ret == 0)
		except (IOError, OSError):
			# ffmpeg missing or not executable
			return False

	def run(self, information):
		"""Extract the audio of information['filepath'], optionally delete
		the video, and point 'filepath' at the new audio file.
		Returns None (stopping the PP chain) on any ffprobe/ffmpeg failure."""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
			if filecodec in ['aac', 'mp3', 'vorbis']:
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					# raw AAC is wrapped in an ADTS container
					more_opts = ['-f', 'adts']
				if filecodec == 'vorbis':
					extension = 'ogg'
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = []
				if self._preferredquality is not None:
					more_opts += ['-ab', self._preferredquality]
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'vorbis': 'libvorbis'}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = []
			if self._preferredquality is not None:
				more_opts += ['-ab', self._preferredquality]
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']
			if self._preferredcodec == 'vorbis':
				extension = 'ogg'

		# Output path: same name as the source, with the audio extension.
		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		status = self.run_ffmpeg(path, new_path, acodec, more_opts)

		if not status:
			self._downloader.to_stderr(u'WARNING: error running ffmpeg')
			return None

		# Try to update the date time for extracted audio file.
		if information.get('filetime') is not None:
			try:
				os.utime(new_path, (time.time(), information['filetime']))
			except:
				self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

		if not self._keepvideo:
			try:
				os.remove(path)
			except (IOError, OSError):
				self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
				return None

		information['filepath'] = new_path
		return information
3888
3889
3890 def updateSelf(downloader, filename):
3891         ''' Update the program file with the latest version from the repository '''
3892         # Note: downloader only used for options
3893         if not os.access(filename, os.W_OK):
3894                 sys.exit('ERROR: no write permissions on %s' % filename)
3895
3896         downloader.to_screen('Updating to latest version...')
3897
3898         try:
3899                 try:
3900                         urlh = urllib.urlopen(UPDATE_URL)
3901                         newcontent = urlh.read()
3902                         
3903                         vmatch = re.search("__version__ = '([^']+)'", newcontent)
3904                         if vmatch is not None and vmatch.group(1) == __version__:
3905                                 downloader.to_screen('youtube-dl is up-to-date (' + __version__ + ')')
3906                                 return
3907                 finally:
3908                         urlh.close()
3909         except (IOError, OSError), err:
3910                 sys.exit('ERROR: unable to download latest version')
3911
3912         try:
3913                 outf = open(filename, 'wb')
3914                 try:
3915                         outf.write(newcontent)
3916                 finally:
3917                         outf.close()
3918         except (IOError, OSError), err:
3919                 sys.exit('ERROR: unable to overwrite current version')
3920
3921         downloader.to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.')
3922
def parseOpts():
	"""Build the optparse command-line parser and parse sys.argv.

	Returns a (parser, opts, args) tuple, where args are the positional
	URL arguments left after option parsing.
	"""
	# Deferred imports
	import getpass
	import optparse

	def _format_option_string(option):
		''' ('-o', '--option') -> -o, --format METAVAR'''

		opts = []

		if option._short_opts: opts.append(option._short_opts[0])
		if option._long_opts: opts.append(option._long_opts[0])
		if len(opts) > 1: opts.insert(1, ', ')

		if option.takes_value(): opts.append(' %s' % option.metavar)

		return "".join(opts)

	def _find_term_columns():
		"""Return the terminal width from $COLUMNS or 'stty size',
		or None if it cannot be determined."""
		columns = os.environ.get('COLUMNS', None)
		if columns:
			return int(columns)

		try:
			sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			out,err = sp.communicate()
			return int(out.split()[1])
		except:
			pass
		return None

	max_width = 80
	max_help_position = 80

	# No need to wrap help messages if we're on a wide console
	columns = _find_term_columns()
	if columns: max_width = columns

	fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
	fmt.format_option_strings = _format_option_string

	kw = {
		'version'   : __version__,
		'formatter' : fmt,
		'usage' : '%prog [options] url [url...]',
		'conflict_handler' : 'resolve',
	}

	parser = optparse.OptionParser(**kw)

	# option groups
	general        = optparse.OptionGroup(parser, 'General Options')
	selection      = optparse.OptionGroup(parser, 'Video Selection')
	authentication = optparse.OptionGroup(parser, 'Authentication Options')
	video_format   = optparse.OptionGroup(parser, 'Video Format Options')
	postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
	filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
	verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

	general.add_option('-h', '--help',
			action='help', help='print this help text and exit')
	general.add_option('-v', '--version',
			action='version', help='print program version and exit')
	general.add_option('-U', '--update',
			action='store_true', dest='update_self', help='update this program to latest version')
	general.add_option('-i', '--ignore-errors',
			action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
	general.add_option('-r', '--rate-limit',
			dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
	general.add_option('-R', '--retries',
			dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
	general.add_option('--dump-user-agent',
			action='store_true', dest='dump_user_agent',
			help='display the current browser identification', default=False)
	general.add_option('--list-extractors',
			action='store_true', dest='list_extractors',
			help='List all supported extractors and the URLs they would handle', default=False)

	selection.add_option('--playlist-start',
			dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
	selection.add_option('--playlist-end',
			dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
	selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
	selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
	selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)

	authentication.add_option('-u', '--username',
			dest='username', metavar='USERNAME', help='account username')
	authentication.add_option('-p', '--password',
			dest='password', metavar='PASSWORD', help='account password')
	authentication.add_option('-n', '--netrc',
			action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)


	video_format.add_option('-f', '--format',
			action='store', dest='format', metavar='FORMAT', help='video format code')
	video_format.add_option('--all-formats',
			action='store_const', dest='format', help='download all available video formats', const='all')
	video_format.add_option('--max-quality',
			action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
	video_format.add_option('-F', '--list-formats',
			action='store_true', dest='listformats', help='list all available formats (currently youtube only)')


	verbosity.add_option('-q', '--quiet',
			action='store_true', dest='quiet', help='activates quiet mode', default=False)
	verbosity.add_option('-s', '--simulate',
			action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
	verbosity.add_option('--skip-download',
			action='store_true', dest='skip_download', help='do not download the video', default=False)
	verbosity.add_option('-g', '--get-url',
			action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
	verbosity.add_option('-e', '--get-title',
			action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
	verbosity.add_option('--get-thumbnail',
			action='store_true', dest='getthumbnail',
			help='simulate, quiet but print thumbnail URL', default=False)
	verbosity.add_option('--get-description',
			action='store_true', dest='getdescription',
			help='simulate, quiet but print video description', default=False)
	verbosity.add_option('--get-filename',
			action='store_true', dest='getfilename',
			help='simulate, quiet but print output filename', default=False)
	verbosity.add_option('--get-format',
			action='store_true', dest='getformat',
			help='simulate, quiet but print output format', default=False)
	verbosity.add_option('--no-progress',
			action='store_true', dest='noprogress', help='do not print progress bar', default=False)
	verbosity.add_option('--console-title',
			action='store_true', dest='consoletitle',
			help='display progress in console titlebar', default=False)


	filesystem.add_option('-t', '--title',
			action='store_true', dest='usetitle', help='use title in file name', default=False)
	filesystem.add_option('-l', '--literal',
			action='store_true', dest='useliteral', help='use literal title in file name', default=False)
	filesystem.add_option('-A', '--auto-number',
			action='store_true', dest='autonumber',
			help='number downloaded files starting from 00000', default=False)
	filesystem.add_option('-o', '--output',
			dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, and %% for a literal percent')
	filesystem.add_option('-a', '--batch-file',
			dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
	filesystem.add_option('-w', '--no-overwrites',
			action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
	filesystem.add_option('-c', '--continue',
			action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
	filesystem.add_option('--no-continue',
			action='store_false', dest='continue_dl',
			help='do not resume partially downloaded files (restart from beginning)')
	filesystem.add_option('--cookies',
			dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
	filesystem.add_option('--no-part',
			action='store_true', dest='nopart', help='do not use .part files', default=False)
	filesystem.add_option('--no-mtime',
			action='store_false', dest='updatetime',
			help='do not use the Last-modified header to set the file modification time', default=True)
	filesystem.add_option('--write-description',
			action='store_true', dest='writedescription',
			help='write video description to a .description file', default=False)
	filesystem.add_option('--write-info-json',
			action='store_true', dest='writeinfojson',
			help='write video metadata to a .info.json file', default=False)


	postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
			help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
	postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
			help='"best", "aac", "vorbis" or "mp3"; best by default')
	postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
			help='ffmpeg audio bitrate specification, 128k by default')
	postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
			help='keeps the video file on disk after the post-processing; the video is erased by default')

	# NOTE(review): getpass is imported above but not used in this
	# function -- confirm before removing.

	parser.add_option_group(general)
	parser.add_option_group(selection)
	parser.add_option_group(filesystem)
	parser.add_option_group(verbosity)
	parser.add_option_group(video_format)
	parser.add_option_group(authentication)
	parser.add_option_group(postproc)

	opts, args = parser.parse_args()

	return parser, opts, args
4110
def gen_extractors():
	""" Return a list of an instance of every supported extractor.
	The order does matter; the first extractor matched is the one handling the URL.
	"""
	# A few extractors are shared with their search/playlist/user variants,
	# so instantiate those once up front.
	youtube = YoutubeIE()
	google = GoogleIE()
	yahoo = YahooIE()

	extractors = [
		YoutubePlaylistIE(youtube),
		YoutubeUserIE(youtube),
		YoutubeSearchIE(youtube),
		youtube,
		MetacafeIE(youtube),
		DailymotionIE(),
		google,
		GoogleSearchIE(google),
		PhotobucketIE(),
		yahoo,
		YahooSearchIE(yahoo),
		DepositFilesIE(),
		FacebookIE(),
		BlipTVIE(),
		VimeoIE(),
		MyVideoIE(),
		ComedyCentralIE(),
		EscapistIE(),
		CollegeHumorIE(),
		XVideosIE(),
		SoundcloudIE(),
		InfoQIE(),
		MixcloudIE(),
	]
	# The generic fallback must stay last so it only sees URLs no other
	# extractor claimed.
	extractors.append(GenericIE())
	return extractors
4145
4146 def _real_main():
4147         parser, opts, args = parseOpts()
4148
4149         # Open appropriate CookieJar
4150         if opts.cookiefile is None:
4151                 jar = cookielib.CookieJar()
4152         else:
4153                 try:
4154                         jar = cookielib.MozillaCookieJar(opts.cookiefile)
4155                         if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4156                                 jar.load()
4157                 except (IOError, OSError), err:
4158                         sys.exit(u'ERROR: unable to open cookie file')
4159
4160         # Dump user agent
4161         if opts.dump_user_agent:
4162                 print std_headers['User-Agent']
4163                 sys.exit(0)
4164
4165         # Batch file verification
4166         batchurls = []
4167         if opts.batchfile is not None:
4168                 try:
4169                         if opts.batchfile == '-':
4170                                 batchfd = sys.stdin
4171                         else:
4172                                 batchfd = open(opts.batchfile, 'r')
4173                         batchurls = batchfd.readlines()
4174                         batchurls = [x.strip() for x in batchurls]
4175                         batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4176                 except IOError:
4177                         sys.exit(u'ERROR: batch file could not be read')
4178         all_urls = batchurls + args
4179
4180         # General configuration
4181         cookie_processor = urllib2.HTTPCookieProcessor(jar)
4182         opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
4183         urllib2.install_opener(opener)
4184         socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
4185
4186         extractors = gen_extractors()
4187
4188         if opts.list_extractors:
4189                 for ie in extractors:
4190                         print(ie.IE_NAME)
4191                         matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
4192                         all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4193                         for mu in matchedUrls:
4194                                 print(u'  ' + mu)
4195                 sys.exit(0)
4196
4197         # Conflicting, missing and erroneous options
4198         if opts.usenetrc and (opts.username is not None or opts.password is not None):
4199                 parser.error(u'using .netrc conflicts with giving username/password')
4200         if opts.password is not None and opts.username is None:
4201                 parser.error(u'account username missing')
4202         if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4203                 parser.error(u'using output template conflicts with using title, literal title or auto number')
4204         if opts.usetitle and opts.useliteral:
4205                 parser.error(u'using title conflicts with using literal title')
4206         if opts.username is not None and opts.password is None:
4207                 opts.password = getpass.getpass(u'Type account password and press return:')
4208         if opts.ratelimit is not None:
4209                 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4210                 if numeric_limit is None:
4211                         parser.error(u'invalid rate limit specified')
4212                 opts.ratelimit = numeric_limit
4213         if opts.retries is not None:
4214                 try:
4215                         opts.retries = long(opts.retries)
4216                 except (TypeError, ValueError), err:
4217                         parser.error(u'invalid retry count specified')
4218         try:
4219                 opts.playliststart = int(opts.playliststart)
4220                 if opts.playliststart <= 0:
4221                         raise ValueError(u'Playlist start must be positive')
4222         except (TypeError, ValueError), err:
4223                 parser.error(u'invalid playlist start number specified')
4224         try:
4225                 opts.playlistend = int(opts.playlistend)
4226                 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
4227                         raise ValueError(u'Playlist end must be greater than playlist start')
4228         except (TypeError, ValueError), err:
4229                 parser.error(u'invalid playlist end number specified')
4230         if opts.extractaudio:
4231                 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis']:
4232                         parser.error(u'invalid audio format specified')
4233
4234         # File downloader
4235         fd = FileDownloader({
4236                 'usenetrc': opts.usenetrc,
4237                 'username': opts.username,
4238                 'password': opts.password,
4239                 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4240                 'forceurl': opts.geturl,
4241                 'forcetitle': opts.gettitle,
4242                 'forcethumbnail': opts.getthumbnail,
4243                 'forcedescription': opts.getdescription,
4244                 'forcefilename': opts.getfilename,
4245                 'forceformat': opts.getformat,
4246                 'simulate': opts.simulate,
4247                 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4248                 'format': opts.format,
4249                 'format_limit': opts.format_limit,
4250                 'listformats': opts.listformats,
4251                 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
4252                         or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4253                         or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
4254                         or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
4255                         or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4256                         or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4257                         or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
4258                         or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
4259                         or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
4260                         or u'%(id)s.%(ext)s'),
4261                 'ignoreerrors': opts.ignoreerrors,
4262                 'ratelimit': opts.ratelimit,
4263                 'nooverwrites': opts.nooverwrites,
4264                 'retries': opts.retries,
4265                 'continuedl': opts.continue_dl,
4266                 'noprogress': opts.noprogress,
4267                 'playliststart': opts.playliststart,
4268                 'playlistend': opts.playlistend,
4269                 'logtostderr': opts.outtmpl == '-',
4270                 'consoletitle': opts.consoletitle,
4271                 'nopart': opts.nopart,
4272                 'updatetime': opts.updatetime,
4273                 'writedescription': opts.writedescription,
4274                 'writeinfojson': opts.writeinfojson,
4275                 'matchtitle': opts.matchtitle,
4276                 'rejecttitle': opts.rejecttitle,
4277                 'max_downloads': int(opts.max_downloads),
4278                 })
4279         for extractor in extractors:
4280                 fd.add_info_extractor(extractor)
4281
4282         # PostProcessors
4283         if opts.extractaudio:
4284                 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
4285
4286         # Update version
4287         if opts.update_self:
4288                 updateSelf(fd, sys.argv[0])
4289
4290         # Maybe do nothing
4291         if len(all_urls) < 1:
4292                 if not opts.update_self:
4293                         parser.error(u'you must provide at least one URL')
4294                 else:
4295                         sys.exit()
4296         retcode = fd.download(all_urls)
4297
4298         # Dump cookie jar if requested
4299         if opts.cookiefile is not None:
4300                 try:
4301                         jar.save()
4302                 except (IOError, OSError), err:
4303                         sys.exit(u'ERROR: unable to save cookie jar')
4304
4305         sys.exit(retcode)
4306
def main():
	"""Run _real_main(), translating known errors into a process exit status.

	DownloadError exits with status 1 (the downloader has already printed
	its own diagnostics); SameFileError and KeyboardInterrupt exit with an
	error message passed to sys.exit().
	"""
	try:
		_real_main()
	except DownloadError:
		# Error details were already reported by the downloader itself.
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')
4316
# Script entry point: run only when executed directly, not when imported.
if __name__ == '__main__':
	main()
4319
4320 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: