Credit shizeeg for Mixcloud IE
[youtube-dl.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 __author__  = (
5         'Ricardo Garcia Gonzalez',
6         'Danny Colligan',
7         'Benjamin Johnson',
8         'Vasyl\' Vavrychuk',
9         'Witold Baryluk',
10         'Paweł Paprota',
11         'Gergely Imreh',
12         'Rogério Brito',
13         'Philipp Hagemeister',
14         'Sören Schulze',
15         'Kevin Ngo',
16         'Ori Avtalion',
17         'shizeeg',
18         )
19
20 __license__ = 'Public Domain'
21 __version__ = '2011.11.23'
22
23 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
24
25 import cookielib
26 import datetime
27 import gzip
28 import htmlentitydefs
29 import HTMLParser
30 import httplib
31 import locale
32 import math
33 import netrc
34 import os
35 import os.path
36 import re
37 import socket
38 import string
39 import subprocess
40 import sys
41 import time
42 import urllib
43 import urllib2
44 import warnings
45 import zlib
46
47 if os.name == 'nt':
48         import ctypes
49
50 try:
51         import email.utils
52 except ImportError: # Python 2.4
53         import email.Utils
54 try:
55         import cStringIO as StringIO
56 except ImportError:
57         import StringIO
58
59 # parse_qs was moved from the cgi module to the urlparse module recently.
60 try:
61         from urlparse import parse_qs
62 except ImportError:
63         from cgi import parse_qs
64
65 try:
66         import lxml.etree
67 except ImportError:
68         pass # Handled below
69
70 try:
71         import xml.etree.ElementTree
72 except ImportError: # Python<2.5: Not officially supported, but let it slip
73         warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
74
# Default HTTP headers applied to every outgoing request (enforced by
# YoutubeDLHandler.http_request below). The User-Agent mimics a desktop
# Firefox so sites serve their regular pages.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-us,en;q=0.5',
}
82
# JSON support: use the stdlib module when available (Python >= 2.6), else
# fall back to a bundled copy of trivialjson — a minimal recursive-descent
# parser exposing the same json.loads() entry point.
try:
        import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
        import re
        class json(object):
                @staticmethod
                def loads(s):
                        # Decode once up front; all parsing below works on unicode.
                        s = s.decode('UTF-8')
                        def raiseError(msg, i):
                                # Uniform error format: message, position, and remaining input.
                                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
                        def skipSpace(i, expectMore=True):
                                # Advance past JSON whitespace; optionally require more input after it.
                                while i < len(s) and s[i] in ' \t\r\n':
                                        i += 1
                                if expectMore:
                                        if i >= len(s):
                                                raiseError('Premature end', i)
                                return i
                        def decodeEscape(match):
                                # re.sub callback: turn one backslash escape into its character.
                                esc = match.group(1)
                                _STATIC = {
                                        '"': '"',
                                        '\\': '\\',
                                        '/': '/',
                                        'b': unichr(0x8),
                                        'f': unichr(0xc),
                                        'n': '\n',
                                        'r': '\r',
                                        't': '\t',
                                }
                                if esc in _STATIC:
                                        return _STATIC[esc]
                                if esc[0] == 'u':
                                        if len(esc) == 1+4:
                                                return unichr(int(esc[1:5], 16))
                                        # \uXXXX\uXXXX surrogate pair combined into one code point.
                                        if len(esc) == 5+6 and esc[5:7] == '\\u':
                                                hi = int(esc[1:5], 16)
                                                low = int(esc[7:11], 16)
                                                return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                                raise ValueError('Unknown escape ' + str(esc))
                        def parseString(i):
                                # i points at the opening quote; returns (index past closing quote, text).
                                i += 1
                                e = i
                                while True:
                                        # Find the closing quote, skipping quotes preceded by an
                                        # odd number of backslashes (i.e. escaped quotes).
                                        e = s.index('"', e)
                                        bslashes = 0
                                        while s[e-bslashes-1] == '\\':
                                                bslashes += 1
                                        if bslashes % 2 == 1:
                                                e += 1
                                                continue
                                        break
                                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                                stri = rexp.sub(decodeEscape, s[i:e])
                                return (e+1,stri)
                        def parseObj(i):
                                # i points at '{'; returns (index past '}', dict).
                                i += 1
                                res = {}
                                i = skipSpace(i)
                                if s[i] == '}': # Empty dictionary
                                        return (i+1,res)
                                while True:
                                        if s[i] != '"':
                                                raiseError('Expected a string object key', i)
                                        i,key = parseString(i)
                                        i = skipSpace(i)
                                        if i >= len(s) or s[i] != ':':
                                                raiseError('Expected a colon', i)
                                        i,val = parse(i+1)
                                        res[key] = val
                                        i = skipSpace(i)
                                        if s[i] == '}':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected comma or closing curly brace', i)
                                        i = skipSpace(i+1)
                        def parseArray(i):
                                # i points at '['; returns (index past ']', list).
                                res = []
                                i = skipSpace(i+1)
                                if s[i] == ']': # Empty array
                                        return (i+1,res)
                                while True:
                                        i,val = parse(i)
                                        res.append(val)
                                        i = skipSpace(i) # Raise exception if premature end
                                        if s[i] == ']':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected a comma or closing bracket', i)
                                        i = skipSpace(i+1)
                        def parseDiscrete(i):
                                # Literals: true / false / null.
                                for k,v in {'true': True, 'false': False, 'null': None}.items():
                                        if s.startswith(k, i):
                                                return (i+len(k), v)
                                raiseError('Not a boolean (or null)', i)
                        def parseNumber(i):
                                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                                if mobj is None:
                                        raiseError('Not a number', i)
                                nums = mobj.group(1)
                                # Any fraction or exponent marker means a float; otherwise an int.
                                if '.' in nums or 'e' in nums or 'E' in nums:
                                        return (i+len(nums), float(nums))
                                return (i+len(nums), int(nums))
                        # Dispatch on the first non-space character; numbers are the default.
                        CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
                        def parse(i):
                                i = skipSpace(i)
                                i,res = CHARMAP.get(s[i], parseNumber)(i)
                                i = skipSpace(i, False)
                                return (i,res)
                        i,res = parse(0)
                        if i < len(s):
                                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
                        return res
195
def preferredencoding():
        """Get preferred encoding.

        Returns the best encoding scheme for the system, based on
        locale.getpreferredencoding() and some further tweaks.
        """
        try:
                pref = locale.getpreferredencoding()
                # Verify the reported codec actually exists and can encode text.
                u'TEST'.encode(pref)
        except Exception:
                # Unknown or bogus locale: fall back to a safe default.
                # (Was a bare except; narrowed so it no longer swallows
                # KeyboardInterrupt/SystemExit.)
                pref = 'UTF-8'
        return pref
211
212
213 def htmlentity_transform(matchobj):
214         """Transforms an HTML entity to a Unicode character.
215
216         This function receives a match object and is intended to be used with
217         the re.sub() function.
218         """
219         entity = matchobj.group(1)
220
221         # Known non-numeric HTML entity
222         if entity in htmlentitydefs.name2codepoint:
223                 return unichr(htmlentitydefs.name2codepoint[entity])
224
225         # Unicode character
226         mobj = re.match(ur'(?u)#(x?\d+)', entity)
227         if mobj is not None:
228                 numstr = mobj.group(1)
229                 if numstr.startswith(u'x'):
230                         base = 16
231                         numstr = u'0%s' % numstr
232                 else:
233                         base = 10
234                 return unichr(long(numstr, base))
235
236         # Unknown entity in name, return its literal representation
237         return (u'&%s;' % entity)
238
239
240 def sanitize_title(utitle):
241         """Sanitizes a video title so it could be used as part of a filename."""
242         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
243         return utitle.replace(unicode(os.sep), u'%')
244
245
246 def sanitize_open(filename, open_mode):
247         """Try to open the given filename, and slightly tweak it if this fails.
248
249         Attempts to open the given filename. If this fails, it tries to change
250         the filename slightly, step by step, until it's either able to open it
251         or it fails and raises a final exception, like the standard open()
252         function.
253
254         It returns the tuple (stream, definitive_file_name).
255         """
256         try:
257                 if filename == u'-':
258                         if sys.platform == 'win32':
259                                 import msvcrt
260                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
261                         return (sys.stdout, filename)
262                 stream = open(filename, open_mode)
263                 return (stream, filename)
264         except (IOError, OSError), err:
265                 # In case of error, try to remove win32 forbidden chars
266                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
267
268                 # An exception here should be caught in the caller
269                 stream = open(filename, open_mode)
270                 return (stream, filename)
271
272
def timeconvert(timestr):
        """Convert RFC 2822 defined time string into system timestamp"""
        timetuple = email.utils.parsedate_tz(timestr)
        if timetuple is None:
                # Unparseable date string.
                return None
        return email.utils.mktime_tz(timetuple)
280
281 def _simplify_title(title):
282         expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
283         return expr.sub(u'_', title).strip(u'_')
284
class DownloadError(Exception):
        """Download Error exception.

        Raised by FileDownloader objects that are not configured to continue
        on errors; carries the corresponding error message.
        """
        pass
293
294
class SameFileError(Exception):
        """Same File exception.

        Raised by FileDownloader objects when they detect that several videos
        would have to be downloaded to the same file on disk.
        """
        pass
302
303
class PostProcessingError(Exception):
        """Post Processing exception.

        Raised by a PostProcessor's .run() method to signal that the
        postprocessing task failed.
        """
        pass
311
312
class UnavailableVideoError(Exception):
        """Unavailable Format exception.

        Raised when a video is requested in a format that is not available
        for that video.
        """
        pass
320
321
class ContentTooShortError(Exception):
        """Content Too Short exception.

        Raised by FileDownloader objects when a downloaded file ends up
        smaller than the size the server announced, which usually means the
        connection was interrupted.
        """
        # Both counters are in bytes.
        downloaded = None
        expected = None

        def __init__(self, downloaded, expected):
                self.downloaded = downloaded
                self.expected = expected
336
337
338 class YoutubeDLHandler(urllib2.HTTPHandler):
339         """Handler for HTTP requests and responses.
340
341         This class, when installed with an OpenerDirector, automatically adds
342         the standard headers to every HTTP request and handles gzipped and
343         deflated responses from web servers. If compression is to be avoided in
344         a particular request, the original request in the program code only has
345         to include the HTTP header "Youtubedl-No-Compression", which will be
346         removed before making the real request.
347
348         Part of this code was copied from:
349
350         http://techknack.net/python-urllib2-handlers/
351
352         Andrew Rowls, the author of that code, agreed to release it to the
353         public domain.
354         """
355
356         @staticmethod
357         def deflate(data):
358                 try:
359                         return zlib.decompress(data, -zlib.MAX_WBITS)
360                 except zlib.error:
361                         return zlib.decompress(data)
362
363         @staticmethod
364         def addinfourl_wrapper(stream, headers, url, code):
365                 if hasattr(urllib2.addinfourl, 'getcode'):
366                         return urllib2.addinfourl(stream, headers, url, code)
367                 ret = urllib2.addinfourl(stream, headers, url)
368                 ret.code = code
369                 return ret
370
371         def http_request(self, req):
372                 for h in std_headers:
373                         if h in req.headers:
374                                 del req.headers[h]
375                         req.add_header(h, std_headers[h])
376                 if 'Youtubedl-no-compression' in req.headers:
377                         if 'Accept-encoding' in req.headers:
378                                 del req.headers['Accept-encoding']
379                         del req.headers['Youtubedl-no-compression']
380                 return req
381
382         def http_response(self, req, resp):
383                 old_resp = resp
384                 # gzip
385                 if resp.headers.get('Content-encoding', '') == 'gzip':
386                         gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
387                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
388                         resp.msg = old_resp.msg
389                 # deflate
390                 if resp.headers.get('Content-encoding', '') == 'deflate':
391                         gz = StringIO.StringIO(self.deflate(resp.read()))
392                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
393                         resp.msg = old_resp.msg
394                 return resp
395
396
397 class FileDownloader(object):
398         """File Downloader class.
399
400         File downloader objects are the ones responsible of downloading the
401         actual video file and writing it to disk if the user has requested
402         it, among some other tasks. In most cases there should be one per
403         program. As, given a video URL, the downloader doesn't know how to
404         extract all the needed information, task that InfoExtractors do, it
405         has to pass the URL to one of them.
406
407         For this, file downloader objects have a method that allows
408         InfoExtractors to be registered in a given order. When it is passed
409         a URL, the file downloader handles it to the first InfoExtractor it
410         finds that reports being able to handle it. The InfoExtractor extracts
411         all the information about the video or videos the URL refers to, and
412         asks the FileDownloader to process the video information, possibly
413         downloading the video.
414
415         File downloaders accept a lot of parameters. In order not to saturate
416         the object constructor with arguments, it receives a dictionary of
417         options instead. These options are available through the params
418         attribute for the InfoExtractors to use. The FileDownloader also
419         registers itself as the downloader in charge for the InfoExtractors
420         that are added to it, so this is a "mutual registration".
421
422         Available options:
423
424         username:         Username for authentication purposes.
425         password:         Password for authentication purposes.
426         usenetrc:         Use netrc for authentication instead.
427         quiet:            Do not print messages to stdout.
428         forceurl:         Force printing final URL.
429         forcetitle:       Force printing title.
430         forcethumbnail:   Force printing thumbnail URL.
431         forcedescription: Force printing description.
432         forcefilename:    Force printing final filename.
433         simulate:         Do not download the video files.
434         format:           Video format code.
435         format_limit:     Highest quality format to try.
436         outtmpl:          Template for output names.
437         ignoreerrors:     Do not stop on download errors.
438         ratelimit:        Download speed limit, in bytes/sec.
439         nooverwrites:     Prevent overwriting files.
440         retries:          Number of times to retry for HTTP error 5xx
441         continuedl:       Try to continue downloads if possible.
442         noprogress:       Do not print the progress bar.
443         playliststart:    Playlist item to start at.
444         playlistend:      Playlist item to end at.
445         matchtitle:       Download only matching titles.
446         rejecttitle:      Reject downloads for matching titles.
447         logtostderr:      Log messages to stderr instead of stdout.
448         consoletitle:     Display progress in console window's titlebar.
449         nopart:           Do not use temporary .part files.
450         updatetime:       Use the Last-modified header to set output file timestamps.
451         writedescription: Write the video description to a .description file
452         writeinfojson:    Write the video description to a .info.json file
453         """
454
455         params = None
456         _ies = []
457         _pps = []
458         _download_retcode = None
459         _num_downloads = None
460         _screen_file = None
461
462         def __init__(self, params):
463                 """Create a FileDownloader object with the given options."""
464                 self._ies = []
465                 self._pps = []
466                 self._download_retcode = 0
467                 self._num_downloads = 0
468                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
469                 self.params = params
470
471         @staticmethod
472         def format_bytes(bytes):
473                 if bytes is None:
474                         return 'N/A'
475                 if type(bytes) is str:
476                         bytes = float(bytes)
477                 if bytes == 0.0:
478                         exponent = 0
479                 else:
480                         exponent = long(math.log(bytes, 1024.0))
481                 suffix = 'bkMGTPEZY'[exponent]
482                 converted = float(bytes) / float(1024 ** exponent)
483                 return '%.2f%s' % (converted, suffix)
484
485         @staticmethod
486         def calc_percent(byte_counter, data_len):
487                 if data_len is None:
488                         return '---.-%'
489                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
490
491         @staticmethod
492         def calc_eta(start, now, total, current):
493                 if total is None:
494                         return '--:--'
495                 dif = now - start
496                 if current == 0 or dif < 0.001: # One millisecond
497                         return '--:--'
498                 rate = float(current) / dif
499                 eta = long((float(total) - float(current)) / rate)
500                 (eta_mins, eta_secs) = divmod(eta, 60)
501                 if eta_mins > 99:
502                         return '--:--'
503                 return '%02d:%02d' % (eta_mins, eta_secs)
504
505         @staticmethod
506         def calc_speed(start, now, bytes):
507                 dif = now - start
508                 if bytes == 0 or dif < 0.001: # One millisecond
509                         return '%10s' % '---b/s'
510                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
511
512         @staticmethod
513         def best_block_size(elapsed_time, bytes):
514                 new_min = max(bytes / 2.0, 1.0)
515                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
516                 if elapsed_time < 0.001:
517                         return long(new_max)
518                 rate = bytes / elapsed_time
519                 if rate > new_max:
520                         return long(new_max)
521                 if rate < new_min:
522                         return long(new_min)
523                 return long(rate)
524
525         @staticmethod
526         def parse_bytes(bytestr):
527                 """Parse a string indicating a byte quantity into a long integer."""
528                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
529                 if matchobj is None:
530                         return None
531                 number = float(matchobj.group(1))
532                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
533                 return long(round(number * multiplier))
534
        def add_info_extractor(self, ie):
                """Add an InfoExtractor object to the end of the list."""
                self._ies.append(ie)
                # Mutual registration: the IE reports its results back through us.
                ie.set_downloader(self)
539
        def add_post_processor(self, pp):
                """Add a PostProcessor object to the end of the chain."""
                self._pps.append(pp)
                # Mutual registration, as with add_info_extractor().
                pp.set_downloader(self)
544
        def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
                """Print message to stdout if not in quiet mode.

                skip_eol suppresses the trailing newline (used by progress
                lines); ignore_encoding_errors swallows UnicodeEncodeError
                instead of propagating it.
                """
                try:
                        if not self.params.get('quiet', False):
                                terminator = [u'\n', u''][skip_eol]
                                # The trailing comma stops print from adding its own
                                # newline; the terminator above controls it instead.
                                print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
                        self._screen_file.flush()
                except (UnicodeEncodeError), err:
                        if not ignore_encoding_errors:
                                raise
555
        def to_stderr(self, message):
                """Print message to stderr (always, regardless of quiet mode)."""
                print >>sys.stderr, message.encode(preferredencoding())
559
        def to_cons_title(self, message):
                """Set console/terminal window title to message."""
                if not self.params.get('consoletitle', False):
                        return
                if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
                        # c_wchar_p() might not be necessary if `message` is
                        # already of type unicode()
                        ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
                elif 'TERM' in os.environ:
                        # xterm-compatible OSC escape: ESC ] 0 ; <title> BEL
                        sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
570
571         def fixed_template(self):
572                 """Checks if the output template is fixed."""
573                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
574
575         def trouble(self, message=None):
576                 """Determine action to take when a download problem appears.
577
578                 Depending on if the downloader has been configured to ignore
579                 download errors or not, this method may throw an exception or
580                 not when errors are found, after printing the message.
581                 """
582                 if message is not None:
583                         self.to_stderr(message)
584                 if not self.params.get('ignoreerrors', False):
585                         raise DownloadError(message)
586                 self._download_retcode = 1
587
588         def slow_down(self, start_time, byte_counter):
589                 """Sleep if the download speed is over the rate limit."""
590                 rate_limit = self.params.get('ratelimit', None)
591                 if rate_limit is None or byte_counter == 0:
592                         return
593                 now = time.time()
594                 elapsed = now - start_time
595                 if elapsed <= 0.0:
596                         return
597                 speed = float(byte_counter) / elapsed
598                 if speed > rate_limit:
599                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
600
601         def temp_name(self, filename):
602                 """Returns a temporary filename for the given filename."""
603                 if self.params.get('nopart', False) or filename == u'-' or \
604                                 (os.path.exists(filename) and not os.path.isfile(filename)):
605                         return filename
606                 return filename + u'.part'
607
608         def undo_temp_name(self, filename):
609                 if filename.endswith(u'.part'):
610                         return filename[:-len(u'.part')]
611                 return filename
612
613         def try_rename(self, old_filename, new_filename):
614                 try:
615                         if old_filename == new_filename:
616                                 return
617                         os.rename(old_filename, new_filename)
618                 except (IOError, OSError), err:
619                         self.trouble(u'ERROR: unable to rename file')
620
621         def try_utime(self, filename, last_modified_hdr):
622                 """Try to set the last-modified time of the given file."""
623                 if last_modified_hdr is None:
624                         return
625                 if not os.path.isfile(filename):
626                         return
627                 timestr = last_modified_hdr
628                 if timestr is None:
629                         return
630                 filetime = timeconvert(timestr)
631                 if filetime is None:
632                         return filetime
633                 try:
634                         os.utime(filename, (time.time(), filetime))
635                 except:
636                         pass
637                 return filetime
638
        def report_writedescription(self, descfn):
                """Report that the video description is being written to *descfn*."""
                self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
642
        def report_writeinfojson(self, infofn):
                """Report that the video metadata is being written to *infofn* (.info.json)."""
                self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
646
        def report_destination(self, filename):
                """Report destination filename."""
                # ignore_encoding_errors: a filename the console cannot encode
                # must not abort the download.
                self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
650
        def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
                """Report download progress."""
                if self.params.get('noprogress', False):
                        return
                # \r rewrites the current line in place; skip_eol keeps the
                # cursor on it for the next update.
                self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
                                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
                self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
                                (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
659
        def report_resuming_byte(self, resume_len):
                """Report attempt to resume the download at byte offset *resume_len*."""
                self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
663
        def report_retry(self, count, retries):
                """Report retry *count* of *retries* after an HTTP 5xx server error."""
                self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
667
668         def report_file_already_downloaded(self, file_name):
669                 """Report file has already been fully downloaded."""
670                 try:
671                         self.to_screen(u'[download] %s has already been downloaded' % file_name)
672                 except (UnicodeEncodeError), err:
673                         self.to_screen(u'[download] The file has already been downloaded')
674
        def report_unable_to_resume(self):
                """Report that the download could not be resumed at an offset."""
                self.to_screen(u'[download] Unable to resume')
678
679         def report_finish(self):
680                 """Report download finished."""
681                 if self.params.get('noprogress', False):
682                         self.to_screen(u'[download] Download completed')
683                 else:
684                         self.to_screen(u'')
685
686         def increment_downloads(self):
687                 """Increment the ordinal that assigns a number to each file."""
688                 self._num_downloads += 1
689
690         def prepare_filename(self, info_dict):
691                 """Generate the output filename."""
692                 try:
693                         template_dict = dict(info_dict)
694                         template_dict['epoch'] = unicode(long(time.time()))
695                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
696                         filename = self.params['outtmpl'] % template_dict
697                         return filename
698                 except (ValueError, KeyError), err:
699                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
700                         return None
701
	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor.

		Handles forced printing, simulate mode, title match/reject
		filtering, sidecar files (.description / .info.json), the actual
		download and postprocessing, in that order.
		"""
		# May be None if the output template is broken; checked below,
		# after the forced printings (which do not need a filename).
		filename = self.prepare_filename(info_dict)
		
		# Forced printings
		if self.params.get('forcetitle', False):
			print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forceurl', False):
			print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
			print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcedescription', False) and 'description' in info_dict:
			print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcefilename', False) and filename is not None:
			print filename.encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forceformat', False):
			print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')

		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			return

		if filename is None:
			return

		# Title filtering: --match-title / --reject-title regexes are
		# applied case-insensitively against the encoded title.
		matchtitle=self.params.get('matchtitle',False)
		rejecttitle=self.params.get('rejecttitle',False)
		title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
		if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
			self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
			return
		if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
			self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
			return
			
		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists and will be skipped')
			return

		# Create the destination directory if needed.
		try:
			dn = os.path.dirname(filename)
			if dn != '' and not os.path.exists(dn):
				os.makedirs(dn)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to create directory ' + unicode(err))
			return

		# Optional sidecar: plain-text description next to the video file.
		if self.params.get('writedescription', False):
			try:
				descfn = filename + '.description'
				self.report_writedescription(descfn)
				descfile = open(descfn, 'wb')
				try:
					descfile.write(info_dict['description'].encode('utf-8'))
				finally:
					descfile.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write description file ' + descfn)
				return

		# Optional sidecar: JSON metadata dump.
		if self.params.get('writeinfojson', False):
			infofn = filename + '.info.json'
			self.report_writeinfojson(infofn)
			# Probe for a usable json module; on old interpreters the
			# name may be missing or lack dump().
			try:
				json.dump
			except (NameError,AttributeError):
				self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
				return
			try:
				infof = open(infofn, 'wb')
				try:
					# 'urlhandle' holds an open connection object and is
					# not JSON-serializable, so it is filtered out here.
					json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
					json.dump(json_info_dict, infof)
				finally:
					infof.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
				return

		if not self.params.get('skip_download', False):
			try:
				success = self._do_download(filename, info_dict)
			except (OSError, IOError), err:
				raise UnavailableVideoError
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self.trouble(u'ERROR: unable to download video data: %s' % str(err))
				return
			except (ContentTooShortError, ), err:
				self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
				return
	
			# Run postprocessors only on a successful download.
			if success:
				try:
					self.post_process(filename, info_dict)
				except (PostProcessingError), err:
					self.trouble(u'ERROR: postprocessing: %s' % str(err))
					return
799
800         def download(self, url_list):
801                 """Download a given list of URLs."""
802                 if len(url_list) > 1 and self.fixed_template():
803                         raise SameFileError(self.params['outtmpl'])
804
805                 for url in url_list:
806                         suitable_found = False
807                         for ie in self._ies:
808                                 # Go to next InfoExtractor if not suitable
809                                 if not ie.suitable(url):
810                                         continue
811
812                                 # Suitable InfoExtractor found
813                                 suitable_found = True
814
815                                 # Extract information from URL and process it
816                                 ie.extract(url)
817
818                                 # Suitable InfoExtractor had been found; go to next URL
819                                 break
820
821                         if not suitable_found:
822                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
823
824                 return self._download_retcode
825
826         def post_process(self, filename, ie_info):
827                 """Run the postprocessing chain on the given file."""
828                 info = dict(ie_info)
829                 info['filepath'] = filename
830                 for pp in self._pps:
831                         info = pp.run(info)
832                         if info is None:
833                                 break
834
	def _download_with_rtmpdump(self, filename, url, player_url):
		"""Download an RTMP stream by driving the external rtmpdump binary.

		Writes to a temporary filename first and renames on success.
		Returns True on success, False on failure.
		"""
		self.report_destination(filename)
		tmpfilename = self.temp_name(filename)

		# Check for rtmpdump first
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrumpted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		# The [[], [...]][bool] construct appends the optional arguments
		# only when the condition is true (bool indexes the 2-list).
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(tmpfilename)
			self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(tmpfilename)
			# No progress between resume attempts: give up to avoid looping forever.
			if prevsize == cursize and retval == 1:
				break
			 # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
			if prevsize == cursize and retval == 2 and cursize > 1024:
				self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
				retval = 0
				break
		if retval == 0:
			self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
			self.try_rename(tmpfilename, filename)
			return True
		else:
			self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
			return False
871
872         def _do_download(self, filename, info_dict):
873                 url = info_dict['url']
874                 player_url = info_dict.get('player_url', None)
875
876                 # Check file already present
877                 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
878                         self.report_file_already_downloaded(filename)
879                         return True
880
881                 # Attempt to download using rtmpdump
882                 if url.startswith('rtmp'):
883                         return self._download_with_rtmpdump(filename, url, player_url)
884
885                 tmpfilename = self.temp_name(filename)
886                 stream = None
887
888                 # Do not include the Accept-Encoding header
889                 headers = {'Youtubedl-no-compression': 'True'}
890                 basic_request = urllib2.Request(url, None, headers)
891                 request = urllib2.Request(url, None, headers)
892
893                 # Establish possible resume length
894                 if os.path.isfile(tmpfilename):
895                         resume_len = os.path.getsize(tmpfilename)
896                 else:
897                         resume_len = 0
898
899                 open_mode = 'wb'
900                 if resume_len != 0:
901                         if self.params.get('continuedl', False):
902                                 self.report_resuming_byte(resume_len)
903                                 request.add_header('Range','bytes=%d-' % resume_len)
904                                 open_mode = 'ab'
905                         else:
906                                 resume_len = 0
907
908                 count = 0
909                 retries = self.params.get('retries', 0)
910                 while count <= retries:
911                         # Establish connection
912                         try:
913                                 if count == 0 and 'urlhandle' in info_dict:
914                                         data = info_dict['urlhandle']
915                                 data = urllib2.urlopen(request)
916                                 break
917                         except (urllib2.HTTPError, ), err:
918                                 if (err.code < 500 or err.code >= 600) and err.code != 416:
919                                         # Unexpected HTTP error
920                                         raise
921                                 elif err.code == 416:
922                                         # Unable to resume (requested range not satisfiable)
923                                         try:
924                                                 # Open the connection again without the range header
925                                                 data = urllib2.urlopen(basic_request)
926                                                 content_length = data.info()['Content-Length']
927                                         except (urllib2.HTTPError, ), err:
928                                                 if err.code < 500 or err.code >= 600:
929                                                         raise
930                                         else:
931                                                 # Examine the reported length
932                                                 if (content_length is not None and
933                                                                 (resume_len - 100 < long(content_length) < resume_len + 100)):
934                                                         # The file had already been fully downloaded.
935                                                         # Explanation to the above condition: in issue #175 it was revealed that
936                                                         # YouTube sometimes adds or removes a few bytes from the end of the file,
937                                                         # changing the file size slightly and causing problems for some users. So
938                                                         # I decided to implement a suggested change and consider the file
939                                                         # completely downloaded if the file size differs less than 100 bytes from
940                                                         # the one in the hard drive.
941                                                         self.report_file_already_downloaded(filename)
942                                                         self.try_rename(tmpfilename, filename)
943                                                         return True
944                                                 else:
945                                                         # The length does not match, we start the download over
946                                                         self.report_unable_to_resume()
947                                                         open_mode = 'wb'
948                                                         break
949                         # Retry
950                         count += 1
951                         if count <= retries:
952                                 self.report_retry(count, retries)
953
954                 if count > retries:
955                         self.trouble(u'ERROR: giving up after %s retries' % retries)
956                         return False
957
958                 data_len = data.info().get('Content-length', None)
959                 if data_len is not None:
960                         data_len = long(data_len) + resume_len
961                 data_len_str = self.format_bytes(data_len)
962                 byte_counter = 0 + resume_len
963                 block_size = 1024
964                 start = time.time()
965                 while True:
966                         # Download and write
967                         before = time.time()
968                         data_block = data.read(block_size)
969                         after = time.time()
970                         if len(data_block) == 0:
971                                 break
972                         byte_counter += len(data_block)
973
974                         # Open file just in time
975                         if stream is None:
976                                 try:
977                                         (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
978                                         assert stream is not None
979                                         filename = self.undo_temp_name(tmpfilename)
980                                         self.report_destination(filename)
981                                 except (OSError, IOError), err:
982                                         self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
983                                         return False
984                         try:
985                                 stream.write(data_block)
986                         except (IOError, OSError), err:
987                                 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
988                                 return False
989                         block_size = self.best_block_size(after - before, len(data_block))
990
991                         # Progress message
992                         speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
993                         if data_len is None:
994                                 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
995                         else:
996                                 percent_str = self.calc_percent(byte_counter, data_len)
997                                 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
998                                 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
999
1000                         # Apply rate limit
1001                         self.slow_down(start, byte_counter - resume_len)
1002
1003                 if stream is None:
1004                         self.trouble(u'\nERROR: Did not get any data blocks')
1005                         return False
1006                 stream.close()
1007                 self.report_finish()
1008                 if data_len is not None and byte_counter != data_len:
1009                         raise ContentTooShortError(byte_counter, long(data_len))
1010                 self.try_rename(tmpfilename, filename)
1011
1012                 # Update file modification time
1013                 if self.params.get('updatetime', True):
1014                         info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1015
1016                 return True
1017
1018
class InfoExtractor(object):
	"""Base class for all site-specific information extractors.

	An information extractor takes a URL and, for each video the URL
	refers to, produces a dictionary of metadata (real media URL, title
	and so on) which is handed to the FileDownloader.  The downloader
	then decides what to do with that information (download the media,
	just print it, ...).  Dictionaries produced by an extractor must
	include the following fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	The following fields are optional.  Their primary purpose is to
	allow youtube-dl to serve as the backend of a video search function,
	such as the one in youtube2mp3; they are only consulted when the
	corresponding forced-printing options are used:

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Subclasses should override _real_initialize() and _real_extract()
	and define a _VALID_URL regular expression.  They should usually
	also be registered in the list of extractors.
	"""

	# Class-level defaults; instances shadow them as needed.
	_ready = False
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Return True when this extractor can handle the given URL."""
		return re.match(self._VALID_URL, url) is not None

	def initialize(self):
		"""Perform one-time setup (authentication, etc.) on first use."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Initialize if necessary, then run the real extraction on url."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Attach the FileDownloader this extractor reports to."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
1087
1088
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Accepts plain watch URLs, youtu.be short links, /v/, /embed/ and /e/
	# paths and -nocookie domains; group 2 captures the video id.
	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
	# Visited in _real_initialize() to force the site language to English.
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	# Machine name looked up in ~/.netrc for stored credentials.
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
	# Container extension per format code; 'flv' is the fallback used by
	# _print_formats for codes missing from this map.
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'44': 'webm',
		'45': 'webm',
	}
	# Human-readable dimensions per format code, displayed by _print_formats.
	_video_dimensions = {
		'5': '240x400',
		'6': '???',
		'13': '???',
		'17': '144x176',
		'18': '360x640',
		'22': '720x1280',
		'34': '360x640',
		'35': '480x854',
		'37': '1080x1920',
		'38': '3072x4096',
		'43': '360x640',
		'44': '480x854',
		'45': '720x1280',
	}
	IE_NAME = u'youtube'
1126
1127         def report_lang(self):
1128                 """Report attempt to set language."""
1129                 self._downloader.to_screen(u'[youtube] Setting language')
1130
1131         def report_login(self):
1132                 """Report attempt to log in."""
1133                 self._downloader.to_screen(u'[youtube] Logging in')
1134
1135         def report_age_confirmation(self):
1136                 """Report attempt to confirm age."""
1137                 self._downloader.to_screen(u'[youtube] Confirming age')
1138
1139         def report_video_webpage_download(self, video_id):
1140                 """Report attempt to download video webpage."""
1141                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1142
1143         def report_video_info_webpage_download(self, video_id):
1144                 """Report attempt to download video info webpage."""
1145                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1146
1147         def report_information_extraction(self, video_id):
1148                 """Report attempt to extract video information."""
1149                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1150
1151         def report_unavailable_format(self, video_id, format):
1152                 """Report extracted video URL."""
1153                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1154
1155         def report_rtmp_download(self):
1156                 """Indicate the download will use the RTMP protocol."""
1157                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1158
1159         def _print_formats(self, formats):
1160                 print 'Available formats:'
1161                 for x in formats:
1162                         print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1163
	def _real_initialize(self):
		"""One-time setup: set the site language and, when credentials are
		available, log in and confirm age.

		Credentials come from the --username/--password options or, with
		--netrc, from the 'youtube' machine entry in ~/.netrc.  All
		failures are reported as warnings (or an error for the age
		confirmation) rather than raised.
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language: visiting _LANG_URL forces English so that scraped
		# pages have a predictable layout.  The response body is discarded.
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':         '/',
				'action_login': 'Log In',
				'username':     username,
				'password':     password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the response still contains the login form, the
			# credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':             '/',
				'action_confirm':       'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return
1232
1233         def _real_extract(self, url):
1234                 # Extract video id from URL
1235                 mobj = re.match(self._VALID_URL, url)
1236                 if mobj is None:
1237                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1238                         return
1239                 video_id = mobj.group(2)
1240
1241                 # Get video webpage
1242                 self.report_video_webpage_download(video_id)
1243                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1244                 try:
1245                         video_webpage = urllib2.urlopen(request).read()
1246                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1247                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1248                         return
1249
1250                 # Attempt to extract SWF player URL
1251                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1252                 if mobj is not None:
1253                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1254                 else:
1255                         player_url = None
1256
1257                 # Get video info
1258                 self.report_video_info_webpage_download(video_id)
1259                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1260                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1261                                         % (video_id, el_type))
1262                         request = urllib2.Request(video_info_url)
1263                         try:
1264                                 video_info_webpage = urllib2.urlopen(request).read()
1265                                 video_info = parse_qs(video_info_webpage)
1266                                 if 'token' in video_info:
1267                                         break
1268                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1269                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1270                                 return
1271                 if 'token' not in video_info:
1272                         if 'reason' in video_info:
1273                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1274                         else:
1275                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1276                         return
1277
1278                 # Start extracting information
1279                 self.report_information_extraction(video_id)
1280
1281                 # uploader
1282                 if 'author' not in video_info:
1283                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1284                         return
1285                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1286
1287                 # title
1288                 if 'title' not in video_info:
1289                         self._downloader.trouble(u'ERROR: unable to extract video title')
1290                         return
1291                 video_title = urllib.unquote_plus(video_info['title'][0])
1292                 video_title = video_title.decode('utf-8')
1293                 video_title = sanitize_title(video_title)
1294
1295                 # simplified title
1296                 simple_title = _simplify_title(video_title)
1297
1298                 # thumbnail image
1299                 if 'thumbnail_url' not in video_info:
1300                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1301                         video_thumbnail = ''
1302                 else:   # don't panic if we can't find it
1303                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1304
1305                 # upload date
1306                 upload_date = u'NA'
1307                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1308                 if mobj is not None:
1309                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1310                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1311                         for expression in format_expressions:
1312                                 try:
1313                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1314                                 except:
1315                                         pass
1316
1317                 # description
1318                 try:
1319                         lxml.etree
1320                 except NameError:
1321                         video_description = u'No description available.'
1322                         if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1323                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1324                                 if mobj is not None:
1325                                         video_description = mobj.group(1).decode('utf-8')
1326                 else:
1327                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1328                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1329                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1330                         # TODO use another parser
1331
1332                 # token
1333                 video_token = urllib.unquote_plus(video_info['token'][0])
1334
1335                 # Decide which formats to download
1336                 req_format = self._downloader.params.get('format', None)
1337
1338                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1339                         self.report_rtmp_download()
1340                         video_url_list = [(None, video_info['conn'][0])]
1341                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1342                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1343                         url_data = [parse_qs(uds) for uds in url_data_strs]
1344                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1345                         url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1346
1347                         format_limit = self._downloader.params.get('format_limit', None)
1348                         if format_limit is not None and format_limit in self._available_formats:
1349                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1350                         else:
1351                                 format_list = self._available_formats
1352                         existing_formats = [x for x in format_list if x in url_map]
1353                         if len(existing_formats) == 0:
1354                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1355                                 return
1356                         if self._downloader.params.get('listformats', None):
1357                                 self._print_formats(existing_formats)
1358                                 return
1359                         if req_format is None or req_format == 'best':
1360                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1361                         elif req_format == 'worst':
1362                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1363                         elif req_format in ('-1', 'all'):
1364                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1365                         else:
1366                                 # Specific formats. We pick the first in a slash-delimeted sequence.
1367                                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1368                                 req_formats = req_format.split('/')
1369                                 video_url_list = None
1370                                 for rf in req_formats:
1371                                         if rf in url_map:
1372                                                 video_url_list = [(rf, url_map[rf])]
1373                                                 break
1374                                 if video_url_list is None:
1375                                         self._downloader.trouble(u'ERROR: requested format not available')
1376                                         return
1377                 else:
1378                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1379                         return
1380
1381                 for format_param, video_real_url in video_url_list:
1382                         # At this point we have a new video
1383                         self._downloader.increment_downloads()
1384
1385                         # Extension
1386                         video_extension = self._video_extensions.get(format_param, 'flv')
1387
1388                         try:
1389                                 # Process video information
1390                                 self._downloader.process_info({
1391                                         'id':           video_id.decode('utf-8'),
1392                                         'url':          video_real_url.decode('utf-8'),
1393                                         'uploader':     video_uploader.decode('utf-8'),
1394                                         'upload_date':  upload_date,
1395                                         'title':        video_title,
1396                                         'stitle':       simple_title,
1397                                         'ext':          video_extension.decode('utf-8'),
1398                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1399                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1400                                         'description':  video_description,
1401                                         'player_url':   player_url,
1402                                 })
1403                         except UnavailableVideoError, err:
1404                                 self._downloader.trouble(u'\nERROR: unable to download video')
1405
1406
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com.

	Disables the site-wide family filter once per session in
	_real_initialize, and delegates "yt-"-prefixed video ids to the
	YouTube extractor passed to the constructor.
	"""

	# group(1) is the video id, group(2) the URL-friendly title slug
	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	# Disclaimer page fetched first — presumably sets session cookies; TODO confirm
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
	# Endpoint POSTed to in order to switch the family filter off
	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
	_youtube_ie = None
	IE_NAME = u'metacafe'

	def __init__(self, youtube_ie, downloader=None):
		# youtube_ie: extractor used for Metacafe entries hosted on YouTube
		InfoExtractor.__init__(self, youtube_ie)
		self._youtube_ie = youtube_ie

	def report_disclaimer(self):
		"""Report disclaimer retrieval."""
		self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[metacafe] Confirming age')

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

	def _real_initialize(self):
		"""Visit the disclaimer page, then POST to disable the family filter."""
		# Retrieve disclaimer
		request = urllib2.Request(self._DISCLAIMER)
		try:
			self.report_disclaimer()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
			return

		# Confirm age
		disclaimer_form = {
			'filters': '0',
			'submit': "Continue - I'm over 18",
			}
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
		try:
			self.report_age_confirmation()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract media URL, title and uploader and pass them to the downloader."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		# Check if video comes from YouTube; if so, delegate entirely
		mobj2 = re.match(r'^yt-(.*)$', video_id)
		if mobj2 is not None:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
			return

		# At this point we have a new video
		self._downloader.increment_downloads()

		simple_title = mobj.group(2).decode('utf-8')

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			# NOTE(review): message is missing a "to" ("unable retrieve")
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		# Primary path: a plain mediaURL query parameter embedded in the page
		mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
		if mobj is not None:
			mediaURL = urllib.unquote(mobj.group(1))
			# Extension is taken from the media URL's last three characters
			video_extension = mediaURL[-3:]

			# Extract gdaKey if available (access token appended to the URL)
			mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
			if mobj is None:
				video_url = mediaURL
			else:
				gdaKey = mobj.group(1)
				video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
		else:
			# Fallback path: parse the flashvars blob for the mediaData JSON
			mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			vardict = parse_qs(mobj.group(1))
			if 'mediaData' not in vardict:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			# JSON escapes slashes; undo that before use
			mediaURL = mobj.group(1).replace('\\/', '/')
			video_extension = mediaURL[-3:]
			video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

		mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1547
1548
1549 class DailymotionIE(InfoExtractor):
1550         """Information Extractor for Dailymotion"""
1551
1552         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1553         IE_NAME = u'dailymotion'
1554
1555         def __init__(self, downloader=None):
1556                 InfoExtractor.__init__(self, downloader)
1557
1558         def report_download_webpage(self, video_id):
1559                 """Report webpage download."""
1560                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1561
1562         def report_extraction(self, video_id):
1563                 """Report information extraction."""
1564                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1565
1566         def _real_extract(self, url):
1567                 # Extract id and simplified title from URL
1568                 mobj = re.match(self._VALID_URL, url)
1569                 if mobj is None:
1570                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1571                         return
1572
1573                 # At this point we have a new video
1574                 self._downloader.increment_downloads()
1575                 video_id = mobj.group(1)
1576
1577                 simple_title = mobj.group(2).decode('utf-8')
1578                 video_extension = 'flv'
1579
1580                 # Retrieve video webpage to extract further information
1581                 request = urllib2.Request(url)
1582                 request.add_header('Cookie', 'family_filter=off')
1583                 try:
1584                         self.report_download_webpage(video_id)
1585                         webpage = urllib2.urlopen(request).read()
1586                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1587                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1588                         return
1589
1590                 # Extract URL, uploader and title from webpage
1591                 self.report_extraction(video_id)
1592                 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1593                 if mobj is None:
1594                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1595                         return
1596                 sequence = urllib.unquote(mobj.group(1))
1597                 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1598                 if mobj is None:
1599                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1600                         return
1601                 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1602
1603                 # if needed add http://www.dailymotion.com/ if relative URL
1604
1605                 video_url = mediaURL
1606
1607                 mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
1608                 if mobj is None:
1609                         self._downloader.trouble(u'ERROR: unable to extract title')
1610                         return
1611                 video_title = mobj.group(1).decode('utf-8')
1612                 video_title = sanitize_title(video_title)
1613
1614                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1615                 if mobj is None:
1616                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1617                         return
1618                 video_uploader = mobj.group(1)
1619
1620                 try:
1621                         # Process video information
1622                         self._downloader.process_info({
1623                                 'id':           video_id.decode('utf-8'),
1624                                 'url':          video_url.decode('utf-8'),
1625                                 'uploader':     video_uploader.decode('utf-8'),
1626                                 'upload_date':  u'NA',
1627                                 'title':        video_title,
1628                                 'stitle':       simple_title,
1629                                 'ext':          video_extension.decode('utf-8'),
1630                                 'format':       u'NA',
1631                                 'player_url':   None,
1632                         })
1633                 except UnavailableVideoError:
1634                         self._downloader.trouble(u'\nERROR: unable to download video')
1635
1636
1637 class GoogleIE(InfoExtractor):
1638         """Information extractor for video.google.com."""
1639
1640         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1641         IE_NAME = u'video.google'
1642
1643         def __init__(self, downloader=None):
1644                 InfoExtractor.__init__(self, downloader)
1645
1646         def report_download_webpage(self, video_id):
1647                 """Report webpage download."""
1648                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1649
1650         def report_extraction(self, video_id):
1651                 """Report information extraction."""
1652                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1653
1654         def _real_extract(self, url):
1655                 # Extract id from URL
1656                 mobj = re.match(self._VALID_URL, url)
1657                 if mobj is None:
1658                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1659                         return
1660
1661                 # At this point we have a new video
1662                 self._downloader.increment_downloads()
1663                 video_id = mobj.group(1)
1664
1665                 video_extension = 'mp4'
1666
1667                 # Retrieve video webpage to extract further information
1668                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1669                 try:
1670                         self.report_download_webpage(video_id)
1671                         webpage = urllib2.urlopen(request).read()
1672                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1673                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1674                         return
1675
1676                 # Extract URL, uploader, and title from webpage
1677                 self.report_extraction(video_id)
1678                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1679                 if mobj is None:
1680                         video_extension = 'flv'
1681                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1682                 if mobj is None:
1683                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1684                         return
1685                 mediaURL = urllib.unquote(mobj.group(1))
1686                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1687                 mediaURL = mediaURL.replace('\\x26', '\x26')
1688
1689                 video_url = mediaURL
1690
1691                 mobj = re.search(r'<title>(.*)</title>', webpage)
1692                 if mobj is None:
1693                         self._downloader.trouble(u'ERROR: unable to extract title')
1694                         return
1695                 video_title = mobj.group(1).decode('utf-8')
1696                 video_title = sanitize_title(video_title)
1697                 simple_title = _simplify_title(video_title)
1698
1699                 # Extract video description
1700                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1701                 if mobj is None:
1702                         self._downloader.trouble(u'ERROR: unable to extract video description')
1703                         return
1704                 video_description = mobj.group(1).decode('utf-8')
1705                 if not video_description:
1706                         video_description = 'No description available.'
1707
1708                 # Extract video thumbnail
1709                 if self._downloader.params.get('forcethumbnail', False):
1710                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1711                         try:
1712                                 webpage = urllib2.urlopen(request).read()
1713                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1714                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1715                                 return
1716                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1717                         if mobj is None:
1718                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1719                                 return
1720                         video_thumbnail = mobj.group(1)
1721                 else:   # we need something to pass to process_info
1722                         video_thumbnail = ''
1723
1724                 try:
1725                         # Process video information
1726                         self._downloader.process_info({
1727                                 'id':           video_id.decode('utf-8'),
1728                                 'url':          video_url.decode('utf-8'),
1729                                 'uploader':     u'NA',
1730                                 'upload_date':  u'NA',
1731                                 'title':        video_title,
1732                                 'stitle':       simple_title,
1733                                 'ext':          video_extension.decode('utf-8'),
1734                                 'format':       u'NA',
1735                                 'player_url':   None,
1736                         })
1737                 except UnavailableVideoError:
1738                         self._downloader.trouble(u'\nERROR: unable to download video')
1739
1740
1741 class PhotobucketIE(InfoExtractor):
1742         """Information extractor for photobucket.com."""
1743
1744         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1745         IE_NAME = u'photobucket'
1746
1747         def __init__(self, downloader=None):
1748                 InfoExtractor.__init__(self, downloader)
1749
1750         def report_download_webpage(self, video_id):
1751                 """Report webpage download."""
1752                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1753
1754         def report_extraction(self, video_id):
1755                 """Report information extraction."""
1756                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1757
1758         def _real_extract(self, url):
1759                 # Extract id from URL
1760                 mobj = re.match(self._VALID_URL, url)
1761                 if mobj is None:
1762                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1763                         return
1764
1765                 # At this point we have a new video
1766                 self._downloader.increment_downloads()
1767                 video_id = mobj.group(1)
1768
1769                 video_extension = 'flv'
1770
1771                 # Retrieve video webpage to extract further information
1772                 request = urllib2.Request(url)
1773                 try:
1774                         self.report_download_webpage(video_id)
1775                         webpage = urllib2.urlopen(request).read()
1776                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1777                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1778                         return
1779
1780                 # Extract URL, uploader, and title from webpage
1781                 self.report_extraction(video_id)
1782                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1783                 if mobj is None:
1784                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1785                         return
1786                 mediaURL = urllib.unquote(mobj.group(1))
1787
1788                 video_url = mediaURL
1789
1790                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1791                 if mobj is None:
1792                         self._downloader.trouble(u'ERROR: unable to extract title')
1793                         return
1794                 video_title = mobj.group(1).decode('utf-8')
1795                 video_title = sanitize_title(video_title)
1796                 simple_title = _simplify_title(vide_title)
1797
1798                 video_uploader = mobj.group(2).decode('utf-8')
1799
1800                 try:
1801                         # Process video information
1802                         self._downloader.process_info({
1803                                 'id':           video_id.decode('utf-8'),
1804                                 'url':          video_url.decode('utf-8'),
1805                                 'uploader':     video_uploader,
1806                                 'upload_date':  u'NA',
1807                                 'title':        video_title,
1808                                 'stitle':       simple_title,
1809                                 'ext':          video_extension.decode('utf-8'),
1810                                 'format':       u'NA',
1811                                 'player_url':   None,
1812                         })
1813                 except UnavailableVideoError:
1814                         self._downloader.trouble(u'\nERROR: unable to download video')
1815
1816
1817 class YahooIE(InfoExtractor):
1818         """Information extractor for video.yahoo.com."""
1819
1820         # _VALID_URL matches all Yahoo! Video URLs
1821         # _VPAGE_URL matches only the extractable '/watch/' URLs
1822         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1823         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1824         IE_NAME = u'video.yahoo'
1825
1826         def __init__(self, downloader=None):
1827                 InfoExtractor.__init__(self, downloader)
1828
1829         def report_download_webpage(self, video_id):
1830                 """Report webpage download."""
1831                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1832
1833         def report_extraction(self, video_id):
1834                 """Report information extraction."""
1835                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1836
1837         def _real_extract(self, url, new_video=True):
1838                 # Extract ID from URL
1839                 mobj = re.match(self._VALID_URL, url)
1840                 if mobj is None:
1841                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1842                         return
1843
1844                 # At this point we have a new video
1845                 self._downloader.increment_downloads()
1846                 video_id = mobj.group(2)
1847                 video_extension = 'flv'
1848
1849                 # Rewrite valid but non-extractable URLs as
1850                 # extractable English language /watch/ URLs
1851                 if re.match(self._VPAGE_URL, url) is None:
1852                         request = urllib2.Request(url)
1853                         try:
1854                                 webpage = urllib2.urlopen(request).read()
1855                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1856                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1857                                 return
1858
1859                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1860                         if mobj is None:
1861                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1862                                 return
1863                         yahoo_id = mobj.group(1)
1864
1865                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1866                         if mobj is None:
1867                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1868                                 return
1869                         yahoo_vid = mobj.group(1)
1870
1871                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1872                         return self._real_extract(url, new_video=False)
1873
1874                 # Retrieve video webpage to extract further information
1875                 request = urllib2.Request(url)
1876                 try:
1877                         self.report_download_webpage(video_id)
1878                         webpage = urllib2.urlopen(request).read()
1879                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1880                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1881                         return
1882
1883                 # Extract uploader and title from webpage
1884                 self.report_extraction(video_id)
1885                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1886                 if mobj is None:
1887                         self._downloader.trouble(u'ERROR: unable to extract video title')
1888                         return
1889                 video_title = mobj.group(1).decode('utf-8')
1890                 simple_title = _simplify_title(video_title)
1891
1892                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1893                 if mobj is None:
1894                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1895                         return
1896                 video_uploader = mobj.group(1).decode('utf-8')
1897
1898                 # Extract video thumbnail
1899                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1900                 if mobj is None:
1901                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1902                         return
1903                 video_thumbnail = mobj.group(1).decode('utf-8')
1904
1905                 # Extract video description
1906                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1907                 if mobj is None:
1908                         self._downloader.trouble(u'ERROR: unable to extract video description')
1909                         return
1910                 video_description = mobj.group(1).decode('utf-8')
1911                 if not video_description:
1912                         video_description = 'No description available.'
1913
1914                 # Extract video height and width
1915                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1916                 if mobj is None:
1917                         self._downloader.trouble(u'ERROR: unable to extract video height')
1918                         return
1919                 yv_video_height = mobj.group(1)
1920
1921                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1922                 if mobj is None:
1923                         self._downloader.trouble(u'ERROR: unable to extract video width')
1924                         return
1925                 yv_video_width = mobj.group(1)
1926
1927                 # Retrieve video playlist to extract media URL
1928                 # I'm not completely sure what all these options are, but we
1929                 # seem to need most of them, otherwise the server sends a 401.
1930                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1931                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1932                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1933                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1934                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1935                 try:
1936                         self.report_download_webpage(video_id)
1937                         webpage = urllib2.urlopen(request).read()
1938                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1939                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1940                         return
1941
1942                 # Extract media URL from playlist XML
1943                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1944                 if mobj is None:
1945                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1946                         return
1947                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1948                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1949
1950                 try:
1951                         # Process video information
1952                         self._downloader.process_info({
1953                                 'id':           video_id.decode('utf-8'),
1954                                 'url':          video_url,
1955                                 'uploader':     video_uploader,
1956                                 'upload_date':  u'NA',
1957                                 'title':        video_title,
1958                                 'stitle':       simple_title,
1959                                 'ext':          video_extension.decode('utf-8'),
1960                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1961                                 'description':  video_description,
1962                                 'thumbnail':    video_thumbnail,
1963                                 'player_url':   None,
1964                         })
1965                 except UnavailableVideoError:
1966                         self._downloader.trouble(u'\nERROR: unable to download video')
1967
1968
1969 class VimeoIE(InfoExtractor):
1970         """Information extractor for vimeo.com."""
1971
1972         # _VALID_URL matches Vimeo URLs
1973         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1974         IE_NAME = u'vimeo'
1975
1976         def __init__(self, downloader=None):
1977                 InfoExtractor.__init__(self, downloader)
1978
1979         def report_download_webpage(self, video_id):
1980                 """Report webpage download."""
1981                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1982
1983         def report_extraction(self, video_id):
1984                 """Report information extraction."""
1985                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1986
1987         def _real_extract(self, url, new_video=True):
1988                 # Extract ID from URL
1989                 mobj = re.match(self._VALID_URL, url)
1990                 if mobj is None:
1991                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1992                         return
1993
1994                 # At this point we have a new video
1995                 self._downloader.increment_downloads()
1996                 video_id = mobj.group(1)
1997
1998                 # Retrieve video webpage to extract further information
1999                 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
2000                 try:
2001                         self.report_download_webpage(video_id)
2002                         webpage = urllib2.urlopen(request).read()
2003                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2004                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2005                         return
2006
2007                 # Now we begin extracting as much information as we can from what we
2008                 # retrieved. First we extract the information common to all extractors,
2009                 # and latter we extract those that are Vimeo specific.
2010                 self.report_extraction(video_id)
2011
2012                 # Extract title
2013                 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2014                 if mobj is None:
2015                         self._downloader.trouble(u'ERROR: unable to extract video title')
2016                         return
2017                 video_title = mobj.group(1).decode('utf-8')
2018                 simple_title = _simplify_title(video_title)
2019
2020                 # Extract uploader
2021                 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2022                 if mobj is None:
2023                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
2024                         return
2025                 video_uploader = mobj.group(1).decode('utf-8')
2026
2027                 # Extract video thumbnail
2028                 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2029                 if mobj is None:
2030                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2031                         return
2032                 video_thumbnail = mobj.group(1).decode('utf-8')
2033
2034                 # # Extract video description
2035                 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2036                 # if mobj is None:
2037                 #       self._downloader.trouble(u'ERROR: unable to extract video description')
2038                 #       return
2039                 # video_description = mobj.group(1).decode('utf-8')
2040                 # if not video_description: video_description = 'No description available.'
2041                 video_description = 'Foo.'
2042
2043                 # Vimeo specific: extract request signature
2044                 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2045                 if mobj is None:
2046                         self._downloader.trouble(u'ERROR: unable to extract request signature')
2047                         return
2048                 sig = mobj.group(1).decode('utf-8')
2049
2050                 # Vimeo specific: extract video quality information
2051                 mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)
2052                 if mobj is None:
2053                         self._downloader.trouble(u'ERROR: unable to extract video quality information')
2054                         return
2055                 quality = mobj.group(1).decode('utf-8')
2056
2057                 if int(quality) == 1:
2058                         quality = 'hd'
2059                 else:
2060                         quality = 'sd'
2061
2062                 # Vimeo specific: Extract request signature expiration
2063                 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2064                 if mobj is None:
2065                         self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2066                         return
2067                 sig_exp = mobj.group(1).decode('utf-8')
2068
2069                 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)
2070
2071                 try:
2072                         # Process video information
2073                         self._downloader.process_info({
2074                                 'id':           video_id.decode('utf-8'),
2075                                 'url':          video_url,
2076                                 'uploader':     video_uploader,
2077                                 'upload_date':  u'NA',
2078                                 'title':        video_title,
2079                                 'stitle':       simple_title,
2080                                 'ext':          u'mp4',
2081                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2082                                 'description':  video_description,
2083                                 'thumbnail':    video_thumbnail,
2084                                 'description':  video_description,
2085                                 'player_url':   None,
2086                         })
2087                 except UnavailableVideoError:
2088                         self._downloader.trouble(u'ERROR: unable to download video')
2089
2090
2091 class GenericIE(InfoExtractor):
2092         """Generic last-resort information extractor."""
2093
2094         _VALID_URL = r'.*'
2095         IE_NAME = u'generic'
2096
2097         def __init__(self, downloader=None):
2098                 InfoExtractor.__init__(self, downloader)
2099
2100         def report_download_webpage(self, video_id):
2101                 """Report webpage download."""
2102                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2103                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2104
2105         def report_extraction(self, video_id):
2106                 """Report information extraction."""
2107                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2108
2109         def _real_extract(self, url):
2110                 # At this point we have a new video
2111                 self._downloader.increment_downloads()
2112
2113                 video_id = url.split('/')[-1]
2114                 request = urllib2.Request(url)
2115                 try:
2116                         self.report_download_webpage(video_id)
2117                         webpage = urllib2.urlopen(request).read()
2118                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2119                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2120                         return
2121                 except ValueError, err:
2122                         # since this is the last-resort InfoExtractor, if
2123                         # this error is thrown, it'll be thrown here
2124                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2125                         return
2126
2127                 self.report_extraction(video_id)
2128                 # Start with something easy: JW Player in SWFObject
2129                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2130                 if mobj is None:
2131                         # Broaden the search a little bit
2132                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2133                 if mobj is None:
2134                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2135                         return
2136
2137                 # It's possible that one of the regexes
2138                 # matched, but returned an empty group:
2139                 if mobj.group(1) is None:
2140                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2141                         return
2142
2143                 video_url = urllib.unquote(mobj.group(1))
2144                 video_id = os.path.basename(video_url)
2145
2146                 # here's a fun little line of code for you:
2147                 video_extension = os.path.splitext(video_id)[1][1:]
2148                 video_id = os.path.splitext(video_id)[0]
2149
2150                 # it's tempting to parse this further, but you would
2151                 # have to take into account all the variations like
2152                 #   Video Title - Site Name
2153                 #   Site Name | Video Title
2154                 #   Video Title - Tagline | Site Name
2155                 # and so on and so forth; it's just not practical
2156                 mobj = re.search(r'<title>(.*)</title>', webpage)
2157                 if mobj is None:
2158                         self._downloader.trouble(u'ERROR: unable to extract title')
2159                         return
2160                 video_title = mobj.group(1).decode('utf-8')
2161                 video_title = sanitize_title(video_title)
2162                 simple_title = _simplify_title(video_title)
2163
2164                 # video uploader is domain name
2165                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2166                 if mobj is None:
2167                         self._downloader.trouble(u'ERROR: unable to extract title')
2168                         return
2169                 video_uploader = mobj.group(1).decode('utf-8')
2170
2171                 try:
2172                         # Process video information
2173                         self._downloader.process_info({
2174                                 'id':           video_id.decode('utf-8'),
2175                                 'url':          video_url.decode('utf-8'),
2176                                 'uploader':     video_uploader,
2177                                 'upload_date':  u'NA',
2178                                 'title':        video_title,
2179                                 'stitle':       simple_title,
2180                                 'ext':          video_extension.decode('utf-8'),
2181                                 'format':       u'NA',
2182                                 'player_url':   None,
2183                         })
2184                 except UnavailableVideoError, err:
2185                         self._downloader.trouble(u'\nERROR: unable to download video')
2186
2187
2188 class YoutubeSearchIE(InfoExtractor):
2189         """Information Extractor for YouTube search queries."""
2190         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2191         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2192         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2193         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2194         _youtube_ie = None
2195         _max_youtube_results = 1000
2196         IE_NAME = u'youtube:search'
2197
2198         def __init__(self, youtube_ie, downloader=None):
2199                 InfoExtractor.__init__(self, downloader)
2200                 self._youtube_ie = youtube_ie
2201
2202         def report_download_page(self, query, pagenum):
2203                 """Report attempt to download playlist page with given number."""
2204                 query = query.decode(preferredencoding())
2205                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2206
2207         def _real_initialize(self):
2208                 self._youtube_ie.initialize()
2209
2210         def _real_extract(self, query):
2211                 mobj = re.match(self._VALID_URL, query)
2212                 if mobj is None:
2213                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2214                         return
2215
2216                 prefix, query = query.split(':')
2217                 prefix = prefix[8:]
2218                 query = query.encode('utf-8')
2219                 if prefix == '':
2220                         self._download_n_results(query, 1)
2221                         return
2222                 elif prefix == 'all':
2223                         self._download_n_results(query, self._max_youtube_results)
2224                         return
2225                 else:
2226                         try:
2227                                 n = long(prefix)
2228                                 if n <= 0:
2229                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2230                                         return
2231                                 elif n > self._max_youtube_results:
2232                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2233                                         n = self._max_youtube_results
2234                                 self._download_n_results(query, n)
2235                                 return
2236                         except ValueError: # parsing prefix as integer fails
2237                                 self._download_n_results(query, 1)
2238                                 return
2239
2240         def _download_n_results(self, query, n):
2241                 """Downloads a specified number of results for a query"""
2242
2243                 video_ids = []
2244                 already_seen = set()
2245                 pagenum = 1
2246
2247                 while True:
2248                         self.report_download_page(query, pagenum)
2249                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2250                         request = urllib2.Request(result_url)
2251                         try:
2252                                 page = urllib2.urlopen(request).read()
2253                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2254                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2255                                 return
2256
2257                         # Extract video identifiers
2258                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2259                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2260                                 if video_id not in already_seen:
2261                                         video_ids.append(video_id)
2262                                         already_seen.add(video_id)
2263                                         if len(video_ids) == n:
2264                                                 # Specified n videos reached
2265                                                 for id in video_ids:
2266                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2267                                                 return
2268
2269                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2270                                 for id in video_ids:
2271                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2272                                 return
2273
2274                         pagenum = pagenum + 1
2275
2276
2277 class GoogleSearchIE(InfoExtractor):
2278         """Information Extractor for Google Video search queries."""
2279         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2280         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2281         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2282         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2283         _google_ie = None
2284         _max_google_results = 1000
2285         IE_NAME = u'video.google:search'
2286
2287         def __init__(self, google_ie, downloader=None):
2288                 InfoExtractor.__init__(self, downloader)
2289                 self._google_ie = google_ie
2290
2291         def report_download_page(self, query, pagenum):
2292                 """Report attempt to download playlist page with given number."""
2293                 query = query.decode(preferredencoding())
2294                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2295
2296         def _real_initialize(self):
2297                 self._google_ie.initialize()
2298
2299         def _real_extract(self, query):
2300                 mobj = re.match(self._VALID_URL, query)
2301                 if mobj is None:
2302                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2303                         return
2304
2305                 prefix, query = query.split(':')
2306                 prefix = prefix[8:]
2307                 query = query.encode('utf-8')
2308                 if prefix == '':
2309                         self._download_n_results(query, 1)
2310                         return
2311                 elif prefix == 'all':
2312                         self._download_n_results(query, self._max_google_results)
2313                         return
2314                 else:
2315                         try:
2316                                 n = long(prefix)
2317                                 if n <= 0:
2318                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2319                                         return
2320                                 elif n > self._max_google_results:
2321                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2322                                         n = self._max_google_results
2323                                 self._download_n_results(query, n)
2324                                 return
2325                         except ValueError: # parsing prefix as integer fails
2326                                 self._download_n_results(query, 1)
2327                                 return
2328
2329         def _download_n_results(self, query, n):
2330                 """Downloads a specified number of results for a query"""
2331
2332                 video_ids = []
2333                 already_seen = set()
2334                 pagenum = 1
2335
2336                 while True:
2337                         self.report_download_page(query, pagenum)
2338                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2339                         request = urllib2.Request(result_url)
2340                         try:
2341                                 page = urllib2.urlopen(request).read()
2342                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2343                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2344                                 return
2345
2346                         # Extract video identifiers
2347                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2348                                 video_id = mobj.group(1)
2349                                 if video_id not in already_seen:
2350                                         video_ids.append(video_id)
2351                                         already_seen.add(video_id)
2352                                         if len(video_ids) == n:
2353                                                 # Specified n videos reached
2354                                                 for id in video_ids:
2355                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2356                                                 return
2357
2358                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2359                                 for id in video_ids:
2360                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2361                                 return
2362
2363                         pagenum = pagenum + 1
2364
2365
2366 class YahooSearchIE(InfoExtractor):
2367         """Information Extractor for Yahoo! Video search queries."""
2368         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2369         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2370         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2371         _MORE_PAGES_INDICATOR = r'\s*Next'
2372         _yahoo_ie = None
2373         _max_yahoo_results = 1000
2374         IE_NAME = u'video.yahoo:search'
2375
	def __init__(self, yahoo_ie, downloader=None):
		# yahoo_ie: the YahooIE instance used to extract each search result.
		InfoExtractor.__init__(self, downloader)
		self._yahoo_ie = yahoo_ie
2379
	def report_download_page(self, query, pagenum):
		"""Report attempt to download playlist page with given number."""
		# Decode for display; assumes the locale's preferred encoding
		# matches the query's byte encoding -- TODO confirm.
		query = query.decode(preferredencoding())
		self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2384
2385         def _real_initialize(self):
2386                 self._yahoo_ie.initialize()
2387
2388         def _real_extract(self, query):
2389                 mobj = re.match(self._VALID_URL, query)
2390                 if mobj is None:
2391                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2392                         return
2393
2394                 prefix, query = query.split(':')
2395                 prefix = prefix[8:]
2396                 query = query.encode('utf-8')
2397                 if prefix == '':
2398                         self._download_n_results(query, 1)
2399                         return
2400                 elif prefix == 'all':
2401                         self._download_n_results(query, self._max_yahoo_results)
2402                         return
2403                 else:
2404                         try:
2405                                 n = long(prefix)
2406                                 if n <= 0:
2407                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2408                                         return
2409                                 elif n > self._max_yahoo_results:
2410                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2411                                         n = self._max_yahoo_results
2412                                 self._download_n_results(query, n)
2413                                 return
2414                         except ValueError: # parsing prefix as integer fails
2415                                 self._download_n_results(query, 1)
2416                                 return
2417
2418         def _download_n_results(self, query, n):
2419                 """Downloads a specified number of results for a query"""
2420
2421                 video_ids = []
2422                 already_seen = set()
2423                 pagenum = 1
2424
2425                 while True:
2426                         self.report_download_page(query, pagenum)
2427                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2428                         request = urllib2.Request(result_url)
2429                         try:
2430                                 page = urllib2.urlopen(request).read()
2431                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2432                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2433                                 return
2434
2435                         # Extract video identifiers
2436                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2437                                 video_id = mobj.group(1)
2438                                 if video_id not in already_seen:
2439                                         video_ids.append(video_id)
2440                                         already_seen.add(video_id)
2441                                         if len(video_ids) == n:
2442                                                 # Specified n videos reached
2443                                                 for id in video_ids:
2444                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2445                                                 return
2446
2447                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2448                                 for id in video_ids:
2449                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2450                                 return
2451
2452                         pagenum = pagenum + 1
2453
2454
2455 class YoutubePlaylistIE(InfoExtractor):
2456         """Information Extractor for YouTube playlists."""
2457
2458         _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2459         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2460         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2461         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2462         _youtube_ie = None
2463         IE_NAME = u'youtube:playlist'
2464
2465         def __init__(self, youtube_ie, downloader=None):
2466                 InfoExtractor.__init__(self, downloader)
2467                 self._youtube_ie = youtube_ie
2468
2469         def report_download_page(self, playlist_id, pagenum):
2470                 """Report attempt to download playlist page with given number."""
2471                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2472
2473         def _real_initialize(self):
2474                 self._youtube_ie.initialize()
2475
2476         def _real_extract(self, url):
2477                 # Extract playlist id
2478                 mobj = re.match(self._VALID_URL, url)
2479                 if mobj is None:
2480                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2481                         return
2482
2483                 # Single video case
2484                 if mobj.group(3) is not None:
2485                         self._youtube_ie.extract(mobj.group(3))
2486                         return
2487
2488                 # Download playlist pages
2489                 # prefix is 'p' as default for playlists but there are other types that need extra care
2490                 playlist_prefix = mobj.group(1)
2491                 if playlist_prefix == 'a':
2492                         playlist_access = 'artist'
2493                 else:
2494                         playlist_prefix = 'p'
2495                         playlist_access = 'view_play_list'
2496                 playlist_id = mobj.group(2)
2497                 video_ids = []
2498                 pagenum = 1
2499
2500                 while True:
2501                         self.report_download_page(playlist_id, pagenum)
2502                         url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2503                         request = urllib2.Request(url)
2504                         try:
2505                                 page = urllib2.urlopen(request).read()
2506                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2507                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2508                                 return
2509
2510                         # Extract video identifiers
2511                         ids_in_page = []
2512                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2513                                 if mobj.group(1) not in ids_in_page:
2514                                         ids_in_page.append(mobj.group(1))
2515                         video_ids.extend(ids_in_page)
2516
2517                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2518                                 break
2519                         pagenum = pagenum + 1
2520
2521                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2522                 playlistend = self._downloader.params.get('playlistend', -1)
2523                 video_ids = video_ids[playliststart:playlistend]
2524
2525                 for id in video_ids:
2526                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2527                 return
2528
2529
2530 class YoutubeUserIE(InfoExtractor):
2531         """Information Extractor for YouTube users."""
2532
2533         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2534         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2535         _GDATA_PAGE_SIZE = 50
2536         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2537         _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2538         _youtube_ie = None
2539         IE_NAME = u'youtube:user'
2540
2541         def __init__(self, youtube_ie, downloader=None):
2542                 InfoExtractor.__init__(self, downloader)
2543                 self._youtube_ie = youtube_ie
2544
2545         def report_download_page(self, username, start_index):
2546                 """Report attempt to download user page."""
2547                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2548                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2549
2550         def _real_initialize(self):
2551                 self._youtube_ie.initialize()
2552
2553         def _real_extract(self, url):
2554                 # Extract username
2555                 mobj = re.match(self._VALID_URL, url)
2556                 if mobj is None:
2557                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2558                         return
2559
2560                 username = mobj.group(1)
2561
2562                 # Download video ids using YouTube Data API. Result size per
2563                 # query is limited (currently to 50 videos) so we need to query
2564                 # page by page until there are no video ids - it means we got
2565                 # all of them.
2566
2567                 video_ids = []
2568                 pagenum = 0
2569
2570                 while True:
2571                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2572                         self.report_download_page(username, start_index)
2573
2574                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2575
2576                         try:
2577                                 page = urllib2.urlopen(request).read()
2578                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2579                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2580                                 return
2581
2582                         # Extract video identifiers
2583                         ids_in_page = []
2584
2585                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2586                                 if mobj.group(1) not in ids_in_page:
2587                                         ids_in_page.append(mobj.group(1))
2588
2589                         video_ids.extend(ids_in_page)
2590
2591                         # A little optimization - if current page is not
2592                         # "full", ie. does not contain PAGE_SIZE video ids then
2593                         # we can assume that this page is the last one - there
2594                         # are no more ids on further pages - no need to query
2595                         # again.
2596
2597                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2598                                 break
2599
2600                         pagenum += 1
2601
2602                 all_ids_count = len(video_ids)
2603                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2604                 playlistend = self._downloader.params.get('playlistend', -1)
2605
2606                 if playlistend == -1:
2607                         video_ids = video_ids[playliststart:]
2608                 else:
2609                         video_ids = video_ids[playliststart:playlistend]
2610
2611                 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2612                                 (username, all_ids_count, len(video_ids)))
2613
2614                 for video_id in video_ids:
2615                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2616
2617
2618 class DepositFilesIE(InfoExtractor):
2619         """Information extractor for depositfiles.com"""
2620
2621         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2622         IE_NAME = u'DepositFiles'
2623
2624         def __init__(self, downloader=None):
2625                 InfoExtractor.__init__(self, downloader)
2626
2627         def report_download_webpage(self, file_id):
2628                 """Report webpage download."""
2629                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2630
2631         def report_extraction(self, file_id):
2632                 """Report information extraction."""
2633                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2634
2635         def _real_extract(self, url):
2636                 # At this point we have a new file
2637                 self._downloader.increment_downloads()
2638
2639                 file_id = url.split('/')[-1]
2640                 # Rebuild url in english locale
2641                 url = 'http://depositfiles.com/en/files/' + file_id
2642
2643                 # Retrieve file webpage with 'Free download' button pressed
2644                 free_download_indication = { 'gateway_result' : '1' }
2645                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2646                 try:
2647                         self.report_download_webpage(file_id)
2648                         webpage = urllib2.urlopen(request).read()
2649                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2650                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2651                         return
2652
2653                 # Search for the real file URL
2654                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2655                 if (mobj is None) or (mobj.group(1) is None):
2656                         # Try to figure out reason of the error.
2657                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2658                         if (mobj is not None) and (mobj.group(1) is not None):
2659                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2660                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2661                         else:
2662                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2663                         return
2664
2665                 file_url = mobj.group(1)
2666                 file_extension = os.path.splitext(file_url)[1][1:]
2667
2668                 # Search for file title
2669                 mobj = re.search(r'<b title="(.*?)">', webpage)
2670                 if mobj is None:
2671                         self._downloader.trouble(u'ERROR: unable to extract title')
2672                         return
2673                 file_title = mobj.group(1).decode('utf-8')
2674
2675                 try:
2676                         # Process file information
2677                         self._downloader.process_info({
2678                                 'id':           file_id.decode('utf-8'),
2679                                 'url':          file_url.decode('utf-8'),
2680                                 'uploader':     u'NA',
2681                                 'upload_date':  u'NA',
2682                                 'title':        file_title,
2683                                 'stitle':       file_title,
2684                                 'ext':          file_extension.decode('utf-8'),
2685                                 'format':       u'NA',
2686                                 'player_url':   None,
2687                         })
2688                 except UnavailableVideoError, err:
2689                         self._downloader.trouble(u'ERROR: unable to download file')
2690
2691
2692 class FacebookIE(InfoExtractor):
2693         """Information Extractor for Facebook"""
2694
2695         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2696         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2697         _NETRC_MACHINE = 'facebook'
2698         _available_formats = ['video', 'highqual', 'lowqual']
2699         _video_extensions = {
2700                 'video': 'mp4',
2701                 'highqual': 'mp4',
2702                 'lowqual': 'mp4',
2703         }
2704         IE_NAME = u'facebook'
2705
2706         def __init__(self, downloader=None):
2707                 InfoExtractor.__init__(self, downloader)
2708
2709         def _reporter(self, message):
2710                 """Add header and report message."""
2711                 self._downloader.to_screen(u'[facebook] %s' % message)
2712
2713         def report_login(self):
2714                 """Report attempt to log in."""
2715                 self._reporter(u'Logging in')
2716
2717         def report_video_webpage_download(self, video_id):
2718                 """Report attempt to download video webpage."""
2719                 self._reporter(u'%s: Downloading video webpage' % video_id)
2720
2721         def report_information_extraction(self, video_id):
2722                 """Report attempt to extract video information."""
2723                 self._reporter(u'%s: Extracting video information' % video_id)
2724
2725         def _parse_page(self, video_webpage):
2726                 """Extract video information from page"""
2727                 # General data
2728                 data = {'title': r'\("video_title", "(.*?)"\)',
2729                         'description': r'<div class="datawrap">(.*?)</div>',
2730                         'owner': r'\("video_owner_name", "(.*?)"\)',
2731                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2732                         }
2733                 video_info = {}
2734                 for piece in data.keys():
2735                         mobj = re.search(data[piece], video_webpage)
2736                         if mobj is not None:
2737                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2738
2739                 # Video urls
2740                 video_urls = {}
2741                 for fmt in self._available_formats:
2742                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2743                         if mobj is not None:
2744                                 # URL is in a Javascript segment inside an escaped Unicode format within
2745                                 # the generally utf-8 page
2746                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2747                 video_info['video_urls'] = video_urls
2748
2749                 return video_info
2750
2751         def _real_initialize(self):
2752                 if self._downloader is None:
2753                         return
2754
2755                 useremail = None
2756                 password = None
2757                 downloader_params = self._downloader.params
2758
2759                 # Attempt to use provided username and password or .netrc data
2760                 if downloader_params.get('username', None) is not None:
2761                         useremail = downloader_params['username']
2762                         password = downloader_params['password']
2763                 elif downloader_params.get('usenetrc', False):
2764                         try:
2765                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2766                                 if info is not None:
2767                                         useremail = info[0]
2768                                         password = info[2]
2769                                 else:
2770                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2771                         except (IOError, netrc.NetrcParseError), err:
2772                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2773                                 return
2774
2775                 if useremail is None:
2776                         return
2777
2778                 # Log in
2779                 login_form = {
2780                         'email': useremail,
2781                         'pass': password,
2782                         'login': 'Log+In'
2783                         }
2784                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2785                 try:
2786                         self.report_login()
2787                         login_results = urllib2.urlopen(request).read()
2788                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2789                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2790                                 return
2791                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2792                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2793                         return
2794
2795         def _real_extract(self, url):
2796                 mobj = re.match(self._VALID_URL, url)
2797                 if mobj is None:
2798                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2799                         return
2800                 video_id = mobj.group('ID')
2801
2802                 # Get video webpage
2803                 self.report_video_webpage_download(video_id)
2804                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2805                 try:
2806                         page = urllib2.urlopen(request)
2807                         video_webpage = page.read()
2808                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2809                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2810                         return
2811
2812                 # Start extracting information
2813                 self.report_information_extraction(video_id)
2814
2815                 # Extract information
2816                 video_info = self._parse_page(video_webpage)
2817
2818                 # uploader
2819                 if 'owner' not in video_info:
2820                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2821                         return
2822                 video_uploader = video_info['owner']
2823
2824                 # title
2825                 if 'title' not in video_info:
2826                         self._downloader.trouble(u'ERROR: unable to extract video title')
2827                         return
2828                 video_title = video_info['title']
2829                 video_title = video_title.decode('utf-8')
2830                 video_title = sanitize_title(video_title)
2831
2832                 simple_title = _simplify_title(video_title)
2833
2834                 # thumbnail image
2835                 if 'thumbnail' not in video_info:
2836                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2837                         video_thumbnail = ''
2838                 else:
2839                         video_thumbnail = video_info['thumbnail']
2840
2841                 # upload date
2842                 upload_date = u'NA'
2843                 if 'upload_date' in video_info:
2844                         upload_time = video_info['upload_date']
2845                         timetuple = email.utils.parsedate_tz(upload_time)
2846                         if timetuple is not None:
2847                                 try:
2848                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2849                                 except:
2850                                         pass
2851
2852                 # description
2853                 video_description = video_info.get('description', 'No description available.')
2854
2855                 url_map = video_info['video_urls']
2856                 if len(url_map.keys()) > 0:
2857                         # Decide which formats to download
2858                         req_format = self._downloader.params.get('format', None)
2859                         format_limit = self._downloader.params.get('format_limit', None)
2860
2861                         if format_limit is not None and format_limit in self._available_formats:
2862                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2863                         else:
2864                                 format_list = self._available_formats
2865                         existing_formats = [x for x in format_list if x in url_map]
2866                         if len(existing_formats) == 0:
2867                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2868                                 return
2869                         if req_format is None:
2870                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2871                         elif req_format == 'worst':
2872                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2873                         elif req_format == '-1':
2874                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2875                         else:
2876                                 # Specific format
2877                                 if req_format not in url_map:
2878                                         self._downloader.trouble(u'ERROR: requested format not available')
2879                                         return
2880                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2881
2882                 for format_param, video_real_url in video_url_list:
2883
2884                         # At this point we have a new video
2885                         self._downloader.increment_downloads()
2886
2887                         # Extension
2888                         video_extension = self._video_extensions.get(format_param, 'mp4')
2889
2890                         try:
2891                                 # Process video information
2892                                 self._downloader.process_info({
2893                                         'id':           video_id.decode('utf-8'),
2894                                         'url':          video_real_url.decode('utf-8'),
2895                                         'uploader':     video_uploader.decode('utf-8'),
2896                                         'upload_date':  upload_date,
2897                                         'title':        video_title,
2898                                         'stitle':       simple_title,
2899                                         'ext':          video_extension.decode('utf-8'),
2900                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2901                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2902                                         'description':  video_description.decode('utf-8'),
2903                                         'player_url':   None,
2904                                 })
2905                         except UnavailableVideoError, err:
2906                                 self._downloader.trouble(u'\nERROR: unable to download video')
2907
2908 class BlipTVIE(InfoExtractor):
2909         """Information extractor for blip.tv"""
2910
2911         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2912         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2913         IE_NAME = u'blip.tv'
2914
2915         def report_extraction(self, file_id):
2916                 """Report information extraction."""
2917                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2918
2919         def report_direct_download(self, title):
2920                 """Report information extraction."""
2921                 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2922
2923         def _real_extract(self, url):
2924                 mobj = re.match(self._VALID_URL, url)
2925                 if mobj is None:
2926                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2927                         return
2928
2929                 if '?' in url:
2930                         cchar = '&'
2931                 else:
2932                         cchar = '?'
2933                 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2934                 request = urllib2.Request(json_url)
2935                 self.report_extraction(mobj.group(1))
2936                 info = None
2937                 try:
2938                         urlh = urllib2.urlopen(request)
2939                         if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2940                                 basename = url.split('/')[-1]
2941                                 title,ext = os.path.splitext(basename)
2942                                 title = title.decode('UTF-8')
2943                                 ext = ext.replace('.', '')
2944                                 self.report_direct_download(title)
2945                                 info = {
2946                                         'id': title,
2947                                         'url': url,
2948                                         'title': title,
2949                                         'stitle': _simplify_title(title),
2950                                         'ext': ext,
2951                                         'urlhandle': urlh
2952                                 }
2953                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2954                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2955                         return
2956                 if info is None: # Regular URL
2957                         try:
2958                                 json_code = urlh.read()
2959                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2960                                 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
2961                                 return
2962
2963                         try:
2964                                 json_data = json.loads(json_code)
2965                                 if 'Post' in json_data:
2966                                         data = json_data['Post']
2967                                 else:
2968                                         data = json_data
2969         
2970                                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2971                                 video_url = data['media']['url']
2972                                 umobj = re.match(self._URL_EXT, video_url)
2973                                 if umobj is None:
2974                                         raise ValueError('Can not determine filename extension')
2975                                 ext = umobj.group(1)
2976         
2977                                 info = {
2978                                         'id': data['item_id'],
2979                                         'url': video_url,
2980                                         'uploader': data['display_name'],
2981                                         'upload_date': upload_date,
2982                                         'title': data['title'],
2983                                         'stitle': _simplify_title(data['title']),
2984                                         'ext': ext,
2985                                         'format': data['media']['mimeType'],
2986                                         'thumbnail': data['thumbnailUrl'],
2987                                         'description': data['description'],
2988                                         'player_url': data['embedUrl']
2989                                 }
2990                         except (ValueError,KeyError), err:
2991                                 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2992                                 return
2993
2994                 self._downloader.increment_downloads()
2995
2996                 try:
2997                         self._downloader.process_info(info)
2998                 except UnavailableVideoError, err:
2999                         self._downloader.trouble(u'\nERROR: unable to download video')
3000
3001
3002 class MyVideoIE(InfoExtractor):
3003         """Information Extractor for myvideo.de."""
3004
3005         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3006         IE_NAME = u'myvideo'
3007
3008         def __init__(self, downloader=None):
3009                 InfoExtractor.__init__(self, downloader)
3010         
3011         def report_download_webpage(self, video_id):
3012                 """Report webpage download."""
3013                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3014
3015         def report_extraction(self, video_id):
3016                 """Report information extraction."""
3017                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3018
3019         def _real_extract(self,url):
3020                 mobj = re.match(self._VALID_URL, url)
3021                 if mobj is None:
3022                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
3023                         return
3024
3025                 video_id = mobj.group(1)
3026
3027                 # Get video webpage
3028                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3029                 try:
3030                         self.report_download_webpage(video_id)
3031                         webpage = urllib2.urlopen(request).read()
3032                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3033                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3034                         return
3035
3036                 self.report_extraction(video_id)
3037                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3038                                  webpage)
3039                 if mobj is None:
3040                         self._downloader.trouble(u'ERROR: unable to extract media URL')
3041                         return
3042                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3043
3044                 mobj = re.search('<title>([^<]+)</title>', webpage)
3045                 if mobj is None:
3046                         self._downloader.trouble(u'ERROR: unable to extract title')
3047                         return
3048
3049                 video_title = mobj.group(1)
3050                 video_title = sanitize_title(video_title)
3051
3052                 simple_title = _simplify_title(video_title)
3053
3054                 try:
3055                         self._downloader.process_info({
3056                                 'id':           video_id,
3057                                 'url':          video_url,
3058                                 'uploader':     u'NA',
3059                                 'upload_date':  u'NA',
3060                                 'title':        video_title,
3061                                 'stitle':       simple_title,
3062                                 'ext':          u'flv',
3063                                 'format':       u'NA',
3064                                 'player_url':   None,
3065                         })
3066                 except UnavailableVideoError:
3067                         self._downloader.trouble(u'\nERROR: Unable to download video')
3068
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Accepts either a short alias (":tds", ":colbert", ...) or a full
	# /full-episodes/ URL for either show.
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	def report_extraction(self, episode_id):
		"""Report the start of information extraction."""
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

	def report_config_download(self, episode_id):
		"""Report that a per-media configuration XML is being downloaded."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		"""Report that the episode's MRSS index feed is being downloaded."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		"""Report that the Flash player URL is being resolved."""
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _real_extract(self, url):
		"""Download all acts of one episode.

		Short aliases are rewritten to the show's landing page, which
		redirects to the newest episode; each episode is split into
		several <item> entries in an MRSS feed, downloaded one by one.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# A short alias (e.g. ":tds") means "newest episode": rewrite to
		# the landing page and re-match so the named groups are populated.
		if mobj.group('shortname'):
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = u'http://www.thedailyshow.com/full-episodes/'
			else:
				url = u'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# With no explicit episode, the server redirect (inspected after
		# the fetch below) determines which episode we actually got.
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			return
		if dlNewest:
			# geturl() reflects the redirect target, i.e. the concrete
			# episode URL we were bounced to.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		# The embedded Flash player's "movie" param carries the
		# mtvnservices media URI used to locate the episode index.
		mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
			return

		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			# Resolve redirects so process_info receives the final player URL.
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
			return

		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
			return

		# One episode = several <item> acts; each is downloaded separately.
		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			# guid looks like "...:<show>.com:...:<shortMediaId>"
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
						urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
				return

			# Each <rendition> is one bitrate variant of this act.
			cdoc = xml.etree.ElementTree.fromstring(configXml)
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			# For now, just pick the highest bitrate
			# NOTE(review): assumes renditions appear in ascending bitrate
			# order in the feed -- confirm before relying on it.
			format,video_url = turls[-1]

			self._downloader.increment_downloads()

			effTitle = showId + u'-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'stitle': _simplify_title(effTitle),
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': playerUrl
			}

			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
				continue
3203
3204
class EscapistIE(InfoExtractor):
	"""Information extractor for The Escapist """

	_VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
	IE_NAME = u'escapist'

	def report_extraction(self, showName):
		"""Report the start of information extraction."""
		self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

	def report_config_download(self, showName):
		"""Report that the player configuration is being downloaded."""
		self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

	def _real_extract(self, url):
		"""Extract a video from an escapistmagazine.com episode page.

		Reads description/thumbnail/player from the page's <meta> tags,
		then fetches the player's JSON-ish config for the media URL.
		"""
		# Used to unescape HTML entities found in the meta tag contents.
		htmlParser = HTMLParser.HTMLParser()

		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		showName = mobj.group('showname')
		videoId = mobj.group('episode')

		self.report_extraction(showName)
		try:
			webPage = urllib2.urlopen(url).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
			return

		# NOTE(review): each of these searches is assumed to match; a page
		# missing one of the meta tags would raise AttributeError on the
		# .group(1) call rather than report a clean extraction error.
		descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
		description = htmlParser.unescape(descMatch.group(1))
		imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
		imgUrl = htmlParser.unescape(imgMatch.group(1))
		playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
		playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
		# The player URL embeds the percent-encoded config URL after "config=".
		configUrlMatch = re.search('config=(.*)$', playerUrl)
		configUrl = urllib2.unquote(configUrlMatch.group(1))

		self.report_config_download(showName)
		try:
			configJSON = urllib2.urlopen(configUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
			return

		# Technically, it's JavaScript, not JSON: single quotes must be
		# rewritten to double quotes before json.loads will accept it.
		configJSON = configJSON.replace("'", '"')

		try:
			config = json.loads(configJSON)
		except (ValueError,), err:
			self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
			return

		playlist = config['playlist']
		# NOTE(review): index 1 appears to be the actual episode (index 0
		# presumably an intro clip) -- confirm against the config layout.
		videoUrl = playlist[1]['url']

		self._downloader.increment_downloads()
		info = {
			'id': videoId,
			'url': videoUrl,
			'uploader': showName,
			'upload_date': None,
			'title': showName,
			'stitle': _simplify_title(showName),
			'ext': 'flv',
			'format': 'flv',
			'thumbnail': imgUrl,
			'description': description,
			'player_url': playerUrl,
		}

		try:
			self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3281
3282
3283 class CollegeHumorIE(InfoExtractor):
3284         """Information extractor for collegehumor.com"""
3285
3286         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3287         IE_NAME = u'collegehumor'
3288
3289         def report_webpage(self, video_id):
3290                 """Report information extraction."""
3291                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3292
3293         def report_extraction(self, video_id):
3294                 """Report information extraction."""
3295                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3296
3297         def _real_extract(self, url):
3298                 htmlParser = HTMLParser.HTMLParser()
3299
3300                 mobj = re.match(self._VALID_URL, url)
3301                 if mobj is None:
3302                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3303                         return
3304                 video_id = mobj.group('videoid')
3305
3306                 self.report_webpage(video_id)
3307                 request = urllib2.Request(url)
3308                 try:
3309                         webpage = urllib2.urlopen(request).read()
3310                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3311                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3312                         return
3313
3314                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3315                 if m is None:
3316                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3317                         return
3318                 internal_video_id = m.group('internalvideoid')
3319
3320                 info = {
3321                         'id': video_id,
3322                         'internal_id': internal_video_id,
3323                 }
3324
3325                 self.report_extraction(video_id)
3326                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3327                 try:
3328                         metaXml = urllib2.urlopen(xmlUrl).read()
3329                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3330                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3331                         return
3332
3333                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3334                 try:
3335                         videoNode = mdoc.findall('./video')[0]
3336                         info['description'] = videoNode.findall('./description')[0].text
3337                         info['title'] = videoNode.findall('./caption')[0].text
3338                         info['stitle'] = _simplify_title(info['title'])
3339                         info['url'] = videoNode.findall('./file')[0].text
3340                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3341                         info['ext'] = info['url'].rpartition('.')[2]
3342                         info['format'] = info['ext']
3343                 except IndexError:
3344                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3345                         return
3346
3347                 self._downloader.increment_downloads()
3348
3349                 try:
3350                         self._downloader.process_info(info)
3351                 except UnavailableVideoError, err:
3352                         self._downloader.trouble(u'\nERROR: unable to download video')
3353
3354
3355 class XVideosIE(InfoExtractor):
3356         """Information extractor for xvideos.com"""
3357
3358         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3359         IE_NAME = u'xvideos'
3360
3361         def report_webpage(self, video_id):
3362                 """Report information extraction."""
3363                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3364
3365         def report_extraction(self, video_id):
3366                 """Report information extraction."""
3367                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3368
3369         def _real_extract(self, url):
3370                 htmlParser = HTMLParser.HTMLParser()
3371
3372                 mobj = re.match(self._VALID_URL, url)
3373                 if mobj is None:
3374                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3375                         return
3376                 video_id = mobj.group(1).decode('utf-8')
3377
3378                 self.report_webpage(video_id)
3379
3380                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3381                 try:
3382                         webpage = urllib2.urlopen(request).read()
3383                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3384                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3385                         return
3386
3387                 self.report_extraction(video_id)
3388
3389
3390                 # Extract video URL
3391                 mobj = re.search(r'flv_url=(.+?)&', webpage)
3392                 if mobj is None:
3393                         self._downloader.trouble(u'ERROR: unable to extract video url')
3394                         return
3395                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3396
3397
3398                 # Extract title
3399                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3400                 if mobj is None:
3401                         self._downloader.trouble(u'ERROR: unable to extract video title')
3402                         return
3403                 video_title = mobj.group(1).decode('utf-8')
3404
3405
3406                 # Extract video thumbnail
3407                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3408                 if mobj is None:
3409                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3410                         return
3411                 video_thumbnail = mobj.group(1).decode('utf-8')
3412
3413
3414
3415                 self._downloader.increment_downloads()
3416                 info = {
3417                         'id': video_id,
3418                         'url': video_url,
3419                         'uploader': None,
3420                         'upload_date': None,
3421                         'title': video_title,
3422                         'stitle': _simplify_title(video_title),
3423                         'ext': 'flv',
3424                         'format': 'flv',
3425                         'thumbnail': video_thumbnail,
3426                         'description': None,
3427                         'player_url': None,
3428                 }
3429
3430                 try:
3431                         self._downloader.process_info(info)
3432                 except UnavailableVideoError, err:
3433                         self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3434
3435
3436 class SoundcloudIE(InfoExtractor):
3437         """Information extractor for soundcloud.com
3438            To access the media, the uid of the song and a stream token
3439            must be extracted from the page source and the script must make
3440            a request to media.soundcloud.com/crossdomain.xml. Then
3441            the media can be grabbed by requesting from an url composed
3442            of the stream token and uid
3443          """
3444
3445         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3446         IE_NAME = u'soundcloud'
3447
3448         def __init__(self, downloader=None):
3449                 InfoExtractor.__init__(self, downloader)
3450
3451         def report_webpage(self, video_id):
3452                 """Report information extraction."""
3453                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3454
3455         def report_extraction(self, video_id):
3456                 """Report information extraction."""
3457                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3458
3459         def _real_extract(self, url):
3460                 htmlParser = HTMLParser.HTMLParser()
3461
3462                 mobj = re.match(self._VALID_URL, url)
3463                 if mobj is None:
3464                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3465                         return
3466
3467                 # extract uploader (which is in the url)
3468                 uploader = mobj.group(1).decode('utf-8')
3469                 # extract simple title (uploader + slug of song title)
3470                 slug_title =  mobj.group(2).decode('utf-8')
3471                 simple_title = uploader + '-' + slug_title
3472
3473                 self.report_webpage('%s/%s' % (uploader, slug_title))
3474
3475                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3476                 try:
3477                         webpage = urllib2.urlopen(request).read()
3478                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3479                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3480                         return
3481
3482                 self.report_extraction('%s/%s' % (uploader, slug_title))
3483
3484                 # extract uid and stream token that soundcloud hands out for access
3485                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3486                 if mobj:
3487                         video_id = mobj.group(1)
3488                         stream_token = mobj.group(2)
3489
3490                 # extract unsimplified title
3491                 mobj = re.search('"title":"(.*?)",', webpage)
3492                 if mobj:
3493                         title = mobj.group(1)
3494
3495                 # construct media url (with uid/token)
3496                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3497                 mediaURL = mediaURL % (video_id, stream_token)
3498
3499                 # description
3500                 description = u'No description available'
3501                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3502                 if mobj:
3503                         description = mobj.group(1)
3504                 
3505                 # upload date
3506                 upload_date = None
3507                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3508                 if mobj:
3509                         try:
3510                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3511                         except Exception, e:
3512                                 print str(e)
3513
3514                 # for soundcloud, a request to a cross domain is required for cookies
3515                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3516
3517                 try:
3518                         self._downloader.process_info({
3519                                 'id':           video_id.decode('utf-8'),
3520                                 'url':          mediaURL,
3521                                 'uploader':     uploader.decode('utf-8'),
3522                                 'upload_date':  upload_date,
3523                                 'title':        simple_title.decode('utf-8'),
3524                                 'stitle':       simple_title.decode('utf-8'),
3525                                 'ext':          u'mp3',
3526                                 'format':       u'NA',
3527                                 'player_url':   None,
3528                                 'description': description.decode('utf-8')
3529                         })
3530                 except UnavailableVideoError:
3531                         self._downloader.trouble(u'\nERROR: unable to download video')
3532
3533
3534 class InfoQIE(InfoExtractor):
3535         """Information extractor for infoq.com"""
3536
3537         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3538         IE_NAME = u'infoq'
3539
3540         def report_webpage(self, video_id):
3541                 """Report information extraction."""
3542                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3543
3544         def report_extraction(self, video_id):
3545                 """Report information extraction."""
3546                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3547
3548         def _real_extract(self, url):
3549                 htmlParser = HTMLParser.HTMLParser()
3550
3551                 mobj = re.match(self._VALID_URL, url)
3552                 if mobj is None:
3553                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3554                         return
3555
3556                 self.report_webpage(url)
3557
3558                 request = urllib2.Request(url)
3559                 try:
3560                         webpage = urllib2.urlopen(request).read()
3561                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3562                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3563                         return
3564
3565                 self.report_extraction(url)
3566
3567
3568                 # Extract video URL
3569                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3570                 if mobj is None:
3571                         self._downloader.trouble(u'ERROR: unable to extract video url')
3572                         return
3573                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3574
3575
3576                 # Extract title
3577                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3578                 if mobj is None:
3579                         self._downloader.trouble(u'ERROR: unable to extract video title')
3580                         return
3581                 video_title = mobj.group(1).decode('utf-8')
3582
3583                 # Extract description
3584                 video_description = u'No description available.'
3585                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3586                 if mobj is not None:
3587                         video_description = mobj.group(1).decode('utf-8')
3588
3589                 video_filename = video_url.split('/')[-1]
3590                 video_id, extension = video_filename.split('.')
3591
3592                 self._downloader.increment_downloads()
3593                 info = {
3594                         'id': video_id,
3595                         'url': video_url,
3596                         'uploader': None,
3597                         'upload_date': None,
3598                         'title': video_title,
3599                         'stitle': _simplify_title(video_title),
3600                         'ext': extension,
3601                         'format': extension, # Extension is always(?) mp4, but seems to be flv
3602                         'thumbnail': None,
3603                         'description': video_description,
3604                         'player_url': None,
3605                 }
3606
3607                 try:
3608                         self._downloader.process_info(info)
3609                 except UnavailableVideoError, err:
3610                         self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3611
3612 class MixcloudIE(InfoExtractor):
3613         """Information extractor for www.mixcloud.com"""
3614         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3615         IE_NAME = u'mixcloud'
3616
3617         def __init__(self, downloader=None):
3618                 InfoExtractor.__init__(self, downloader)
3619
3620         def report_download_json(self, file_id):
3621                 """Report JSON download."""
3622                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3623
3624         def report_extraction(self, file_id):
3625                 """Report information extraction."""
3626                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3627
3628         def get_urls(self, jsonData, fmt, bitrate='best'):
3629                 """Get urls from 'audio_formats' section in json"""
3630                 file_url = None
3631                 try:
3632                         bitrate_list = jsonData[fmt]
3633                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3634                                 bitrate = max(bitrate_list) # select highest
3635
3636                         url_list = jsonData[fmt][bitrate]
3637                 except TypeError: # we have no bitrate info.
3638                         url_list = jsonData[fmt]
3639                                 
3640                 return url_list
3641
3642         def check_urls(self, url_list):
3643                 """Returns 1st active url from list"""
3644                 for url in url_list:
3645                         try:
3646                                 urllib2.urlopen(url)
3647                                 return url
3648                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3649                                 url = None
3650
3651                 return None
3652
3653         def _print_formats(self, formats):
3654                 print 'Available formats:'
3655                 for fmt in formats.keys():
3656                         for b in formats[fmt]:
3657                                 try:
3658                                         ext = formats[fmt][b][0]
3659                                         print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3660                                 except TypeError: # we have no bitrate info
3661                                         ext = formats[fmt][0]
3662                                         print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3663                                         break
3664
3665         def _real_extract(self, url):
3666                 mobj = re.match(self._VALID_URL, url)
3667                 if mobj is None:
3668                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3669                         return
3670                 # extract uploader & filename from url
3671                 uploader = mobj.group(1).decode('utf-8')
3672                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3673
3674                 # construct API request
3675                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3676                 # retrieve .json file with links to files
3677                 request = urllib2.Request(file_url)
3678                 try:
3679                         self.report_download_json(file_url)
3680                         jsonData = urllib2.urlopen(request).read()
3681                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3682                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3683                         return
3684
3685                 # parse JSON
3686                 json_data = json.loads(jsonData)
3687                 player_url = json_data['player_swf_url']
3688                 formats = dict(json_data['audio_formats'])
3689
3690                 req_format = self._downloader.params.get('format', None)
3691                 bitrate = None
3692
3693                 if self._downloader.params.get('listformats', None):
3694                         self._print_formats(formats)
3695                         return
3696
3697                 if req_format is None or req_format == 'best':
3698                         for format_param in formats.keys():
3699                                 url_list = self.get_urls(formats, format_param)
3700                                 # check urls
3701                                 file_url = self.check_urls(url_list)
3702                                 if file_url is not None:
3703                                         break # got it!
3704                 else:
3705                         if req_format not in formats.keys():
3706                                 self._downloader.trouble(u'ERROR: format is not available')
3707                                 return
3708
3709                         url_list = self.get_urls(formats, req_format)
3710                         file_url = self.check_urls(url_list)
3711                         format_param = req_format
3712
3713                 # We have audio
3714                 self._downloader.increment_downloads()
3715                 try:
3716                         # Process file information
3717                         self._downloader.process_info({
3718                                 'id':           file_id.decode('utf-8'),
3719                                 'url':          file_url.decode('utf-8'),
3720                                 'uploader':     uploader.decode('utf-8'),
3721                                 'upload_date':  u'NA',
3722                                 'title':        json_data['name'],
3723                                 'stitle':       _simplify_title(json_data['name']),
3724                                 'ext':          file_url.split('.')[-1].decode('utf-8'),
3725                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
3726                                 'thumbnail':    json_data['thumbnail_url'],
3727                                 'description':  json_data['description'],
3728                                 'player_url':   player_url.decode('utf-8'),
3729                         })
3730                 except UnavailableVideoError, err:
3731                         self._downloader.trouble(u'ERROR: unable to download file')
3732
3733
3734
class PostProcessor(object):
	"""Base class for all post processors.

	Instances are attached to a downloader via its add_post_processor()
	method. After each successful download the downloader walks its chain
	of PostProcessors, feeding each one's run() return value into the
	next. A run() that returns None terminates the chain early; reaching
	the end of the chain terminates it normally.

	Like InfoExtractor objects, PostProcessors register mutually with
	their downloader.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach this post processor to the given downloader."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file.

		'information' is an InfoExtractor-style dictionary with one extra
		key, "filepath", naming the downloaded file on disk.

		Return None to stop the postprocessing chain, or an information
		dictionary (possibly with modified fields) to pass along to the
		next PostProcessor. Implementations may also raise
		PostProcessingError, which the calling downloader handles.
		"""
		# Default behaviour: pass the information through untouched.
		return information
3780
3781
class FFmpegExtractAudioPP(PostProcessor):
	"""Post processor that extracts the audio track of a downloaded video
	into a standalone audio file using the external ffmpeg/ffprobe tools."""

	def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec      # 'best', 'aac', 'mp3' or 'vorbis'
		self._preferredquality = preferredquality  # ffmpeg -ab bitrate spec, e.g. '128K'
		self._keepvideo = keepvideo                # keep the source video after extraction

	@staticmethod
	def get_audio_codec(path):
		"""Return the codec name of the audio stream in the file at path,
		or None when ffprobe fails or reports no audio stream."""
		try:
			cmd = ['ffprobe', '-show_streams', '--', path]
			# open() instead of the py2-only file() builtin; close the
			# devnull handle instead of leaking it.
			devnull = open(os.path.devnull, 'w')
			try:
				handle = subprocess.Popen(cmd, stderr=devnull, stdout=subprocess.PIPE)
				output = handle.communicate()[0]
			finally:
				devnull.close()
			if handle.wait() != 0:
				return None
		except (IOError, OSError):
			return None
		audio_codec = None
		# ffprobe prints codec_name before codec_type within each stream
		# section, so remember the last codec_name seen and report it when
		# the matching codec_type turns out to be audio.
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Transcode path into out_path with the given codec and extra
		ffmpeg options; return True on success, False otherwise."""
		try:
			devnull = open(os.path.devnull, 'w')
			try:
				cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
				ret = subprocess.call(cmd, stdout=devnull, stderr=subprocess.STDOUT)
			finally:
				devnull.close()
			return (ret == 0)
		except (IOError, OSError):
			return False

	def run(self, information):
		"""Extract audio from information['filepath'].

		Returns the updated information dict pointing at the new audio
		file, or None on failure (stopping the postprocessing chain).
		"""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
			if filecodec in ['aac', 'mp3', 'vorbis']:
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					more_opts = ['-f', 'adts']
				if filecodec == 'vorbis':
					extension = 'ogg'
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = []
				if self._preferredquality is not None:
					more_opts += ['-ab', self._preferredquality]
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'vorbis': 'libvorbis'}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = []
			if self._preferredquality is not None:
				more_opts += ['-ab', self._preferredquality]
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']
			if self._preferredcodec == 'vorbis':
				extension = 'ogg'

		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		status = self.run_ffmpeg(path, new_path, acodec, more_opts)

		if not status:
			self._downloader.to_stderr(u'WARNING: error running ffmpeg')
			return None

		# Try to update the date time for extracted audio file.
		if information.get('filetime') is not None:
			try:
				os.utime(new_path, (time.time(), information['filetime']))
			except Exception:
				# Best effort only; do not let KeyboardInterrupt/SystemExit
				# be swallowed by the former bare except.
				self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

		if not self._keepvideo:
			try:
				os.remove(path)
			except (IOError, OSError):
				self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
				return None

		information['filepath'] = new_path
		return information
3881
3882
3883 def updateSelf(downloader, filename):
3884         ''' Update the program file with the latest version from the repository '''
3885         # Note: downloader only used for options
3886         if not os.access(filename, os.W_OK):
3887                 sys.exit('ERROR: no write permissions on %s' % filename)
3888
3889         downloader.to_screen('Updating to latest version...')
3890
3891         try:
3892                 try:
3893                         urlh = urllib.urlopen(UPDATE_URL)
3894                         newcontent = urlh.read()
3895                         
3896                         vmatch = re.search("__version__ = '([^']+)'", newcontent)
3897                         if vmatch is not None and vmatch.group(1) == __version__:
3898                                 downloader.to_screen('youtube-dl is up-to-date (' + __version__ + ')')
3899                                 return
3900                 finally:
3901                         urlh.close()
3902         except (IOError, OSError), err:
3903                 sys.exit('ERROR: unable to download latest version')
3904
3905         try:
3906                 outf = open(filename, 'wb')
3907                 try:
3908                         outf.write(newcontent)
3909                 finally:
3910                         outf.close()
3911         except (IOError, OSError), err:
3912                 sys.exit('ERROR: unable to overwrite current version')
3913
3914         downloader.to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.')
3915
def parseOpts():
	"""Build the command line parser and parse sys.argv.

	Returns a (parser, opts, args) tuple so callers can both read the
	parsed options and report errors through parser.error().
	"""
	# Deferred imports
	import getpass
	import optparse

	def _format_option_string(option):
		''' ('-o', '--option') -> -o, --format METAVAR'''

		opts = []

		if option._short_opts: opts.append(option._short_opts[0])
		if option._long_opts: opts.append(option._long_opts[0])
		if len(opts) > 1: opts.insert(1, ', ')

		if option.takes_value(): opts.append(' %s' % option.metavar)

		return "".join(opts)

	def _find_term_columns():
		'''Best-effort terminal width detection; returns None if unknown.'''
		columns = os.environ.get('COLUMNS', None)
		if columns:
			try:
				return int(columns)
			except ValueError:
				pass # non-numeric COLUMNS value; fall back to stty

		try:
			sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			out,err = sp.communicate()
			return int(out.split()[1])
		except (OSError, ValueError, IndexError):
			# stty missing, not a terminal, or unparsable output; the old
			# bare except also swallowed KeyboardInterrupt/SystemExit.
			pass
		return None

	max_width = 80
	max_help_position = 80

	# No need to wrap help messages if we're on a wide console
	columns = _find_term_columns()
	if columns: max_width = columns

	fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
	fmt.format_option_strings = _format_option_string

	kw = {
		'version'   : __version__,
		'formatter' : fmt,
		'usage' : '%prog [options] url [url...]',
		'conflict_handler' : 'resolve',
	}

	parser = optparse.OptionParser(**kw)

	# option groups
	general        = optparse.OptionGroup(parser, 'General Options')
	selection      = optparse.OptionGroup(parser, 'Video Selection')
	authentication = optparse.OptionGroup(parser, 'Authentication Options')
	video_format   = optparse.OptionGroup(parser, 'Video Format Options')
	postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
	filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
	verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

	general.add_option('-h', '--help',
			action='help', help='print this help text and exit')
	general.add_option('-v', '--version',
			action='version', help='print program version and exit')
	general.add_option('-U', '--update',
			action='store_true', dest='update_self', help='update this program to latest version')
	general.add_option('-i', '--ignore-errors',
			action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
	general.add_option('-r', '--rate-limit',
			dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
	general.add_option('-R', '--retries',
			dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
	general.add_option('--dump-user-agent',
			action='store_true', dest='dump_user_agent',
			help='display the current browser identification', default=False)
	general.add_option('--list-extractors',
			action='store_true', dest='list_extractors',
			help='List all supported extractors and the URLs they would handle', default=False)

	selection.add_option('--playlist-start',
			dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
	selection.add_option('--playlist-end',
			dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
	selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
	selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')

	authentication.add_option('-u', '--username',
			dest='username', metavar='USERNAME', help='account username')
	authentication.add_option('-p', '--password',
			dest='password', metavar='PASSWORD', help='account password')
	authentication.add_option('-n', '--netrc',
			action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)


	video_format.add_option('-f', '--format',
			action='store', dest='format', metavar='FORMAT', help='video format code')
	video_format.add_option('--all-formats',
			action='store_const', dest='format', help='download all available video formats', const='all')
	video_format.add_option('--max-quality',
			action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
	video_format.add_option('-F', '--list-formats',
			action='store_true', dest='listformats', help='list all available formats (currently youtube only)')


	verbosity.add_option('-q', '--quiet',
			action='store_true', dest='quiet', help='activates quiet mode', default=False)
	verbosity.add_option('-s', '--simulate',
			action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
	verbosity.add_option('--skip-download',
			action='store_true', dest='skip_download', help='do not download the video', default=False)
	verbosity.add_option('-g', '--get-url',
			action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
	verbosity.add_option('-e', '--get-title',
			action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
	verbosity.add_option('--get-thumbnail',
			action='store_true', dest='getthumbnail',
			help='simulate, quiet but print thumbnail URL', default=False)
	verbosity.add_option('--get-description',
			action='store_true', dest='getdescription',
			help='simulate, quiet but print video description', default=False)
	verbosity.add_option('--get-filename',
			action='store_true', dest='getfilename',
			help='simulate, quiet but print output filename', default=False)
	verbosity.add_option('--get-format',
			action='store_true', dest='getformat',
			help='simulate, quiet but print output format', default=False)
	verbosity.add_option('--no-progress',
			action='store_true', dest='noprogress', help='do not print progress bar', default=False)
	verbosity.add_option('--console-title',
			action='store_true', dest='consoletitle',
			help='display progress in console titlebar', default=False)


	filesystem.add_option('-t', '--title',
			action='store_true', dest='usetitle', help='use title in file name', default=False)
	filesystem.add_option('-l', '--literal',
			action='store_true', dest='useliteral', help='use literal title in file name', default=False)
	filesystem.add_option('-A', '--auto-number',
			action='store_true', dest='autonumber',
			help='number downloaded files starting from 00000', default=False)
	filesystem.add_option('-o', '--output',
			dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, and %% for a literal percent')
	filesystem.add_option('-a', '--batch-file',
			dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
	filesystem.add_option('-w', '--no-overwrites',
			action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
	filesystem.add_option('-c', '--continue',
			action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
	filesystem.add_option('--no-continue',
			action='store_false', dest='continue_dl',
			help='do not resume partially downloaded files (restart from beginning)')
	filesystem.add_option('--cookies',
			dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
	filesystem.add_option('--no-part',
			action='store_true', dest='nopart', help='do not use .part files', default=False)
	filesystem.add_option('--no-mtime',
			action='store_false', dest='updatetime',
			help='do not use the Last-modified header to set the file modification time', default=True)
	filesystem.add_option('--write-description',
			action='store_true', dest='writedescription',
			help='write video description to a .description file', default=False)
	filesystem.add_option('--write-info-json',
			action='store_true', dest='writeinfojson',
			help='write video metadata to a .info.json file', default=False)


	postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
			help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
	postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
			help='"best", "aac", "vorbis" or "mp3"; best by default')
	postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
			help='ffmpeg audio bitrate specification, 128k by default')
	postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
			help='keeps the video file on disk after the post-processing; the video is erased by default')


	parser.add_option_group(general)
	parser.add_option_group(selection)
	parser.add_option_group(filesystem)
	parser.add_option_group(verbosity)
	parser.add_option_group(video_format)
	parser.add_option_group(authentication)
	parser.add_option_group(postproc)

	opts, args = parser.parse_args()

	return parser, opts, args
4102
def gen_extractors():
	""" Return a list of an instance of every supported extractor.
	The order does matter; the first extractor matched is the one handling the URL.
	"""
	youtube_ie = YoutubeIE()
	google_ie = GoogleIE()
	yahoo_ie = YahooIE()

	# Extractors that share a base instance come first, in matching order.
	youtube_related = [
		YoutubePlaylistIE(youtube_ie),
		YoutubeUserIE(youtube_ie),
		YoutubeSearchIE(youtube_ie),
		youtube_ie,
		MetacafeIE(youtube_ie),
	]
	standalone = [
		DailymotionIE(),
		google_ie,
		GoogleSearchIE(google_ie),
		PhotobucketIE(),
		yahoo_ie,
		YahooSearchIE(yahoo_ie),
		DepositFilesIE(),
		FacebookIE(),
		BlipTVIE(),
		VimeoIE(),
		MyVideoIE(),
		ComedyCentralIE(),
		EscapistIE(),
		CollegeHumorIE(),
		XVideosIE(),
		SoundcloudIE(),
		InfoQIE(),
		MixcloudIE(),
	]

	# GenericIE must stay last: it matches almost any URL.
	return youtube_related + standalone + [GenericIE()]
4137
4138 def _real_main():
4139         parser, opts, args = parseOpts()
4140
4141         # Open appropriate CookieJar
4142         if opts.cookiefile is None:
4143                 jar = cookielib.CookieJar()
4144         else:
4145                 try:
4146                         jar = cookielib.MozillaCookieJar(opts.cookiefile)
4147                         if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4148                                 jar.load()
4149                 except (IOError, OSError), err:
4150                         sys.exit(u'ERROR: unable to open cookie file')
4151
4152         # Dump user agent
4153         if opts.dump_user_agent:
4154                 print std_headers['User-Agent']
4155                 sys.exit(0)
4156
4157         # Batch file verification
4158         batchurls = []
4159         if opts.batchfile is not None:
4160                 try:
4161                         if opts.batchfile == '-':
4162                                 batchfd = sys.stdin
4163                         else:
4164                                 batchfd = open(opts.batchfile, 'r')
4165                         batchurls = batchfd.readlines()
4166                         batchurls = [x.strip() for x in batchurls]
4167                         batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4168                 except IOError:
4169                         sys.exit(u'ERROR: batch file could not be read')
4170         all_urls = batchurls + args
4171
4172         # General configuration
4173         cookie_processor = urllib2.HTTPCookieProcessor(jar)
4174         opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
4175         urllib2.install_opener(opener)
4176         socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
4177
4178         extractors = gen_extractors()
4179
4180         if opts.list_extractors:
4181                 for ie in extractors:
4182                         print(ie.IE_NAME)
4183                         matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
4184                         all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4185                         for mu in matchedUrls:
4186                                 print(u'  ' + mu)
4187                 sys.exit(0)
4188
4189         # Conflicting, missing and erroneous options
4190         if opts.usenetrc and (opts.username is not None or opts.password is not None):
4191                 parser.error(u'using .netrc conflicts with giving username/password')
4192         if opts.password is not None and opts.username is None:
4193                 parser.error(u'account username missing')
4194         if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4195                 parser.error(u'using output template conflicts with using title, literal title or auto number')
4196         if opts.usetitle and opts.useliteral:
4197                 parser.error(u'using title conflicts with using literal title')
4198         if opts.username is not None and opts.password is None:
4199                 opts.password = getpass.getpass(u'Type account password and press return:')
4200         if opts.ratelimit is not None:
4201                 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4202                 if numeric_limit is None:
4203                         parser.error(u'invalid rate limit specified')
4204                 opts.ratelimit = numeric_limit
4205         if opts.retries is not None:
4206                 try:
4207                         opts.retries = long(opts.retries)
4208                 except (TypeError, ValueError), err:
4209                         parser.error(u'invalid retry count specified')
4210         try:
4211                 opts.playliststart = int(opts.playliststart)
4212                 if opts.playliststart <= 0:
4213                         raise ValueError(u'Playlist start must be positive')
4214         except (TypeError, ValueError), err:
4215                 parser.error(u'invalid playlist start number specified')
4216         try:
4217                 opts.playlistend = int(opts.playlistend)
4218                 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
4219                         raise ValueError(u'Playlist end must be greater than playlist start')
4220         except (TypeError, ValueError), err:
4221                 parser.error(u'invalid playlist end number specified')
4222         if opts.extractaudio:
4223                 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis']:
4224                         parser.error(u'invalid audio format specified')
4225
4226         # File downloader
4227         fd = FileDownloader({
4228                 'usenetrc': opts.usenetrc,
4229                 'username': opts.username,
4230                 'password': opts.password,
4231                 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4232                 'forceurl': opts.geturl,
4233                 'forcetitle': opts.gettitle,
4234                 'forcethumbnail': opts.getthumbnail,
4235                 'forcedescription': opts.getdescription,
4236                 'forcefilename': opts.getfilename,
4237                 'forceformat': opts.getformat,
4238                 'simulate': opts.simulate,
4239                 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4240                 'format': opts.format,
4241                 'format_limit': opts.format_limit,
4242                 'listformats': opts.listformats,
4243                 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
4244                         or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4245                         or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
4246                         or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
4247                         or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4248                         or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4249                         or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
4250                         or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
4251                         or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
4252                         or u'%(id)s.%(ext)s'),
4253                 'ignoreerrors': opts.ignoreerrors,
4254                 'ratelimit': opts.ratelimit,
4255                 'nooverwrites': opts.nooverwrites,
4256                 'retries': opts.retries,
4257                 'continuedl': opts.continue_dl,
4258                 'noprogress': opts.noprogress,
4259                 'playliststart': opts.playliststart,
4260                 'playlistend': opts.playlistend,
4261                 'logtostderr': opts.outtmpl == '-',
4262                 'consoletitle': opts.consoletitle,
4263                 'nopart': opts.nopart,
4264                 'updatetime': opts.updatetime,
4265                 'writedescription': opts.writedescription,
4266                 'writeinfojson': opts.writeinfojson,
4267                 'matchtitle': opts.matchtitle,
4268                 'rejecttitle': opts.rejecttitle,
4269                 })
4270         for extractor in extractors:
4271                 fd.add_info_extractor(extractor)
4272
4273         # PostProcessors
4274         if opts.extractaudio:
4275                 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
4276
4277         # Update version
4278         if opts.update_self:
4279                 updateSelf(fd, sys.argv[0])
4280
4281         # Maybe do nothing
4282         if len(all_urls) < 1:
4283                 if not opts.update_self:
4284                         parser.error(u'you must provide at least one URL')
4285                 else:
4286                         sys.exit()
4287         retcode = fd.download(all_urls)
4288
4289         # Dump cookie jar if requested
4290         if opts.cookiefile is not None:
4291                 try:
4292                         jar.save()
4293                 except (IOError, OSError), err:
4294                         sys.exit(u'ERROR: unable to save cookie jar')
4295
4296         sys.exit(retcode)
4297
def main():
	"""Run _real_main(), translating the known top-level exceptions into
	the appropriate process exit status or error message."""
	try:
		_real_main()
	except DownloadError:
		exit_arg = 1
	except SameFileError:
		exit_arg = u'ERROR: fixed output name but more than one file to download'
	except KeyboardInterrupt:
		exit_arg = u'\nERROR: Interrupted by user'
	else:
		return
	sys.exit(exit_arg)
4307
# Run the command-line interface only when executed as a script, not when
# imported as a module.
if __name__ == '__main__':
	main()

# vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: