d4eadc9059d36a2f396bc207cfb13826eb4f15b4
[youtube-dl.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 __author__  = (
5         'Ricardo Garcia Gonzalez',
6         'Danny Colligan',
7         'Benjamin Johnson',
8         'Vasyl\' Vavrychuk',
9         'Witold Baryluk',
10         'Paweł Paprota',
11         'Gergely Imreh',
12         'Rogério Brito',
13         'Philipp Hagemeister',
14         'Sören Schulze',
15         'Kevin Ngo',
16         'Ori Avtalion',
17         )
18
19 __license__ = 'Public Domain'
20 __version__ = '2011.10.19'
21
22 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
23
24 import cookielib
25 import datetime
26 import gzip
27 import htmlentitydefs
28 import HTMLParser
29 import httplib
30 import locale
31 import math
32 import netrc
33 import os
34 import os.path
35 import re
36 import socket
37 import string
38 import subprocess
39 import sys
40 import time
41 import urllib
42 import urllib2
43 import warnings
44 import zlib
45
46 if os.name == 'nt':
47         import ctypes
48
49 try:
50         import email.utils
51 except ImportError: # Python 2.4
52         import email.Utils
53 try:
54         import cStringIO as StringIO
55 except ImportError:
56         import StringIO
57
58 # parse_qs was moved from the cgi module to the urlparse module recently.
59 try:
60         from urlparse import parse_qs
61 except ImportError:
62         from cgi import parse_qs
63
64 try:
65         import lxml.etree
66 except ImportError:
67         pass # Handled below
68
69 try:
70         import xml.etree.ElementTree
71 except ImportError: # Python<2.5: Not officially supported, but let it slip
72         warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
73
# Default HTTP headers merged into every outgoing request by
# YoutubeDLHandler.http_request(); they mimic a desktop Firefox browser.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-us,en;q=0.5',
}
81
# JSON support: prefer the stdlib module (Python >= 2.6).
try:
        import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
        import re
        class json(object):
                """Minimal pure-Python JSON decoder exposing only json.loads()."""
                @staticmethod
                def loads(s):
                        """Parse a UTF-8 byte string as JSON and return the value.

                        Each parse* helper takes an index into s and returns
                        (next_index, parsed_value); raiseError() aborts with a
                        ValueError that pins down the offending position.
                        """
                        s = s.decode('UTF-8')
                        def raiseError(msg, i):
                                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
                        def skipSpace(i, expectMore=True):
                                # Advance past whitespace; when expectMore, a
                                # premature end of input is an error.
                                while i < len(s) and s[i] in ' \t\r\n':
                                        i += 1
                                if expectMore:
                                        if i >= len(s):
                                                raiseError('Premature end', i)
                                return i
                        def decodeEscape(match):
                                # Translate one backslash escape (group 1 of the
                                # rexp below) into its character.
                                esc = match.group(1)
                                _STATIC = {
                                        '"': '"',
                                        '\\': '\\',
                                        '/': '/',
                                        'b': unichr(0x8),
                                        'f': unichr(0xc),
                                        'n': '\n',
                                        'r': '\r',
                                        't': '\t',
                                }
                                if esc in _STATIC:
                                        return _STATIC[esc]
                                if esc[0] == 'u':
                                        if len(esc) == 1+4:
                                                # Plain \uXXXX escape.
                                                return unichr(int(esc[1:5], 16))
                                        if len(esc) == 5+6 and esc[5:7] == '\\u':
                                                # UTF-16 surrogate pair escape,
                                                # combined into one code point.
                                                hi = int(esc[1:5], 16)
                                                low = int(esc[7:11], 16)
                                                return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                                raise ValueError('Unknown escape ' + str(esc))
                        def parseString(i):
                                # i points at the opening quote. Find the closing
                                # quote, skipping quotes preceded by an odd number
                                # of backslashes (i.e. escaped quotes).
                                i += 1
                                e = i
                                while True:
                                        e = s.index('"', e)
                                        bslashes = 0
                                        while s[e-bslashes-1] == '\\':
                                                bslashes += 1
                                        if bslashes % 2 == 1:
                                                e += 1
                                                continue
                                        break
                                # Resolve escapes (surrogate pairs first) in one pass.
                                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                                stri = rexp.sub(decodeEscape, s[i:e])
                                return (e+1,stri)
                        def parseObj(i):
                                # i points at '{'. Parse "key": value pairs.
                                i += 1
                                res = {}
                                i = skipSpace(i)
                                if s[i] == '}': # Empty dictionary
                                        return (i+1,res)
                                while True:
                                        if s[i] != '"':
                                                raiseError('Expected a string object key', i)
                                        i,key = parseString(i)
                                        i = skipSpace(i)
                                        if i >= len(s) or s[i] != ':':
                                                raiseError('Expected a colon', i)
                                        i,val = parse(i+1)
                                        res[key] = val
                                        i = skipSpace(i)
                                        if s[i] == '}':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected comma or closing curly brace', i)
                                        i = skipSpace(i+1)
                        def parseArray(i):
                                # i points at '['. Parse comma-separated values.
                                res = []
                                i = skipSpace(i+1)
                                if s[i] == ']': # Empty array
                                        return (i+1,res)
                                while True:
                                        i,val = parse(i)
                                        res.append(val)
                                        i = skipSpace(i) # Raise exception if premature end
                                        if s[i] == ']':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected a comma or closing bracket', i)
                                        i = skipSpace(i+1)
                        def parseDiscrete(i):
                                # The three keyword literals.
                                for k,v in {'true': True, 'false': False, 'null': None}.items():
                                        if s.startswith(k, i):
                                                return (i+len(k), v)
                                raiseError('Not a boolean (or null)', i)
                        def parseNumber(i):
                                # JSON number grammar; int unless it has a
                                # fraction or exponent part.
                                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                                if mobj is None:
                                        raiseError('Not a number', i)
                                nums = mobj.group(1)
                                if '.' in nums or 'e' in nums or 'E' in nums:
                                        return (i+len(nums), float(nums))
                                return (i+len(nums), int(nums))
                        # Dispatch on the first character; anything else must be a number.
                        CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
                        def parse(i):
                                i = skipSpace(i)
                                i,res = CHARMAP.get(s[i], parseNumber)(i)
                                i = skipSpace(i, False)
                                return (i,res)
                        i,res = parse(0)
                        if i < len(s):
                                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
                        return res
194
def preferredencoding():
        """Get preferred encoding.

        Returns the best encoding scheme for the system, based on
        locale.getpreferredencoding() and some further tweaks: if the
        reported codec is unknown or cannot encode text, fall back to
        UTF-8.
        """
        # The original wrapped this in a single-use infinite generator and
        # called .next() on it, which added nothing; a plain try/except is
        # equivalent. The bare "except:" is narrowed to Exception so that
        # KeyboardInterrupt/SystemExit are not swallowed.
        try:
                pref = locale.getpreferredencoding()
                # Verify the codec exists and works before trusting it.
                u'TEST'.encode(pref)
        except Exception:
                pref = 'UTF-8'
        return pref
210
211
212 def htmlentity_transform(matchobj):
213         """Transforms an HTML entity to a Unicode character.
214
215         This function receives a match object and is intended to be used with
216         the re.sub() function.
217         """
218         entity = matchobj.group(1)
219
220         # Known non-numeric HTML entity
221         if entity in htmlentitydefs.name2codepoint:
222                 return unichr(htmlentitydefs.name2codepoint[entity])
223
224         # Unicode character
225         mobj = re.match(ur'(?u)#(x?\d+)', entity)
226         if mobj is not None:
227                 numstr = mobj.group(1)
228                 if numstr.startswith(u'x'):
229                         base = 16
230                         numstr = u'0%s' % numstr
231                 else:
232                         base = 10
233                 return unichr(long(numstr, base))
234
235         # Unknown entity in name, return its literal representation
236         return (u'&%s;' % entity)
237
238
239 def sanitize_title(utitle):
240         """Sanitizes a video title so it could be used as part of a filename."""
241         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
242         return utitle.replace(unicode(os.sep), u'%')
243
244
245 def sanitize_open(filename, open_mode):
246         """Try to open the given filename, and slightly tweak it if this fails.
247
248         Attempts to open the given filename. If this fails, it tries to change
249         the filename slightly, step by step, until it's either able to open it
250         or it fails and raises a final exception, like the standard open()
251         function.
252
253         It returns the tuple (stream, definitive_file_name).
254         """
255         try:
256                 if filename == u'-':
257                         if sys.platform == 'win32':
258                                 import msvcrt
259                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
260                         return (sys.stdout, filename)
261                 stream = open(filename, open_mode)
262                 return (stream, filename)
263         except (IOError, OSError), err:
264                 # In case of error, try to remove win32 forbidden chars
265                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
266
267                 # An exception here should be caught in the caller
268                 stream = open(filename, open_mode)
269                 return (stream, filename)
270
271
def timeconvert(timestr):
        """Convert RFC 2822 defined time string into system timestamp"""
        parsed = email.utils.parsedate_tz(timestr)
        if parsed is None:
                # Unparsable date string.
                return None
        return email.utils.mktime_tz(parsed)
279
280 def _simplify_title(title):
281         return re.sub(ur'[^\w\d_\-]+', u'_', title).strip(u'_')
282
class DownloadError(Exception):
        """Download Error exception.

        Raised by FileDownloader objects that are not configured to
        continue on errors; carries the appropriate error message.
        """
        pass
291
292
class SameFileError(Exception):
        """Same File exception.

        Raised by FileDownloader objects when they detect that multiple
        files would have to be downloaded to the same file on disk.
        """
        pass
300
301
class PostProcessingError(Exception):
        """Post Processing exception.

        Raised by a PostProcessor's .run() method to indicate an error in
        the postprocessing task.
        """
        pass
309
310
class UnavailableVideoError(Exception):
        """Unavailable Format exception.

        Raised when a video is requested in a format that is not available
        for that video.
        """
        pass
318
319
class ContentTooShortError(Exception):
        """Content Too Short exception.

        Raised by FileDownloader objects when the file they download is
        smaller than what the server announced first, indicating the
        connection was probably interrupted.
        """
        # Both counters are in bytes; instance values are set in __init__.
        downloaded = None
        expected = None

        def __init__(self, downloaded, expected):
                self.downloaded = downloaded
                self.expected = expected
334
335
class YoutubeDLHandler(urllib2.HTTPHandler):
        """Handler for HTTP requests and responses.

        This class, when installed with an OpenerDirector, automatically adds
        the standard headers to every HTTP request and handles gzipped and
        deflated responses from web servers. If compression is to be avoided in
        a particular request, the original request in the program code only has
        to include the HTTP header "Youtubedl-No-Compression", which will be
        removed before making the real request.

        Part of this code was copied from:

        http://techknack.net/python-urllib2-handlers/

        Andrew Rowls, the author of that code, agreed to release it to the
        public domain.
        """

        @staticmethod
        def deflate(data):
                # Some servers send raw deflate data (no zlib header); try
                # that first, then fall back to a zlib-wrapped stream.
                try:
                        return zlib.decompress(data, -zlib.MAX_WBITS)
                except zlib.error:
                        return zlib.decompress(data)

        @staticmethod
        def addinfourl_wrapper(stream, headers, url, code):
                # Newer Pythons (with addinfourl.getcode) accept the status
                # code in the constructor; emulate that on older versions by
                # setting .code after construction.
                if hasattr(urllib2.addinfourl, 'getcode'):
                        return urllib2.addinfourl(stream, headers, url, code)
                ret = urllib2.addinfourl(stream, headers, url)
                ret.code = code
                return ret

        def http_request(self, req):
                # Force the standard headers, replacing any caller-supplied
                # values for the same header names.
                for h in std_headers:
                        if h in req.headers:
                                del req.headers[h]
                        req.add_header(h, std_headers[h])
                # Internal marker header: strip it and drop Accept-encoding
                # so the server sends an uncompressed response.
                if 'Youtubedl-no-compression' in req.headers:
                        if 'Accept-encoding' in req.headers:
                                del req.headers['Accept-encoding']
                        del req.headers['Youtubedl-no-compression']
                return req

        def http_response(self, req, resp):
                # Transparently decompress the body, preserving the original
                # headers, URL, status code and message.
                old_resp = resp
                # gzip
                if resp.headers.get('Content-encoding', '') == 'gzip':
                        gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
                        resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
                        resp.msg = old_resp.msg
                # deflate
                if resp.headers.get('Content-encoding', '') == 'deflate':
                        gz = StringIO.StringIO(self.deflate(resp.read()))
                        resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
                        resp.msg = old_resp.msg
                return resp
393
394
class FileDownloader(object):
        """File Downloader class.

        File downloader objects are the ones responsible of downloading the
        actual video file and writing it to disk if the user has requested
        it, among some other tasks. In most cases there should be one per
        program. As, given a video URL, the downloader doesn't know how to
        extract all the needed information, task that InfoExtractors do, it
        has to pass the URL to one of them.

        For this, file downloader objects have a method that allows
        InfoExtractors to be registered in a given order. When it is passed
        a URL, the file downloader handles it to the first InfoExtractor it
        finds that reports being able to handle it. The InfoExtractor extracts
        all the information about the video or videos the URL refers to, and
        asks the FileDownloader to process the video information, possibly
        downloading the video.

        File downloaders accept a lot of parameters. In order not to saturate
        the object constructor with arguments, it receives a dictionary of
        options instead. These options are available through the params
        attribute for the InfoExtractors to use. The FileDownloader also
        registers itself as the downloader in charge for the InfoExtractors
        that are added to it, so this is a "mutual registration".

        Available options:

        username:         Username for authentication purposes.
        password:         Password for authentication purposes.
        usenetrc:         Use netrc for authentication instead.
        quiet:            Do not print messages to stdout.
        forceurl:         Force printing final URL.
        forcetitle:       Force printing title.
        forcethumbnail:   Force printing thumbnail URL.
        forcedescription: Force printing description.
        forcefilename:    Force printing final filename.
        simulate:         Do not download the video files.
        format:           Video format code.
        format_limit:     Highest quality format to try.
        outtmpl:          Template for output names.
        ignoreerrors:     Do not stop on download errors.
        ratelimit:        Download speed limit, in bytes/sec.
        nooverwrites:     Prevent overwriting files.
        retries:          Number of times to retry for HTTP error 5xx
        continuedl:       Try to continue downloads if possible.
        noprogress:       Do not print the progress bar.
        playliststart:    Playlist item to start at.
        playlistend:      Playlist item to end at.
        matchtitle:       Download only matching titles.
        rejecttitle:      Reject downloads for matching titles.
        logtostderr:      Log messages to stderr instead of stdout.
        consoletitle:     Display progress in console window's titlebar.
        nopart:           Do not use temporary .part files.
        updatetime:       Use the Last-modified header to set output file timestamps.
        writedescription: Write the video description to a .description file
        writeinfojson:    Write the video description to a .info.json file
        """

        # Class-level defaults only; real values are assigned per instance
        # in __init__().
        params = None           # options dictionary (see docstring above)
        _ies = []               # registered InfoExtractors, in order
        _pps = []               # registered PostProcessors, in order
        _download_retcode = None        # process return code (1 after trouble())
        _num_downloads = None   # ordinal used by the %(autonumber)s template
        _screen_file = None     # stream used by to_screen()
459
460         def __init__(self, params):
461                 """Create a FileDownloader object with the given options."""
462                 self._ies = []
463                 self._pps = []
464                 self._download_retcode = 0
465                 self._num_downloads = 0
466                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
467                 self.params = params
468
469         @staticmethod
470         def format_bytes(bytes):
471                 if bytes is None:
472                         return 'N/A'
473                 if type(bytes) is str:
474                         bytes = float(bytes)
475                 if bytes == 0.0:
476                         exponent = 0
477                 else:
478                         exponent = long(math.log(bytes, 1024.0))
479                 suffix = 'bkMGTPEZY'[exponent]
480                 converted = float(bytes) / float(1024 ** exponent)
481                 return '%.2f%s' % (converted, suffix)
482
483         @staticmethod
484         def calc_percent(byte_counter, data_len):
485                 if data_len is None:
486                         return '---.-%'
487                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
488
489         @staticmethod
490         def calc_eta(start, now, total, current):
491                 if total is None:
492                         return '--:--'
493                 dif = now - start
494                 if current == 0 or dif < 0.001: # One millisecond
495                         return '--:--'
496                 rate = float(current) / dif
497                 eta = long((float(total) - float(current)) / rate)
498                 (eta_mins, eta_secs) = divmod(eta, 60)
499                 if eta_mins > 99:
500                         return '--:--'
501                 return '%02d:%02d' % (eta_mins, eta_secs)
502
503         @staticmethod
504         def calc_speed(start, now, bytes):
505                 dif = now - start
506                 if bytes == 0 or dif < 0.001: # One millisecond
507                         return '%10s' % '---b/s'
508                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
509
510         @staticmethod
511         def best_block_size(elapsed_time, bytes):
512                 new_min = max(bytes / 2.0, 1.0)
513                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
514                 if elapsed_time < 0.001:
515                         return long(new_max)
516                 rate = bytes / elapsed_time
517                 if rate > new_max:
518                         return long(new_max)
519                 if rate < new_min:
520                         return long(new_min)
521                 return long(rate)
522
523         @staticmethod
524         def parse_bytes(bytestr):
525                 """Parse a string indicating a byte quantity into a long integer."""
526                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
527                 if matchobj is None:
528                         return None
529                 number = float(matchobj.group(1))
530                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
531                 return long(round(number * multiplier))
532
533         def add_info_extractor(self, ie):
534                 """Add an InfoExtractor object to the end of the list."""
535                 self._ies.append(ie)
536                 ie.set_downloader(self)
537
538         def add_post_processor(self, pp):
539                 """Add a PostProcessor object to the end of the chain."""
540                 self._pps.append(pp)
541                 pp.set_downloader(self)
542
        def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
                """Print message to stdout if not in quiet mode.

                skip_eol suppresses the trailing newline (used for progress
                lines rewritten with '\r'); ignore_encoding_errors silently
                drops messages the console encoding cannot represent.
                """
                try:
                        if not self.params.get('quiet', False):
                                # The trailing comma on the print statement stops it
                                # adding its own newline; terminator supplies one
                                # unless skip_eol was requested.
                                terminator = [u'\n', u''][skip_eol]
                                print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
                        self._screen_file.flush()
                except (UnicodeEncodeError), err:
                        if not ignore_encoding_errors:
                                raise
553
        def to_stderr(self, message):
                """Print message to stderr, encoded for the current locale."""
                print >>sys.stderr, message.encode(preferredencoding())
557
        def to_cons_title(self, message):
                """Set console/terminal window title to message."""
                if not self.params.get('consoletitle', False):
                        return
                if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
                        # c_wchar_p() might not be necessary if `message` is
                        # already of type unicode()
                        ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
                elif 'TERM' in os.environ:
                        # xterm-compatible escape sequence (OSC 0) that sets the
                        # window title, written to stderr to keep stdout clean.
                        sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
568
569         def fixed_template(self):
570                 """Checks if the output template is fixed."""
571                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
572
573         def trouble(self, message=None):
574                 """Determine action to take when a download problem appears.
575
576                 Depending on if the downloader has been configured to ignore
577                 download errors or not, this method may throw an exception or
578                 not when errors are found, after printing the message.
579                 """
580                 if message is not None:
581                         self.to_stderr(message)
582                 if not self.params.get('ignoreerrors', False):
583                         raise DownloadError(message)
584                 self._download_retcode = 1
585
586         def slow_down(self, start_time, byte_counter):
587                 """Sleep if the download speed is over the rate limit."""
588                 rate_limit = self.params.get('ratelimit', None)
589                 if rate_limit is None or byte_counter == 0:
590                         return
591                 now = time.time()
592                 elapsed = now - start_time
593                 if elapsed <= 0.0:
594                         return
595                 speed = float(byte_counter) / elapsed
596                 if speed > rate_limit:
597                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
598
599         def temp_name(self, filename):
600                 """Returns a temporary filename for the given filename."""
601                 if self.params.get('nopart', False) or filename == u'-' or \
602                                 (os.path.exists(filename) and not os.path.isfile(filename)):
603                         return filename
604                 return filename + u'.part'
605
606         def undo_temp_name(self, filename):
607                 if filename.endswith(u'.part'):
608                         return filename[:-len(u'.part')]
609                 return filename
610
611         def try_rename(self, old_filename, new_filename):
612                 try:
613                         if old_filename == new_filename:
614                                 return
615                         os.rename(old_filename, new_filename)
616                 except (IOError, OSError), err:
617                         self.trouble(u'ERROR: unable to rename file')
618
619         def try_utime(self, filename, last_modified_hdr):
620                 """Try to set the last-modified time of the given file."""
621                 if last_modified_hdr is None:
622                         return
623                 if not os.path.isfile(filename):
624                         return
625                 timestr = last_modified_hdr
626                 if timestr is None:
627                         return
628                 filetime = timeconvert(timestr)
629                 if filetime is None:
630                         return filetime
631                 try:
632                         os.utime(filename, (time.time(), filetime))
633                 except:
634                         pass
635                 return filetime
636
637         def report_writedescription(self, descfn):
638                 """ Report that the description file is being written """
639                 self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
640
641         def report_writeinfojson(self, infofn):
642                 """ Report that the metadata file has been written """
643                 self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
644
645         def report_destination(self, filename):
646                 """Report destination filename."""
647                 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
648
649         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
650                 """Report download progress."""
651                 if self.params.get('noprogress', False):
652                         return
653                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
654                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
655                 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
656                                 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
657
658         def report_resuming_byte(self, resume_len):
659                 """Report attempt to resume at given byte."""
660                 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
661
662         def report_retry(self, count, retries):
663                 """Report retry in case of HTTP error 5xx"""
664                 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
665
666         def report_file_already_downloaded(self, file_name):
667                 """Report file has already been fully downloaded."""
668                 try:
669                         self.to_screen(u'[download] %s has already been downloaded' % file_name)
670                 except (UnicodeEncodeError), err:
671                         self.to_screen(u'[download] The file has already been downloaded')
672
673         def report_unable_to_resume(self):
674                 """Report it was impossible to resume download."""
675                 self.to_screen(u'[download] Unable to resume')
676
677         def report_finish(self):
678                 """Report download finished."""
679                 if self.params.get('noprogress', False):
680                         self.to_screen(u'[download] Download completed')
681                 else:
682                         self.to_screen(u'')
683
684         def increment_downloads(self):
685                 """Increment the ordinal that assigns a number to each file."""
686                 self._num_downloads += 1
687
688         def prepare_filename(self, info_dict):
689                 """Generate the output filename."""
690                 try:
691                         template_dict = dict(info_dict)
692                         template_dict['epoch'] = unicode(long(time.time()))
693                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
694                         filename = self.params['outtmpl'] % template_dict
695                         return filename
696                 except (ValueError, KeyError), err:
697                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
698                         return None
699
        def process_info(self, info_dict):
                """Process a single dictionary returned by an InfoExtractor.

                In order: forced printings, simulate-mode early exit,
                title match/reject filters, overwrite check, side files
                (.description / .info.json), the actual download and the
                postprocessing chain.
                """
                filename = self.prepare_filename(info_dict)

                # Forced printings (--get-title, --get-url, ...); these happen
                # even in simulate mode.
                if self.params.get('forcetitle', False):
                        print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
                if self.params.get('forceurl', False):
                        print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
                if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
                        print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
                if self.params.get('forcedescription', False) and 'description' in info_dict:
                        print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
                if self.params.get('forcefilename', False) and filename is not None:
                        print filename.encode(preferredencoding(), 'xmlcharrefreplace')
                if self.params.get('forceformat', False):
                        print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')

                # Do nothing else if in simulate mode
                if self.params.get('simulate', False):
                        return

                # prepare_filename() already reported the trouble.
                if filename is None:
                        return

                # Apply --match-title / --reject-title filters against the
                # console-encoded title.
                matchtitle=self.params.get('matchtitle',False)
                rejecttitle=self.params.get('rejecttitle',False)
                title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
                if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
                        self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
                        return
                if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
                        self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
                        return

                if self.params.get('nooverwrites', False) and os.path.exists(filename):
                        self.to_stderr(u'WARNING: file exists and will be skipped')
                        return

                # Create the destination directory when needed.
                try:
                        dn = os.path.dirname(filename)
                        if dn != '' and not os.path.exists(dn):
                                os.makedirs(dn)
                except (OSError, IOError), err:
                        self.trouble(u'ERROR: unable to create directory ' + unicode(err))
                        return

                # Optionally write the description next to the video file.
                if self.params.get('writedescription', False):
                        try:
                                descfn = filename + '.description'
                                self.report_writedescription(descfn)
                                descfile = open(descfn, 'wb')
                                try:
                                        descfile.write(info_dict['description'].encode('utf-8'))
                                finally:
                                        descfile.close()
                        except (OSError, IOError):
                                self.trouble(u'ERROR: Cannot write description file ' + descfn)
                                return

                # Optionally dump the metadata as JSON.
                if self.params.get('writeinfojson', False):
                        infofn = filename + '.info.json'
                        self.report_writeinfojson(infofn)
                        # Probe for a usable json module (may be missing on old
                        # Python versions).
                        try:
                                json.dump
                        except (NameError,AttributeError):
                                self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
                                return
                        try:
                                infof = open(infofn, 'wb')
                                try:
                                        # 'urlhandle' is a live file-like object and
                                        # is not JSON-serializable; drop it.
                                        json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
                                        json.dump(json_info_dict, infof)
                                finally:
                                        infof.close()
                        except (OSError, IOError):
                                self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
                                return

                if not self.params.get('skip_download', False):
                        try:
                                success = self._do_download(filename, info_dict)
                        except (OSError, IOError), err:
                                raise UnavailableVideoError
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                                self.trouble(u'ERROR: unable to download video data: %s' % str(err))
                                return
                        except (ContentTooShortError, ), err:
                                self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
                                return

                        if success:
                                try:
                                        self.post_process(filename, info_dict)
                                except (PostProcessingError), err:
                                        self.trouble(u'ERROR: postprocessing: %s' % str(err))
                                        return
797
798         def download(self, url_list):
799                 """Download a given list of URLs."""
800                 if len(url_list) > 1 and self.fixed_template():
801                         raise SameFileError(self.params['outtmpl'])
802
803                 for url in url_list:
804                         suitable_found = False
805                         for ie in self._ies:
806                                 # Go to next InfoExtractor if not suitable
807                                 if not ie.suitable(url):
808                                         continue
809
810                                 # Suitable InfoExtractor found
811                                 suitable_found = True
812
813                                 # Extract information from URL and process it
814                                 ie.extract(url)
815
816                                 # Suitable InfoExtractor had been found; go to next URL
817                                 break
818
819                         if not suitable_found:
820                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
821
822                 return self._download_retcode
823
824         def post_process(self, filename, ie_info):
825                 """Run the postprocessing chain on the given file."""
826                 info = dict(ie_info)
827                 info['filepath'] = filename
828                 for pp in self._pps:
829                         info = pp.run(info)
830                         if info is None:
831                                 break
832
        def _download_with_rtmpdump(self, filename, url, player_url):
                """Download an RTMP stream to *filename* by shelling out to rtmpdump.

                Returns True on success, False on failure (rtmpdump missing or
                exiting with a non-zero, non-resumable status).
                """
                self.report_destination(filename)
                tmpfilename = self.temp_name(filename)

                # Check for rtmpdump first
                try:
                        subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
                except (OSError, IOError):
                        self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
                        return False

                # Download using rtmpdump. rtmpdump returns exit code 2 when
                # the connection was interrumpted and resuming appears to be
                # possible. This is part of rtmpdump's normal usage, AFAIK.
                # The [..., ...][condition] construct below selects the second
                # list when the condition is true (a pre-ternary idiom).
                basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
                retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
                while retval == 2 or retval == 1:
                        prevsize = os.path.getsize(tmpfilename)
                        self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
                        time.sleep(5.0) # This seems to be needed
                        retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
                        cursize = os.path.getsize(tmpfilename)
                        # No progress was made and rtmpdump does not claim the
                        # stream is resumable: give up.
                        if prevsize == cursize and retval == 1:
                                break
                        # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
                        if prevsize == cursize and retval == 2 and cursize > 1024:
                                self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
                                retval = 0
                                break
                if retval == 0:
                        self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
                        self.try_rename(tmpfilename, filename)
                        return True
                else:
                        self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
                        return False
869
870         def _do_download(self, filename, info_dict):
871                 url = info_dict['url']
872                 player_url = info_dict.get('player_url', None)
873
874                 # Check file already present
875                 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
876                         self.report_file_already_downloaded(filename)
877                         return True
878
879                 # Attempt to download using rtmpdump
880                 if url.startswith('rtmp'):
881                         return self._download_with_rtmpdump(filename, url, player_url)
882
883                 tmpfilename = self.temp_name(filename)
884                 stream = None
885
886                 # Do not include the Accept-Encoding header
887                 headers = {'Youtubedl-no-compression': 'True'}
888                 basic_request = urllib2.Request(url, None, headers)
889                 request = urllib2.Request(url, None, headers)
890
891                 # Establish possible resume length
892                 if os.path.isfile(tmpfilename):
893                         resume_len = os.path.getsize(tmpfilename)
894                 else:
895                         resume_len = 0
896
897                 open_mode = 'wb'
898                 if resume_len != 0:
899                         if self.params.get('continuedl', False):
900                                 self.report_resuming_byte(resume_len)
901                                 request.add_header('Range','bytes=%d-' % resume_len)
902                                 open_mode = 'ab'
903                         else:
904                                 resume_len = 0
905
906                 count = 0
907                 retries = self.params.get('retries', 0)
908                 while count <= retries:
909                         # Establish connection
910                         try:
911                                 if count == 0 and 'urlhandle' in info_dict:
912                                         data = info_dict['urlhandle']
913                                 data = urllib2.urlopen(request)
914                                 break
915                         except (urllib2.HTTPError, ), err:
916                                 if (err.code < 500 or err.code >= 600) and err.code != 416:
917                                         # Unexpected HTTP error
918                                         raise
919                                 elif err.code == 416:
920                                         # Unable to resume (requested range not satisfiable)
921                                         try:
922                                                 # Open the connection again without the range header
923                                                 data = urllib2.urlopen(basic_request)
924                                                 content_length = data.info()['Content-Length']
925                                         except (urllib2.HTTPError, ), err:
926                                                 if err.code < 500 or err.code >= 600:
927                                                         raise
928                                         else:
929                                                 # Examine the reported length
930                                                 if (content_length is not None and
931                                                                 (resume_len - 100 < long(content_length) < resume_len + 100)):
932                                                         # The file had already been fully downloaded.
933                                                         # Explanation to the above condition: in issue #175 it was revealed that
934                                                         # YouTube sometimes adds or removes a few bytes from the end of the file,
935                                                         # changing the file size slightly and causing problems for some users. So
936                                                         # I decided to implement a suggested change and consider the file
937                                                         # completely downloaded if the file size differs less than 100 bytes from
938                                                         # the one in the hard drive.
939                                                         self.report_file_already_downloaded(filename)
940                                                         self.try_rename(tmpfilename, filename)
941                                                         return True
942                                                 else:
943                                                         # The length does not match, we start the download over
944                                                         self.report_unable_to_resume()
945                                                         open_mode = 'wb'
946                                                         break
947                         # Retry
948                         count += 1
949                         if count <= retries:
950                                 self.report_retry(count, retries)
951
952                 if count > retries:
953                         self.trouble(u'ERROR: giving up after %s retries' % retries)
954                         return False
955
956                 data_len = data.info().get('Content-length', None)
957                 if data_len is not None:
958                         data_len = long(data_len) + resume_len
959                 data_len_str = self.format_bytes(data_len)
960                 byte_counter = 0 + resume_len
961                 block_size = 1024
962                 start = time.time()
963                 while True:
964                         # Download and write
965                         before = time.time()
966                         data_block = data.read(block_size)
967                         after = time.time()
968                         if len(data_block) == 0:
969                                 break
970                         byte_counter += len(data_block)
971
972                         # Open file just in time
973                         if stream is None:
974                                 try:
975                                         (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
976                                         assert stream is not None
977                                         filename = self.undo_temp_name(tmpfilename)
978                                         self.report_destination(filename)
979                                 except (OSError, IOError), err:
980                                         self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
981                                         return False
982                         try:
983                                 stream.write(data_block)
984                         except (IOError, OSError), err:
985                                 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
986                                 return False
987                         block_size = self.best_block_size(after - before, len(data_block))
988
989                         # Progress message
990                         speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
991                         if data_len is None:
992                                 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
993                         else:
994                                 percent_str = self.calc_percent(byte_counter, data_len)
995                                 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
996                                 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
997
998                         # Apply rate limit
999                         self.slow_down(start, byte_counter - resume_len)
1000
1001                 if stream is None:
1002                         self.trouble(u'\nERROR: Did not get any data blocks')
1003                         return False
1004                 stream.close()
1005                 self.report_finish()
1006                 if data_len is not None and byte_counter != data_len:
1007                         raise ContentTooShortError(byte_counter, long(data_len))
1008                 self.try_rename(tmpfilename, filename)
1009
1010                 # Update file modification time
1011                 if self.params.get('updatetime', True):
1012                         info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1013
1014                 return True
1015
1016
class InfoExtractor(object):
        """Base class for information extractors.

        An information extractor receives a URL and extracts information
        about the video (or videos) it refers to: the real video URL, the
        title, a simplified title, the uploader and more. The result is a
        dictionary handed to the FileDownloader, which processes it —
        possibly downloading the video to the file system, among other
        possible outcomes. Each dictionary must include these fields:

        id:             Video identifier.
        url:            Final video URL.
        uploader:       Nickname of the video uploader.
        title:          Literal title.
        stitle:         Simplified title.
        ext:            Video filename extension.
        format:         Video format.
        player_url:     SWF Player URL (may be None).

        The following fields are optional. Their primary purpose is to allow
        youtube-dl to serve as the backend for a video search function, such
        as the one in youtube2mp3. They are only used when their respective
        forced printing functions are called:

        thumbnail:      Full URL to a video thumbnail image.
        description:    One-line video description.

        Subclasses should redefine the _real_initialize() and
        _real_extract() methods, define a _VALID_URL regexp, and usually be
        added to the list of extractors.
        """

        # Becomes True once _real_initialize() has run.
        _ready = False
        # FileDownloader this extractor reports to (may be None).
        _downloader = None

        def __init__(self, downloader=None):
                """Constructor. Receives an optional downloader."""
                self._ready = False
                self.set_downloader(downloader)

        def suitable(self, url):
                """Receives a URL and returns True if suitable for this IE."""
                match = re.match(self._VALID_URL, url)
                return match is not None

        def initialize(self):
                """Initializes an instance (authentication, etc) exactly once."""
                if self._ready:
                        return
                self._real_initialize()
                self._ready = True

        def extract(self, url):
                """Extracts URL information and returns it in list of dicts."""
                self.initialize()
                return self._real_extract(url)

        def set_downloader(self, downloader):
                """Sets the downloader for this IE."""
                self._downloader = downloader

        def _real_initialize(self):
                """Real initialization process. Redefine in subclasses."""
                pass

        def _real_extract(self, url):
                """Real extraction process. Redefine in subclasses."""
                pass
1085
1086
class YoutubeIE(InfoExtractor):
        """Information extractor for youtube.com."""

        # Matches watch pages, youtu.be short links and /v/, /embed/, /e/
        # URLs; group 2 captures the video id.
        _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
        # Endpoints used by _real_initialize (language, login, age gate).
        _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
        _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
        _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
        # Machine name looked up in ~/.netrc when --netrc is given.
        _NETRC_MACHINE = 'youtube'
        # Listed in order of quality
        _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
        # Maps format code -> container extension; codes missing here are
        # treated as 'flv' elsewhere.
        _video_extensions = {
                '13': '3gp',
                '17': 'mp4',
                '18': 'mp4',
                '22': 'mp4',
                '37': 'mp4',
                '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
                '43': 'webm',
                '44': 'webm',
                '45': 'webm',
        }
        # Maps format code -> 'HEIGHTxWIDTH' string for --list-formats.
        _video_dimensions = {
                '5': '240x400',
                '6': '???',
                '13': '???',
                '17': '144x176',
                '18': '360x640',
                '22': '720x1280',
                '34': '360x640',
                '35': '480x854',
                '37': '1080x1920',
                '38': '3072x4096',
                '43': '360x640',
                '44': '480x854',
                '45': '720x1280',
        }
        IE_NAME = u'youtube'
1124
1125         def report_lang(self):
1126                 """Report attempt to set language."""
1127                 self._downloader.to_screen(u'[youtube] Setting language')
1128
1129         def report_login(self):
1130                 """Report attempt to log in."""
1131                 self._downloader.to_screen(u'[youtube] Logging in')
1132
1133         def report_age_confirmation(self):
1134                 """Report attempt to confirm age."""
1135                 self._downloader.to_screen(u'[youtube] Confirming age')
1136
1137         def report_video_webpage_download(self, video_id):
1138                 """Report attempt to download video webpage."""
1139                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1140
1141         def report_video_info_webpage_download(self, video_id):
1142                 """Report attempt to download video info webpage."""
1143                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1144
1145         def report_information_extraction(self, video_id):
1146                 """Report attempt to extract video information."""
1147                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1148
1149         def report_unavailable_format(self, video_id, format):
1150                 """Report extracted video URL."""
1151                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1152
1153         def report_rtmp_download(self):
1154                 """Indicate the download will use the RTMP protocol."""
1155                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1156
1157         def _print_formats(self, formats):
1158                 print 'Available formats:'
1159                 for x in formats:
1160                         print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1161
        def _real_initialize(self):
                """Set the YouTube language and, if credentials exist, log in and confirm age.

                Credentials come from --username/--password or, with --netrc,
                from the 'youtube' machine entry in ~/.netrc. Each network
                step aborts initialization on failure (warnings for language
                and login, trouble for age confirmation).
                """
                if self._downloader is None:
                        return

                username = None
                password = None
                downloader_params = self._downloader.params

                # Attempt to use provided username and password or .netrc data
                if downloader_params.get('username', None) is not None:
                        username = downloader_params['username']
                        password = downloader_params['password']
                elif downloader_params.get('usenetrc', False):
                        try:
                                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                                if info is not None:
                                        username = info[0]
                                        password = info[2]
                                else:
                                        raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
                        except (IOError, netrc.NetrcParseError), err:
                                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                                return

                # Set language (the URL requests hl=en pages)
                request = urllib2.Request(self._LANG_URL)
                try:
                        self.report_lang()
                        urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
                        return

                # No authentication to be performed
                if username is None:
                        return

                # Log in
                login_form = {
                                'current_form': 'loginForm',
                                'next':         '/',
                                'action_login': 'Log In',
                                'username':     username,
                                'password':     password,
                                }
                request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
                try:
                        self.report_login()
                        login_results = urllib2.urlopen(request).read()
                        # If the login form is served again, authentication failed.
                        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                                return
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
                        return

                # Confirm age
                age_form = {
                                'next_url':             '/',
                                'action_confirm':       'Confirm',
                                }
                request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
                try:
                        self.report_age_confirmation()
                        age_results = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
                        return
1230
1231         def _real_extract(self, url):
1232                 # Extract video id from URL
1233                 mobj = re.match(self._VALID_URL, url)
1234                 if mobj is None:
1235                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1236                         return
1237                 video_id = mobj.group(2)
1238
1239                 # Get video webpage
1240                 self.report_video_webpage_download(video_id)
1241                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1242                 try:
1243                         video_webpage = urllib2.urlopen(request).read()
1244                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1245                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1246                         return
1247
1248                 # Attempt to extract SWF player URL
1249                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1250                 if mobj is not None:
1251                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1252                 else:
1253                         player_url = None
1254
1255                 # Get video info
1256                 self.report_video_info_webpage_download(video_id)
1257                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1258                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1259                                         % (video_id, el_type))
1260                         request = urllib2.Request(video_info_url)
1261                         try:
1262                                 video_info_webpage = urllib2.urlopen(request).read()
1263                                 video_info = parse_qs(video_info_webpage)
1264                                 if 'token' in video_info:
1265                                         break
1266                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1267                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1268                                 return
1269                 if 'token' not in video_info:
1270                         if 'reason' in video_info:
1271                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1272                         else:
1273                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1274                         return
1275
1276                 # Start extracting information
1277                 self.report_information_extraction(video_id)
1278
1279                 # uploader
1280                 if 'author' not in video_info:
1281                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1282                         return
1283                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1284
1285                 # title
1286                 if 'title' not in video_info:
1287                         self._downloader.trouble(u'ERROR: unable to extract video title')
1288                         return
1289                 video_title = urllib.unquote_plus(video_info['title'][0])
1290                 video_title = video_title.decode('utf-8')
1291                 video_title = sanitize_title(video_title)
1292
1293                 # simplified title
1294                 simple_title = _simplify_title(video_title)
1295
1296                 # thumbnail image
1297                 if 'thumbnail_url' not in video_info:
1298                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1299                         video_thumbnail = ''
1300                 else:   # don't panic if we can't find it
1301                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1302
1303                 # upload date
1304                 upload_date = u'NA'
1305                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1306                 if mobj is not None:
1307                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1308                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1309                         for expression in format_expressions:
1310                                 try:
1311                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1312                                 except:
1313                                         pass
1314
1315                 # description
1316                 try:
1317                         lxml.etree
1318                 except NameError:
1319                         video_description = u'No description available.'
1320                         if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1321                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1322                                 if mobj is not None:
1323                                         video_description = mobj.group(1).decode('utf-8')
1324                 else:
1325                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1326                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1327                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1328                         # TODO use another parser
1329
1330                 # token
1331                 video_token = urllib.unquote_plus(video_info['token'][0])
1332
1333                 # Decide which formats to download
1334                 req_format = self._downloader.params.get('format', None)
1335
1336                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1337                         self.report_rtmp_download()
1338                         video_url_list = [(None, video_info['conn'][0])]
1339                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1340                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1341                         url_data = [parse_qs(uds) for uds in url_data_strs]
1342                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1343                         url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1344
1345                         format_limit = self._downloader.params.get('format_limit', None)
1346                         if format_limit is not None and format_limit in self._available_formats:
1347                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1348                         else:
1349                                 format_list = self._available_formats
1350                         existing_formats = [x for x in format_list if x in url_map]
1351                         if len(existing_formats) == 0:
1352                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1353                                 return
1354                         if self._downloader.params.get('listformats', None):
1355                                 self._print_formats(existing_formats)
1356                                 return
1357                         if req_format is None or req_format == 'best':
1358                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1359                         elif req_format == 'worst':
1360                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1361                         elif req_format in ('-1', 'all'):
1362                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1363                         else:
1364                                 # Specific formats. We pick the first in a slash-delimeted sequence.
1365                                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1366                                 req_formats = req_format.split('/')
1367                                 video_url_list = None
1368                                 for rf in req_formats:
1369                                         if rf in url_map:
1370                                                 video_url_list = [(rf, url_map[rf])]
1371                                                 break
1372                                 if video_url_list is None:
1373                                         self._downloader.trouble(u'ERROR: requested format not available')
1374                                         return
1375                 else:
1376                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1377                         return
1378
1379                 for format_param, video_real_url in video_url_list:
1380                         # At this point we have a new video
1381                         self._downloader.increment_downloads()
1382
1383                         # Extension
1384                         video_extension = self._video_extensions.get(format_param, 'flv')
1385
1386                         try:
1387                                 # Process video information
1388                                 self._downloader.process_info({
1389                                         'id':           video_id.decode('utf-8'),
1390                                         'url':          video_real_url.decode('utf-8'),
1391                                         'uploader':     video_uploader.decode('utf-8'),
1392                                         'upload_date':  upload_date,
1393                                         'title':        video_title,
1394                                         'stitle':       simple_title,
1395                                         'ext':          video_extension.decode('utf-8'),
1396                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1397                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1398                                         'description':  video_description,
1399                                         'player_url':   player_url,
1400                                 })
1401                         except UnavailableVideoError, err:
1402                                 self._downloader.trouble(u'\nERROR: unable to download video')
1403
1404
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com."""

	# group(1): video id (may be 'yt-<id>' for YouTube-hosted clips);
	# group(2): the URL's simplified-title slug.
	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	# Family-filter disclaimer page fetched during initialization.
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
	# Endpoint that receives the age-confirmation form POST.
	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
	_youtube_ie = None  # YoutubeIE instance; 'yt-...' ids are delegated to it
	IE_NAME = u'metacafe'

	def __init__(self, youtube_ie, downloader=None):
		# youtube_ie: extractor used for metacafe videos actually hosted on YouTube.
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	def report_disclaimer(self):
		"""Report disclaimer retrieval."""
		self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[metacafe] Confirming age')

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

	def _real_initialize(self):
		"""Fetch the disclaimer page and POST the age-confirmation form so
		subsequent watch-page requests are not blocked by the family filter."""
		# Retrieve disclaimer
		request = urllib2.Request(self._DISCLAIMER)
		try:
			self.report_disclaimer()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
			return

		# Confirm age
		disclaimer_form = {
			'filters': '0',
			'submit': "Continue - I'm over 18",
			}
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
		try:
			self.report_age_confirmation()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Scrape the watch page for the media URL, title and uploader and
		hand the info dict to the downloader. YouTube-hosted ids ('yt-...')
		are delegated to the YouTube extractor instead."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		# Check if video comes from YouTube
		mobj2 = re.match(r'^yt-(.*)$', video_id)
		if mobj2 is not None:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
			return

		# At this point we have a new video
		self._downloader.increment_downloads()

		simple_title = mobj.group(2).decode('utf-8')

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
		if mobj is not None:
			mediaURL = urllib.unquote(mobj.group(1))
			# Extension is assumed to be the last three characters of the media URL.
			video_extension = mediaURL[-3:]

			# Extract gdaKey if available
			mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
			if mobj is None:
				video_url = mediaURL
			else:
				gdaKey = mobj.group(1)
				video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
		else:
			# Fallback: pull media URL and key out of the flashvars blob.
			mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			vardict = parse_qs(mobj.group(1))
			if 'mediaData' not in vardict:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			# The URL is JSON-escaped; undo the escaped forward slashes.
			mediaURL = mobj.group(1).replace('\\/', '/')
			video_extension = mediaURL[-3:]
			video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

		mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1545
1546
class DailymotionIE(InfoExtractor):
	"""Information Extractor for Dailymotion"""

	# group(1): video id (before the first underscore); group(2): title slug.
	_VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
	IE_NAME = u'dailymotion'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

	def _real_extract(self, url):
		"""Scrape the Dailymotion watch page for the SD media URL, title and
		uploader, then hand the info dict to the downloader."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		simple_title = mobj.group(2).decode('utf-8')
		video_extension = 'flv'

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url)
		# Disable the family filter so age-restricted pages still render.
		request.add_header('Cookie', 'family_filter=off')
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		# The player config is passed URL-encoded via addVariable("sequence", ...).
		mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		sequence = urllib.unquote(mobj.group(1))
		mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		# Strip JSON backslash escapes from the URL.
		mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

		# if needed add http://www.dailymotion.com/ if relative URL

		video_url = mediaURL

		mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1633
1634
class GoogleIE(InfoExtractor):
	"""Information extractor for video.google.com."""

	# Matches videoplay URLs on the various national Google Video domains;
	# group(1) is the docid.
	_VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
	IE_NAME = u'video.google'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

	def _real_extract(self, url):
		"""Scrape the videoplay page for the media URL, title and description,
		then hand the info dict to the downloader."""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		video_extension = 'mp4'

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r"download_url:'([^']+)'", webpage)
		if mobj is None:
			# No direct mp4 download link; fall back to the flv stream URL.
			video_extension = 'flv'
			mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1))
		# Undo the JS hex escapes ('=' and '&') embedded in the URL.
		mediaURL = mediaURL.replace('\\x3d', '\x3d')
		mediaURL = mediaURL.replace('\\x26', '\x26')

		video_url = mediaURL

		mobj = re.search(r'<title>(.*)</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		simple_title = _simplify_title(video_title)

		# Extract video description
		mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video description')
			return
		video_description = mobj.group(1).decode('utf-8')
		if not video_description:
			video_description = 'No description available.'

		# Extract video thumbnail: only fetched on demand, via a search-results
		# page queried with the (absolute) numeric docid.
		if self._downloader.params.get('forcethumbnail', False):
			request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
			try:
				webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
				return
			mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
				return
			video_thumbnail = mobj.group(1)
		else:	# we need something to pass to process_info
			video_thumbnail = ''

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	u'NA',
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1737
1738
1739 class PhotobucketIE(InfoExtractor):
1740         """Information extractor for photobucket.com."""
1741
1742         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1743         IE_NAME = u'photobucket'
1744
1745         def __init__(self, downloader=None):
1746                 InfoExtractor.__init__(self, downloader)
1747
1748         def report_download_webpage(self, video_id):
1749                 """Report webpage download."""
1750                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1751
1752         def report_extraction(self, video_id):
1753                 """Report information extraction."""
1754                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1755
1756         def _real_extract(self, url):
1757                 # Extract id from URL
1758                 mobj = re.match(self._VALID_URL, url)
1759                 if mobj is None:
1760                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1761                         return
1762
1763                 # At this point we have a new video
1764                 self._downloader.increment_downloads()
1765                 video_id = mobj.group(1)
1766
1767                 video_extension = 'flv'
1768
1769                 # Retrieve video webpage to extract further information
1770                 request = urllib2.Request(url)
1771                 try:
1772                         self.report_download_webpage(video_id)
1773                         webpage = urllib2.urlopen(request).read()
1774                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1775                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1776                         return
1777
1778                 # Extract URL, uploader, and title from webpage
1779                 self.report_extraction(video_id)
1780                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1781                 if mobj is None:
1782                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1783                         return
1784                 mediaURL = urllib.unquote(mobj.group(1))
1785
1786                 video_url = mediaURL
1787
1788                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1789                 if mobj is None:
1790                         self._downloader.trouble(u'ERROR: unable to extract title')
1791                         return
1792                 video_title = mobj.group(1).decode('utf-8')
1793                 video_title = sanitize_title(video_title)
1794                 simple_title = _simplify_title(vide_title)
1795
1796                 video_uploader = mobj.group(2).decode('utf-8')
1797
1798                 try:
1799                         # Process video information
1800                         self._downloader.process_info({
1801                                 'id':           video_id.decode('utf-8'),
1802                                 'url':          video_url.decode('utf-8'),
1803                                 'uploader':     video_uploader,
1804                                 'upload_date':  u'NA',
1805                                 'title':        video_title,
1806                                 'stitle':       simple_title,
1807                                 'ext':          video_extension.decode('utf-8'),
1808                                 'format':       u'NA',
1809                                 'player_url':   None,
1810                         })
1811                 except UnavailableVideoError:
1812                         self._downloader.trouble(u'\nERROR: unable to download video')
1813
1814
1815 class YahooIE(InfoExtractor):
1816         """Information extractor for video.yahoo.com."""
1817
1818         # _VALID_URL matches all Yahoo! Video URLs
1819         # _VPAGE_URL matches only the extractable '/watch/' URLs
1820         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1821         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1822         IE_NAME = u'video.yahoo'
1823
1824         def __init__(self, downloader=None):
1825                 InfoExtractor.__init__(self, downloader)
1826
1827         def report_download_webpage(self, video_id):
1828                 """Report webpage download."""
1829                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1830
1831         def report_extraction(self, video_id):
1832                 """Report information extraction."""
1833                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1834
1835         def _real_extract(self, url, new_video=True):
1836                 # Extract ID from URL
1837                 mobj = re.match(self._VALID_URL, url)
1838                 if mobj is None:
1839                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1840                         return
1841
1842                 # At this point we have a new video
1843                 self._downloader.increment_downloads()
1844                 video_id = mobj.group(2)
1845                 video_extension = 'flv'
1846
1847                 # Rewrite valid but non-extractable URLs as
1848                 # extractable English language /watch/ URLs
1849                 if re.match(self._VPAGE_URL, url) is None:
1850                         request = urllib2.Request(url)
1851                         try:
1852                                 webpage = urllib2.urlopen(request).read()
1853                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1854                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1855                                 return
1856
1857                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1858                         if mobj is None:
1859                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1860                                 return
1861                         yahoo_id = mobj.group(1)
1862
1863                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1864                         if mobj is None:
1865                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1866                                 return
1867                         yahoo_vid = mobj.group(1)
1868
1869                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1870                         return self._real_extract(url, new_video=False)
1871
1872                 # Retrieve video webpage to extract further information
1873                 request = urllib2.Request(url)
1874                 try:
1875                         self.report_download_webpage(video_id)
1876                         webpage = urllib2.urlopen(request).read()
1877                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1878                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1879                         return
1880
1881                 # Extract uploader and title from webpage
1882                 self.report_extraction(video_id)
1883                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1884                 if mobj is None:
1885                         self._downloader.trouble(u'ERROR: unable to extract video title')
1886                         return
1887                 video_title = mobj.group(1).decode('utf-8')
1888                 simple_title = _simplify_title(video_title)
1889
1890                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1891                 if mobj is None:
1892                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1893                         return
1894                 video_uploader = mobj.group(1).decode('utf-8')
1895
1896                 # Extract video thumbnail
1897                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1898                 if mobj is None:
1899                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1900                         return
1901                 video_thumbnail = mobj.group(1).decode('utf-8')
1902
1903                 # Extract video description
1904                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1905                 if mobj is None:
1906                         self._downloader.trouble(u'ERROR: unable to extract video description')
1907                         return
1908                 video_description = mobj.group(1).decode('utf-8')
1909                 if not video_description:
1910                         video_description = 'No description available.'
1911
1912                 # Extract video height and width
1913                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1914                 if mobj is None:
1915                         self._downloader.trouble(u'ERROR: unable to extract video height')
1916                         return
1917                 yv_video_height = mobj.group(1)
1918
1919                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1920                 if mobj is None:
1921                         self._downloader.trouble(u'ERROR: unable to extract video width')
1922                         return
1923                 yv_video_width = mobj.group(1)
1924
1925                 # Retrieve video playlist to extract media URL
1926                 # I'm not completely sure what all these options are, but we
1927                 # seem to need most of them, otherwise the server sends a 401.
1928                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1929                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1930                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1931                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1932                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1933                 try:
1934                         self.report_download_webpage(video_id)
1935                         webpage = urllib2.urlopen(request).read()
1936                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1937                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1938                         return
1939
1940                 # Extract media URL from playlist XML
1941                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1942                 if mobj is None:
1943                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1944                         return
1945                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1946                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1947
1948                 try:
1949                         # Process video information
1950                         self._downloader.process_info({
1951                                 'id':           video_id.decode('utf-8'),
1952                                 'url':          video_url,
1953                                 'uploader':     video_uploader,
1954                                 'upload_date':  u'NA',
1955                                 'title':        video_title,
1956                                 'stitle':       simple_title,
1957                                 'ext':          video_extension.decode('utf-8'),
1958                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1959                                 'description':  video_description,
1960                                 'thumbnail':    video_thumbnail,
1961                                 'player_url':   None,
1962                         })
1963                 except UnavailableVideoError:
1964                         self._downloader.trouble(u'\nERROR: unable to download video')
1965
1966
1967 class VimeoIE(InfoExtractor):
1968         """Information extractor for vimeo.com."""
1969
1970         # _VALID_URL matches Vimeo URLs
1971         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1972         IE_NAME = u'vimeo'
1973
1974         def __init__(self, downloader=None):
1975                 InfoExtractor.__init__(self, downloader)
1976
1977         def report_download_webpage(self, video_id):
1978                 """Report webpage download."""
1979                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1980
1981         def report_extraction(self, video_id):
1982                 """Report information extraction."""
1983                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1984
1985         def _real_extract(self, url, new_video=True):
1986                 # Extract ID from URL
1987                 mobj = re.match(self._VALID_URL, url)
1988                 if mobj is None:
1989                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1990                         return
1991
1992                 # At this point we have a new video
1993                 self._downloader.increment_downloads()
1994                 video_id = mobj.group(1)
1995
1996                 # Retrieve video webpage to extract further information
1997                 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1998                 try:
1999                         self.report_download_webpage(video_id)
2000                         webpage = urllib2.urlopen(request).read()
2001                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2002                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2003                         return
2004
2005                 # Now we begin extracting as much information as we can from what we
2006                 # retrieved. First we extract the information common to all extractors,
2007                 # and latter we extract those that are Vimeo specific.
2008                 self.report_extraction(video_id)
2009
2010                 # Extract title
2011                 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2012                 if mobj is None:
2013                         self._downloader.trouble(u'ERROR: unable to extract video title')
2014                         return
2015                 video_title = mobj.group(1).decode('utf-8')
2016                 simple_title = _simple_title(video_title)
2017
2018                 # Extract uploader
2019                 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2020                 if mobj is None:
2021                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
2022                         return
2023                 video_uploader = mobj.group(1).decode('utf-8')
2024
2025                 # Extract video thumbnail
2026                 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2027                 if mobj is None:
2028                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2029                         return
2030                 video_thumbnail = mobj.group(1).decode('utf-8')
2031
2032                 # # Extract video description
2033                 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2034                 # if mobj is None:
2035                 #       self._downloader.trouble(u'ERROR: unable to extract video description')
2036                 #       return
2037                 # video_description = mobj.group(1).decode('utf-8')
2038                 # if not video_description: video_description = 'No description available.'
2039                 video_description = 'Foo.'
2040
2041                 # Vimeo specific: extract request signature
2042                 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2043                 if mobj is None:
2044                         self._downloader.trouble(u'ERROR: unable to extract request signature')
2045                         return
2046                 sig = mobj.group(1).decode('utf-8')
2047
2048                 # Vimeo specific: extract video quality information
2049                 mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)
2050                 if mobj is None:
2051                         self._downloader.trouble(u'ERROR: unable to extract video quality information')
2052                         return
2053                 quality = mobj.group(1).decode('utf-8')
2054
2055                 if int(quality) == 1:
2056                         quality = 'hd'
2057                 else:
2058                         quality = 'sd'
2059
2060                 # Vimeo specific: Extract request signature expiration
2061                 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2062                 if mobj is None:
2063                         self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2064                         return
2065                 sig_exp = mobj.group(1).decode('utf-8')
2066
2067                 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)
2068
2069                 try:
2070                         # Process video information
2071                         self._downloader.process_info({
2072                                 'id':           video_id.decode('utf-8'),
2073                                 'url':          video_url,
2074                                 'uploader':     video_uploader,
2075                                 'upload_date':  u'NA',
2076                                 'title':        video_title,
2077                                 'stitle':       simple_title,
2078                                 'ext':          u'mp4',
2079                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2080                                 'description':  video_description,
2081                                 'thumbnail':    video_thumbnail,
2082                                 'description':  video_description,
2083                                 'player_url':   None,
2084                         })
2085                 except UnavailableVideoError:
2086                         self._downloader.trouble(u'ERROR: unable to download video')
2087
2088
2089 class GenericIE(InfoExtractor):
2090         """Generic last-resort information extractor."""
2091
2092         _VALID_URL = r'.*'
2093         IE_NAME = u'generic'
2094
2095         def __init__(self, downloader=None):
2096                 InfoExtractor.__init__(self, downloader)
2097
2098         def report_download_webpage(self, video_id):
2099                 """Report webpage download."""
2100                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2101                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2102
2103         def report_extraction(self, video_id):
2104                 """Report information extraction."""
2105                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2106
2107         def _real_extract(self, url):
2108                 # At this point we have a new video
2109                 self._downloader.increment_downloads()
2110
2111                 video_id = url.split('/')[-1]
2112                 request = urllib2.Request(url)
2113                 try:
2114                         self.report_download_webpage(video_id)
2115                         webpage = urllib2.urlopen(request).read()
2116                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2117                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2118                         return
2119                 except ValueError, err:
2120                         # since this is the last-resort InfoExtractor, if
2121                         # this error is thrown, it'll be thrown here
2122                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2123                         return
2124
2125                 self.report_extraction(video_id)
2126                 # Start with something easy: JW Player in SWFObject
2127                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2128                 if mobj is None:
2129                         # Broaden the search a little bit
2130                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2131                 if mobj is None:
2132                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2133                         return
2134
2135                 # It's possible that one of the regexes
2136                 # matched, but returned an empty group:
2137                 if mobj.group(1) is None:
2138                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2139                         return
2140
2141                 video_url = urllib.unquote(mobj.group(1))
2142                 video_id = os.path.basename(video_url)
2143
2144                 # here's a fun little line of code for you:
2145                 video_extension = os.path.splitext(video_id)[1][1:]
2146                 video_id = os.path.splitext(video_id)[0]
2147
2148                 # it's tempting to parse this further, but you would
2149                 # have to take into account all the variations like
2150                 #   Video Title - Site Name
2151                 #   Site Name | Video Title
2152                 #   Video Title - Tagline | Site Name
2153                 # and so on and so forth; it's just not practical
2154                 mobj = re.search(r'<title>(.*)</title>', webpage)
2155                 if mobj is None:
2156                         self._downloader.trouble(u'ERROR: unable to extract title')
2157                         return
2158                 video_title = mobj.group(1).decode('utf-8')
2159                 video_title = sanitize_title(video_title)
2160                 simple_title = _simplify_title(video_title)
2161
2162                 # video uploader is domain name
2163                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2164                 if mobj is None:
2165                         self._downloader.trouble(u'ERROR: unable to extract title')
2166                         return
2167                 video_uploader = mobj.group(1).decode('utf-8')
2168
2169                 try:
2170                         # Process video information
2171                         self._downloader.process_info({
2172                                 'id':           video_id.decode('utf-8'),
2173                                 'url':          video_url.decode('utf-8'),
2174                                 'uploader':     video_uploader,
2175                                 'upload_date':  u'NA',
2176                                 'title':        video_title,
2177                                 'stitle':       simple_title,
2178                                 'ext':          video_extension.decode('utf-8'),
2179                                 'format':       u'NA',
2180                                 'player_url':   None,
2181                         })
2182                 except UnavailableVideoError, err:
2183                         self._downloader.trouble(u'\nERROR: unable to download video')
2184
2185
2186 class YoutubeSearchIE(InfoExtractor):
2187         """Information Extractor for YouTube search queries."""
2188         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2189         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2190         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2191         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2192         _youtube_ie = None
2193         _max_youtube_results = 1000
2194         IE_NAME = u'youtube:search'
2195
2196         def __init__(self, youtube_ie, downloader=None):
2197                 InfoExtractor.__init__(self, downloader)
2198                 self._youtube_ie = youtube_ie
2199
2200         def report_download_page(self, query, pagenum):
2201                 """Report attempt to download playlist page with given number."""
2202                 query = query.decode(preferredencoding())
2203                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2204
2205         def _real_initialize(self):
2206                 self._youtube_ie.initialize()
2207
2208         def _real_extract(self, query):
2209                 mobj = re.match(self._VALID_URL, query)
2210                 if mobj is None:
2211                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2212                         return
2213
2214                 prefix, query = query.split(':')
2215                 prefix = prefix[8:]
2216                 query = query.encode('utf-8')
2217                 if prefix == '':
2218                         self._download_n_results(query, 1)
2219                         return
2220                 elif prefix == 'all':
2221                         self._download_n_results(query, self._max_youtube_results)
2222                         return
2223                 else:
2224                         try:
2225                                 n = long(prefix)
2226                                 if n <= 0:
2227                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2228                                         return
2229                                 elif n > self._max_youtube_results:
2230                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2231                                         n = self._max_youtube_results
2232                                 self._download_n_results(query, n)
2233                                 return
2234                         except ValueError: # parsing prefix as integer fails
2235                                 self._download_n_results(query, 1)
2236                                 return
2237
2238         def _download_n_results(self, query, n):
2239                 """Downloads a specified number of results for a query"""
2240
2241                 video_ids = []
2242                 already_seen = set()
2243                 pagenum = 1
2244
2245                 while True:
2246                         self.report_download_page(query, pagenum)
2247                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2248                         request = urllib2.Request(result_url)
2249                         try:
2250                                 page = urllib2.urlopen(request).read()
2251                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2252                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2253                                 return
2254
2255                         # Extract video identifiers
2256                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2257                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2258                                 if video_id not in already_seen:
2259                                         video_ids.append(video_id)
2260                                         already_seen.add(video_id)
2261                                         if len(video_ids) == n:
2262                                                 # Specified n videos reached
2263                                                 for id in video_ids:
2264                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2265                                                 return
2266
2267                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2268                                 for id in video_ids:
2269                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2270                                 return
2271
2272                         pagenum = pagenum + 1
2273
2274
2275 class GoogleSearchIE(InfoExtractor):
2276         """Information Extractor for Google Video search queries."""
2277         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2278         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2279         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2280         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2281         _google_ie = None
2282         _max_google_results = 1000
2283         IE_NAME = u'video.google:search'
2284
2285         def __init__(self, google_ie, downloader=None):
2286                 InfoExtractor.__init__(self, downloader)
2287                 self._google_ie = google_ie
2288
2289         def report_download_page(self, query, pagenum):
2290                 """Report attempt to download playlist page with given number."""
2291                 query = query.decode(preferredencoding())
2292                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2293
2294         def _real_initialize(self):
2295                 self._google_ie.initialize()
2296
2297         def _real_extract(self, query):
2298                 mobj = re.match(self._VALID_URL, query)
2299                 if mobj is None:
2300                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2301                         return
2302
2303                 prefix, query = query.split(':')
2304                 prefix = prefix[8:]
2305                 query = query.encode('utf-8')
2306                 if prefix == '':
2307                         self._download_n_results(query, 1)
2308                         return
2309                 elif prefix == 'all':
2310                         self._download_n_results(query, self._max_google_results)
2311                         return
2312                 else:
2313                         try:
2314                                 n = long(prefix)
2315                                 if n <= 0:
2316                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2317                                         return
2318                                 elif n > self._max_google_results:
2319                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2320                                         n = self._max_google_results
2321                                 self._download_n_results(query, n)
2322                                 return
2323                         except ValueError: # parsing prefix as integer fails
2324                                 self._download_n_results(query, 1)
2325                                 return
2326
2327         def _download_n_results(self, query, n):
2328                 """Downloads a specified number of results for a query"""
2329
2330                 video_ids = []
2331                 already_seen = set()
2332                 pagenum = 1
2333
2334                 while True:
2335                         self.report_download_page(query, pagenum)
2336                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2337                         request = urllib2.Request(result_url)
2338                         try:
2339                                 page = urllib2.urlopen(request).read()
2340                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2341                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2342                                 return
2343
2344                         # Extract video identifiers
2345                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2346                                 video_id = mobj.group(1)
2347                                 if video_id not in already_seen:
2348                                         video_ids.append(video_id)
2349                                         already_seen.add(video_id)
2350                                         if len(video_ids) == n:
2351                                                 # Specified n videos reached
2352                                                 for id in video_ids:
2353                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2354                                                 return
2355
2356                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2357                                 for id in video_ids:
2358                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2359                                 return
2360
2361                         pagenum = pagenum + 1
2362
2363
2364 class YahooSearchIE(InfoExtractor):
2365         """Information Extractor for Yahoo! Video search queries."""
2366         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2367         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2368         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2369         _MORE_PAGES_INDICATOR = r'\s*Next'
2370         _yahoo_ie = None
2371         _max_yahoo_results = 1000
2372         IE_NAME = u'video.yahoo:search'
2373
2374         def __init__(self, yahoo_ie, downloader=None):
2375                 InfoExtractor.__init__(self, downloader)
2376                 self._yahoo_ie = yahoo_ie
2377
2378         def report_download_page(self, query, pagenum):
2379                 """Report attempt to download playlist page with given number."""
2380                 query = query.decode(preferredencoding())
2381                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2382
2383         def _real_initialize(self):
2384                 self._yahoo_ie.initialize()
2385
2386         def _real_extract(self, query):
2387                 mobj = re.match(self._VALID_URL, query)
2388                 if mobj is None:
2389                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2390                         return
2391
2392                 prefix, query = query.split(':')
2393                 prefix = prefix[8:]
2394                 query = query.encode('utf-8')
2395                 if prefix == '':
2396                         self._download_n_results(query, 1)
2397                         return
2398                 elif prefix == 'all':
2399                         self._download_n_results(query, self._max_yahoo_results)
2400                         return
2401                 else:
2402                         try:
2403                                 n = long(prefix)
2404                                 if n <= 0:
2405                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2406                                         return
2407                                 elif n > self._max_yahoo_results:
2408                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2409                                         n = self._max_yahoo_results
2410                                 self._download_n_results(query, n)
2411                                 return
2412                         except ValueError: # parsing prefix as integer fails
2413                                 self._download_n_results(query, 1)
2414                                 return
2415
2416         def _download_n_results(self, query, n):
2417                 """Downloads a specified number of results for a query"""
2418
2419                 video_ids = []
2420                 already_seen = set()
2421                 pagenum = 1
2422
2423                 while True:
2424                         self.report_download_page(query, pagenum)
2425                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2426                         request = urllib2.Request(result_url)
2427                         try:
2428                                 page = urllib2.urlopen(request).read()
2429                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2430                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2431                                 return
2432
2433                         # Extract video identifiers
2434                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2435                                 video_id = mobj.group(1)
2436                                 if video_id not in already_seen:
2437                                         video_ids.append(video_id)
2438                                         already_seen.add(video_id)
2439                                         if len(video_ids) == n:
2440                                                 # Specified n videos reached
2441                                                 for id in video_ids:
2442                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2443                                                 return
2444
2445                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2446                                 for id in video_ids:
2447                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2448                                 return
2449
2450                         pagenum = pagenum + 1
2451
2452
2453 class YoutubePlaylistIE(InfoExtractor):
2454         """Information Extractor for YouTube playlists."""
2455
2456         _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2457         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2458         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2459         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2460         _youtube_ie = None
2461         IE_NAME = u'youtube:playlist'
2462
2463         def __init__(self, youtube_ie, downloader=None):
2464                 InfoExtractor.__init__(self, downloader)
2465                 self._youtube_ie = youtube_ie
2466
2467         def report_download_page(self, playlist_id, pagenum):
2468                 """Report attempt to download playlist page with given number."""
2469                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2470
2471         def _real_initialize(self):
2472                 self._youtube_ie.initialize()
2473
2474         def _real_extract(self, url):
2475                 # Extract playlist id
2476                 mobj = re.match(self._VALID_URL, url)
2477                 if mobj is None:
2478                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2479                         return
2480
2481                 # Single video case
2482                 if mobj.group(3) is not None:
2483                         self._youtube_ie.extract(mobj.group(3))
2484                         return
2485
2486                 # Download playlist pages
2487                 # prefix is 'p' as default for playlists but there are other types that need extra care
2488                 playlist_prefix = mobj.group(1)
2489                 if playlist_prefix == 'a':
2490                         playlist_access = 'artist'
2491                 else:
2492                         playlist_prefix = 'p'
2493                         playlist_access = 'view_play_list'
2494                 playlist_id = mobj.group(2)
2495                 video_ids = []
2496                 pagenum = 1
2497
2498                 while True:
2499                         self.report_download_page(playlist_id, pagenum)
2500                         url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2501                         request = urllib2.Request(url)
2502                         try:
2503                                 page = urllib2.urlopen(request).read()
2504                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2505                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2506                                 return
2507
2508                         # Extract video identifiers
2509                         ids_in_page = []
2510                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2511                                 if mobj.group(1) not in ids_in_page:
2512                                         ids_in_page.append(mobj.group(1))
2513                         video_ids.extend(ids_in_page)
2514
2515                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2516                                 break
2517                         pagenum = pagenum + 1
2518
2519                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2520                 playlistend = self._downloader.params.get('playlistend', -1)
2521                 video_ids = video_ids[playliststart:playlistend]
2522
2523                 for id in video_ids:
2524                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2525                 return
2526
2527
2528 class YoutubeUserIE(InfoExtractor):
2529         """Information Extractor for YouTube users."""
2530
2531         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2532         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2533         _GDATA_PAGE_SIZE = 50
2534         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2535         _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2536         _youtube_ie = None
2537         IE_NAME = u'youtube:user'
2538
2539         def __init__(self, youtube_ie, downloader=None):
2540                 InfoExtractor.__init__(self, downloader)
2541                 self._youtube_ie = youtube_ie
2542
2543         def report_download_page(self, username, start_index):
2544                 """Report attempt to download user page."""
2545                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2546                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2547
2548         def _real_initialize(self):
2549                 self._youtube_ie.initialize()
2550
2551         def _real_extract(self, url):
2552                 # Extract username
2553                 mobj = re.match(self._VALID_URL, url)
2554                 if mobj is None:
2555                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2556                         return
2557
2558                 username = mobj.group(1)
2559
2560                 # Download video ids using YouTube Data API. Result size per
2561                 # query is limited (currently to 50 videos) so we need to query
2562                 # page by page until there are no video ids - it means we got
2563                 # all of them.
2564
2565                 video_ids = []
2566                 pagenum = 0
2567
2568                 while True:
2569                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2570                         self.report_download_page(username, start_index)
2571
2572                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2573
2574                         try:
2575                                 page = urllib2.urlopen(request).read()
2576                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2577                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2578                                 return
2579
2580                         # Extract video identifiers
2581                         ids_in_page = []
2582
2583                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2584                                 if mobj.group(1) not in ids_in_page:
2585                                         ids_in_page.append(mobj.group(1))
2586
2587                         video_ids.extend(ids_in_page)
2588
2589                         # A little optimization - if current page is not
2590                         # "full", ie. does not contain PAGE_SIZE video ids then
2591                         # we can assume that this page is the last one - there
2592                         # are no more ids on further pages - no need to query
2593                         # again.
2594
2595                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2596                                 break
2597
2598                         pagenum += 1
2599
2600                 all_ids_count = len(video_ids)
2601                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2602                 playlistend = self._downloader.params.get('playlistend', -1)
2603
2604                 if playlistend == -1:
2605                         video_ids = video_ids[playliststart:]
2606                 else:
2607                         video_ids = video_ids[playliststart:playlistend]
2608
2609                 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2610                                 (username, all_ids_count, len(video_ids)))
2611
2612                 for video_id in video_ids:
2613                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2614
2615
2616 class DepositFilesIE(InfoExtractor):
2617         """Information extractor for depositfiles.com"""
2618
2619         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2620         IE_NAME = u'DepositFiles'
2621
2622         def __init__(self, downloader=None):
2623                 InfoExtractor.__init__(self, downloader)
2624
2625         def report_download_webpage(self, file_id):
2626                 """Report webpage download."""
2627                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2628
2629         def report_extraction(self, file_id):
2630                 """Report information extraction."""
2631                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2632
2633         def _real_extract(self, url):
2634                 # At this point we have a new file
2635                 self._downloader.increment_downloads()
2636
2637                 file_id = url.split('/')[-1]
2638                 # Rebuild url in english locale
2639                 url = 'http://depositfiles.com/en/files/' + file_id
2640
2641                 # Retrieve file webpage with 'Free download' button pressed
2642                 free_download_indication = { 'gateway_result' : '1' }
2643                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2644                 try:
2645                         self.report_download_webpage(file_id)
2646                         webpage = urllib2.urlopen(request).read()
2647                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2648                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2649                         return
2650
2651                 # Search for the real file URL
2652                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2653                 if (mobj is None) or (mobj.group(1) is None):
2654                         # Try to figure out reason of the error.
2655                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2656                         if (mobj is not None) and (mobj.group(1) is not None):
2657                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2658                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2659                         else:
2660                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2661                         return
2662
2663                 file_url = mobj.group(1)
2664                 file_extension = os.path.splitext(file_url)[1][1:]
2665
2666                 # Search for file title
2667                 mobj = re.search(r'<b title="(.*?)">', webpage)
2668                 if mobj is None:
2669                         self._downloader.trouble(u'ERROR: unable to extract title')
2670                         return
2671                 file_title = mobj.group(1).decode('utf-8')
2672
2673                 try:
2674                         # Process file information
2675                         self._downloader.process_info({
2676                                 'id':           file_id.decode('utf-8'),
2677                                 'url':          file_url.decode('utf-8'),
2678                                 'uploader':     u'NA',
2679                                 'upload_date':  u'NA',
2680                                 'title':        file_title,
2681                                 'stitle':       file_title,
2682                                 'ext':          file_extension.decode('utf-8'),
2683                                 'format':       u'NA',
2684                                 'player_url':   None,
2685                         })
2686                 except UnavailableVideoError, err:
2687                         self._downloader.trouble(u'ERROR: unable to download file')
2688
2689
2690 class FacebookIE(InfoExtractor):
2691         """Information Extractor for Facebook"""
2692
2693         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2694         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2695         _NETRC_MACHINE = 'facebook'
2696         _available_formats = ['video', 'highqual', 'lowqual']
2697         _video_extensions = {
2698                 'video': 'mp4',
2699                 'highqual': 'mp4',
2700                 'lowqual': 'mp4',
2701         }
2702         IE_NAME = u'facebook'
2703
2704         def __init__(self, downloader=None):
2705                 InfoExtractor.__init__(self, downloader)
2706
2707         def _reporter(self, message):
2708                 """Add header and report message."""
2709                 self._downloader.to_screen(u'[facebook] %s' % message)
2710
2711         def report_login(self):
2712                 """Report attempt to log in."""
2713                 self._reporter(u'Logging in')
2714
2715         def report_video_webpage_download(self, video_id):
2716                 """Report attempt to download video webpage."""
2717                 self._reporter(u'%s: Downloading video webpage' % video_id)
2718
2719         def report_information_extraction(self, video_id):
2720                 """Report attempt to extract video information."""
2721                 self._reporter(u'%s: Extracting video information' % video_id)
2722
2723         def _parse_page(self, video_webpage):
2724                 """Extract video information from page"""
2725                 # General data
2726                 data = {'title': r'\("video_title", "(.*?)"\)',
2727                         'description': r'<div class="datawrap">(.*?)</div>',
2728                         'owner': r'\("video_owner_name", "(.*?)"\)',
2729                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2730                         }
2731                 video_info = {}
2732                 for piece in data.keys():
2733                         mobj = re.search(data[piece], video_webpage)
2734                         if mobj is not None:
2735                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2736
2737                 # Video urls
2738                 video_urls = {}
2739                 for fmt in self._available_formats:
2740                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2741                         if mobj is not None:
2742                                 # URL is in a Javascript segment inside an escaped Unicode format within
2743                                 # the generally utf-8 page
2744                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2745                 video_info['video_urls'] = video_urls
2746
2747                 return video_info
2748
2749         def _real_initialize(self):
2750                 if self._downloader is None:
2751                         return
2752
2753                 useremail = None
2754                 password = None
2755                 downloader_params = self._downloader.params
2756
2757                 # Attempt to use provided username and password or .netrc data
2758                 if downloader_params.get('username', None) is not None:
2759                         useremail = downloader_params['username']
2760                         password = downloader_params['password']
2761                 elif downloader_params.get('usenetrc', False):
2762                         try:
2763                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2764                                 if info is not None:
2765                                         useremail = info[0]
2766                                         password = info[2]
2767                                 else:
2768                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2769                         except (IOError, netrc.NetrcParseError), err:
2770                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2771                                 return
2772
2773                 if useremail is None:
2774                         return
2775
2776                 # Log in
2777                 login_form = {
2778                         'email': useremail,
2779                         'pass': password,
2780                         'login': 'Log+In'
2781                         }
2782                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2783                 try:
2784                         self.report_login()
2785                         login_results = urllib2.urlopen(request).read()
2786                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2787                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2788                                 return
2789                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2790                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2791                         return
2792
2793         def _real_extract(self, url):
2794                 mobj = re.match(self._VALID_URL, url)
2795                 if mobj is None:
2796                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2797                         return
2798                 video_id = mobj.group('ID')
2799
2800                 # Get video webpage
2801                 self.report_video_webpage_download(video_id)
2802                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2803                 try:
2804                         page = urllib2.urlopen(request)
2805                         video_webpage = page.read()
2806                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2807                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2808                         return
2809
2810                 # Start extracting information
2811                 self.report_information_extraction(video_id)
2812
2813                 # Extract information
2814                 video_info = self._parse_page(video_webpage)
2815
2816                 # uploader
2817                 if 'owner' not in video_info:
2818                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2819                         return
2820                 video_uploader = video_info['owner']
2821
2822                 # title
2823                 if 'title' not in video_info:
2824                         self._downloader.trouble(u'ERROR: unable to extract video title')
2825                         return
2826                 video_title = video_info['title']
2827                 video_title = video_title.decode('utf-8')
2828                 video_title = sanitize_title(video_title)
2829
2830                 simple_title = _simplify_title(video_title)
2831
2832                 # thumbnail image
2833                 if 'thumbnail' not in video_info:
2834                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2835                         video_thumbnail = ''
2836                 else:
2837                         video_thumbnail = video_info['thumbnail']
2838
2839                 # upload date
2840                 upload_date = u'NA'
2841                 if 'upload_date' in video_info:
2842                         upload_time = video_info['upload_date']
2843                         timetuple = email.utils.parsedate_tz(upload_time)
2844                         if timetuple is not None:
2845                                 try:
2846                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2847                                 except:
2848                                         pass
2849
2850                 # description
2851                 video_description = video_info.get('description', 'No description available.')
2852
2853                 url_map = video_info['video_urls']
2854                 if len(url_map.keys()) > 0:
2855                         # Decide which formats to download
2856                         req_format = self._downloader.params.get('format', None)
2857                         format_limit = self._downloader.params.get('format_limit', None)
2858
2859                         if format_limit is not None and format_limit in self._available_formats:
2860                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2861                         else:
2862                                 format_list = self._available_formats
2863                         existing_formats = [x for x in format_list if x in url_map]
2864                         if len(existing_formats) == 0:
2865                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2866                                 return
2867                         if req_format is None:
2868                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2869                         elif req_format == 'worst':
2870                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2871                         elif req_format == '-1':
2872                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2873                         else:
2874                                 # Specific format
2875                                 if req_format not in url_map:
2876                                         self._downloader.trouble(u'ERROR: requested format not available')
2877                                         return
2878                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2879
2880                 for format_param, video_real_url in video_url_list:
2881
2882                         # At this point we have a new video
2883                         self._downloader.increment_downloads()
2884
2885                         # Extension
2886                         video_extension = self._video_extensions.get(format_param, 'mp4')
2887
2888                         try:
2889                                 # Process video information
2890                                 self._downloader.process_info({
2891                                         'id':           video_id.decode('utf-8'),
2892                                         'url':          video_real_url.decode('utf-8'),
2893                                         'uploader':     video_uploader.decode('utf-8'),
2894                                         'upload_date':  upload_date,
2895                                         'title':        video_title,
2896                                         'stitle':       simple_title,
2897                                         'ext':          video_extension.decode('utf-8'),
2898                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2899                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2900                                         'description':  video_description.decode('utf-8'),
2901                                         'player_url':   None,
2902                                 })
2903                         except UnavailableVideoError, err:
2904                                 self._downloader.trouble(u'\nERROR: unable to download video')
2905
class BlipTVIE(InfoExtractor):
	"""Information extractor for blip.tv"""

	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
	# Used to pull the filename extension out of the media URL.
	_URL_EXT = r'^.*\.([a-z0-9]+)$'
	IE_NAME = u'blip.tv'

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

	def report_direct_download(self, title):
		"""Report that a direct (non-JSON) download was detected."""
		self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

	def _real_extract(self, url):
		"""Fetch video metadata via blip.tv's skin=json API, or fall back to
		a direct download when the URL already points at a media file."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Append the JSON-skin parameters, respecting an existing query string.
		if '?' in url:
			cchar = '&'
		else:
			cchar = '?'
		json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
		request = urllib2.Request(json_url)
		self.report_extraction(mobj.group(1))
		info = None
		try:
			urlh = urllib2.urlopen(request)
			if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
				# The URL itself is the media file; synthesize the info dict
				# from the filename and pass the open handle on for download.
				basename = url.split('/')[-1]
				title,ext = os.path.splitext(basename)
				ext = ext.replace('.', '')
				self.report_direct_download(title)
				info = {
					'id': title,
					'url': url,
					'title': title,
					'stitle': _simplify_title(title),
					'ext': ext,
					'urlhandle': urlh
				}
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
			return
		if info is None: # Regular URL
			# urlh is still open from above; read the JSON body from it.
			try:
				json_code = urlh.read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
				return

			try:
				# NOTE(review): 'json' is not imported in the visible header
				# of this file — confirm it is imported (or aliased) elsewhere.
				json_data = json.loads(json_code)
				# Responses may wrap the payload in a 'Post' envelope.
				if 'Post' in json_data:
					data = json_data['Post']
				else:
					data = json_data

				upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
				video_url = data['media']['url']
				umobj = re.match(self._URL_EXT, video_url)
				if umobj is None:
					raise ValueError('Can not determine filename extension')
				ext = umobj.group(1)

				info = {
					'id': data['item_id'],
					'url': video_url,
					'uploader': data['display_name'],
					'upload_date': upload_date,
					'title': data['title'],
					'stitle': _simplify_title(data['title']),
					'ext': ext,
					'format': data['media']['mimeType'],
					'thumbnail': data['thumbnailUrl'],
					'description': data['description'],
					'player_url': data['embedUrl']
				}
			except (ValueError,KeyError), err:
				# Covers both malformed JSON values and missing fields.
				self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
				return

		self._downloader.increment_downloads()

		try:
			self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download video')
2997
2998
2999 class MyVideoIE(InfoExtractor):
3000         """Information Extractor for myvideo.de."""
3001
3002         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3003         IE_NAME = u'myvideo'
3004
3005         def __init__(self, downloader=None):
3006                 InfoExtractor.__init__(self, downloader)
3007         
3008         def report_download_webpage(self, video_id):
3009                 """Report webpage download."""
3010                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3011
3012         def report_extraction(self, video_id):
3013                 """Report information extraction."""
3014                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3015
3016         def _real_extract(self,url):
3017                 mobj = re.match(self._VALID_URL, url)
3018                 if mobj is None:
3019                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
3020                         return
3021
3022                 video_id = mobj.group(1)
3023
3024                 # Get video webpage
3025                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3026                 try:
3027                         self.report_download_webpage(video_id)
3028                         webpage = urllib2.urlopen(request).read()
3029                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3030                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3031                         return
3032
3033                 self.report_extraction(video_id)
3034                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3035                                  webpage)
3036                 if mobj is None:
3037                         self._downloader.trouble(u'ERROR: unable to extract media URL')
3038                         return
3039                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3040
3041                 mobj = re.search('<title>([^<]+)</title>', webpage)
3042                 if mobj is None:
3043                         self._downloader.trouble(u'ERROR: unable to extract title')
3044                         return
3045
3046                 video_title = mobj.group(1)
3047                 video_title = sanitize_title(video_title)
3048
3049                 simple_title = _simplify_title(video_title)
3050
3051                 try:
3052                         self._downloader.process_info({
3053                                 'id':           video_id,
3054                                 'url':          video_url,
3055                                 'uploader':     u'NA',
3056                                 'upload_date':  u'NA',
3057                                 'title':        video_title,
3058                                 'stitle':       simple_title,
3059                                 'ext':          u'flv',
3060                                 'format':       u'NA',
3061                                 'player_url':   None,
3062                         })
3063                 except UnavailableVideoError:
3064                         self._downloader.trouble(u'\nERROR: Unable to download video')
3065
3066 class ComedyCentralIE(InfoExtractor):
3067         """Information extractor for The Daily Show and Colbert Report """
3068
3069         _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3070         IE_NAME = u'comedycentral'
3071
3072         def report_extraction(self, episode_id):
3073                 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3074         
3075         def report_config_download(self, episode_id):
3076                 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3077
3078         def report_index_download(self, episode_id):
3079                 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3080
3081         def report_player_url(self, episode_id):
3082                 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3083
3084         def _real_extract(self, url):
3085                 mobj = re.match(self._VALID_URL, url)
3086                 if mobj is None:
3087                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3088                         return
3089
3090                 if mobj.group('shortname'):
3091                         if mobj.group('shortname') in ('tds', 'thedailyshow'):
3092                                 url = 'http://www.thedailyshow.com/full-episodes/'
3093                         else:
3094                                 url = 'http://www.colbertnation.com/full-episodes/'
3095                         mobj = re.match(self._VALID_URL, url)
3096                         assert mobj is not None
3097
3098                 dlNewest = not mobj.group('episode')
3099                 if dlNewest:
3100                         epTitle = mobj.group('showname')
3101                 else:
3102                         epTitle = mobj.group('episode')
3103
3104                 req = urllib2.Request(url)
3105                 self.report_extraction(epTitle)
3106                 try:
3107                         htmlHandle = urllib2.urlopen(req)
3108                         html = htmlHandle.read()
3109                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3110                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3111                         return
3112                 if dlNewest:
3113                         url = htmlHandle.geturl()
3114                         mobj = re.match(self._VALID_URL, url)
3115                         if mobj is None:
3116                                 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3117                                 return
3118                         if mobj.group('episode') == '':
3119                                 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3120                                 return
3121                         epTitle = mobj.group('episode')
3122
3123                 mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
3124                 if len(mMovieParams) == 0:
3125                         self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3126                         return
3127
3128                 playerUrl_raw = mMovieParams[0][0]
3129                 self.report_player_url(epTitle)
3130                 try:
3131                         urlHandle = urllib2.urlopen(playerUrl_raw)
3132                         playerUrl = urlHandle.geturl()
3133                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3134                         self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
3135                         return
3136
3137                 uri = mMovieParams[0][1]
3138                 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3139                 self.report_index_download(epTitle)
3140                 try:
3141                         indexXml = urllib2.urlopen(indexUrl).read()
3142                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3143                         self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
3144                         return
3145
3146                 idoc = xml.etree.ElementTree.fromstring(indexXml)
3147                 itemEls = idoc.findall('.//item')
3148                 for itemEl in itemEls:
3149                         mediaId = itemEl.findall('./guid')[0].text
3150                         shortMediaId = mediaId.split(':')[-1]
3151                         showId = mediaId.split(':')[-2].replace('.com', '')
3152                         officialTitle = itemEl.findall('./title')[0].text
3153                         officialDate = itemEl.findall('./pubDate')[0].text
3154
3155                         configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3156                                                 urllib.urlencode({'uri': mediaId}))
3157                         configReq = urllib2.Request(configUrl)
3158                         self.report_config_download(epTitle)
3159                         try:
3160                                 configXml = urllib2.urlopen(configReq).read()
3161                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3162                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3163                                 return
3164
3165                         cdoc = xml.etree.ElementTree.fromstring(configXml)
3166                         turls = []
3167                         for rendition in cdoc.findall('.//rendition'):
3168                                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3169                                 turls.append(finfo)
3170
3171                         if len(turls) == 0:
3172                                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3173                                 continue
3174
3175                         # For now, just pick the highest bitrate
3176                         format,video_url = turls[-1]
3177
3178                         self._downloader.increment_downloads()
3179
3180                         effTitle = showId + '-' + epTitle
3181                         info = {
3182                                 'id': shortMediaId,
3183                                 'url': video_url,
3184                                 'uploader': showId,
3185                                 'upload_date': officialDate,
3186                                 'title': effTitle,
3187                                 'stitle': self._simplify_title(effTitle),
3188                                 'ext': 'mp4',
3189                                 'format': format,
3190                                 'thumbnail': None,
3191                                 'description': officialTitle,
3192                                 'player_url': playerUrl
3193                         }
3194
3195                         try:
3196                                 self._downloader.process_info(info)
3197                         except UnavailableVideoError, err:
3198                                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
3199                                 continue
3200
3201
3202 class EscapistIE(InfoExtractor):
3203         """Information extractor for The Escapist """
3204
3205         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3206         IE_NAME = u'escapist'
3207
3208         def report_extraction(self, showName):
3209                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3210
3211         def report_config_download(self, showName):
3212                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3213
3214         def _real_extract(self, url):
3215                 htmlParser = HTMLParser.HTMLParser()
3216
3217                 mobj = re.match(self._VALID_URL, url)
3218                 if mobj is None:
3219                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3220                         return
3221                 showName = mobj.group('showname')
3222                 videoId = mobj.group('episode')
3223
3224                 self.report_extraction(showName)
3225                 try:
3226                         webPage = urllib2.urlopen(url).read()
3227                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3228                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3229                         return
3230
3231                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3232                 description = htmlParser.unescape(descMatch.group(1))
3233                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3234                 imgUrl = htmlParser.unescape(imgMatch.group(1))
3235                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3236                 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3237                 configUrlMatch = re.search('config=(.*)$', playerUrl)
3238                 configUrl = urllib2.unquote(configUrlMatch.group(1))
3239
3240                 self.report_config_download(showName)
3241                 try:
3242                         configJSON = urllib2.urlopen(configUrl).read()
3243                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3244                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3245                         return
3246
3247                 # Technically, it's JavaScript, not JSON
3248                 configJSON = configJSON.replace("'", '"')
3249
3250                 try:
3251                         config = json.loads(configJSON)
3252                 except (ValueError,), err:
3253                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3254                         return
3255
3256                 playlist = config['playlist']
3257                 videoUrl = playlist[1]['url']
3258
3259                 self._downloader.increment_downloads()
3260                 info = {
3261                         'id': videoId,
3262                         'url': videoUrl,
3263                         'uploader': showName,
3264                         'upload_date': None,
3265                         'title': showName,
3266                         'stitle': _simplify_title(showName),
3267                         'ext': 'flv',
3268                         'format': 'flv',
3269                         'thumbnail': imgUrl,
3270                         'description': description,
3271                         'player_url': playerUrl,
3272                 }
3273
3274                 try:
3275                         self._downloader.process_info(info)
3276                 except UnavailableVideoError, err:
3277                         self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3278
3279
3280 class CollegeHumorIE(InfoExtractor):
3281         """Information extractor for collegehumor.com"""
3282
3283         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3284         IE_NAME = u'collegehumor'
3285
3286         def report_webpage(self, video_id):
3287                 """Report information extraction."""
3288                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3289
3290         def report_extraction(self, video_id):
3291                 """Report information extraction."""
3292                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3293
3294         def _real_extract(self, url):
3295                 htmlParser = HTMLParser.HTMLParser()
3296
3297                 mobj = re.match(self._VALID_URL, url)
3298                 if mobj is None:
3299                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3300                         return
3301                 video_id = mobj.group('videoid')
3302
3303                 self.report_webpage(video_id)
3304                 request = urllib2.Request(url)
3305                 try:
3306                         webpage = urllib2.urlopen(request).read()
3307                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3308                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3309                         return
3310
3311                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3312                 if m is None:
3313                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3314                         return
3315                 internal_video_id = m.group('internalvideoid')
3316
3317                 info = {
3318                         'id': video_id,
3319                         'internal_id': internal_video_id,
3320                 }
3321
3322                 self.report_extraction(video_id)
3323                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3324                 try:
3325                         metaXml = urllib2.urlopen(xmlUrl).read()
3326                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3327                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3328                         return
3329
3330                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3331                 try:
3332                         videoNode = mdoc.findall('./video')[0]
3333                         info['description'] = videoNode.findall('./description')[0].text
3334                         info['title'] = videoNode.findall('./caption')[0].text
3335                         info['stitle'] = _simplify_title(info['title'])
3336                         info['url'] = videoNode.findall('./file')[0].text
3337                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3338                         info['ext'] = info['url'].rpartition('.')[2]
3339                         info['format'] = info['ext']
3340                 except IndexError:
3341                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3342                         return
3343
3344                 self._downloader.increment_downloads()
3345
3346                 try:
3347                         self._downloader.process_info(info)
3348                 except UnavailableVideoError, err:
3349                         self._downloader.trouble(u'\nERROR: unable to download video')
3350
3351
3352 class XVideosIE(InfoExtractor):
3353         """Information extractor for xvideos.com"""
3354
3355         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3356         IE_NAME = u'xvideos'
3357
3358         def report_webpage(self, video_id):
3359                 """Report information extraction."""
3360                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3361
3362         def report_extraction(self, video_id):
3363                 """Report information extraction."""
3364                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3365
3366         def _real_extract(self, url):
3367                 htmlParser = HTMLParser.HTMLParser()
3368
3369                 mobj = re.match(self._VALID_URL, url)
3370                 if mobj is None:
3371                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3372                         return
3373                 video_id = mobj.group(1).decode('utf-8')
3374
3375                 self.report_webpage(video_id)
3376
3377                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3378                 try:
3379                         webpage = urllib2.urlopen(request).read()
3380                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3381                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3382                         return
3383
3384                 self.report_extraction(video_id)
3385
3386
3387                 # Extract video URL
3388                 mobj = re.search(r'flv_url=(.+?)&', webpage)
3389                 if mobj is None:
3390                         self._downloader.trouble(u'ERROR: unable to extract video url')
3391                         return
3392                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3393
3394
3395                 # Extract title
3396                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3397                 if mobj is None:
3398                         self._downloader.trouble(u'ERROR: unable to extract video title')
3399                         return
3400                 video_title = mobj.group(1).decode('utf-8')
3401
3402
3403                 # Extract video thumbnail
3404                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3405                 if mobj is None:
3406                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3407                         return
3408                 video_thumbnail = mobj.group(1).decode('utf-8')
3409
3410
3411
3412                 self._downloader.increment_downloads()
3413                 info = {
3414                         'id': video_id,
3415                         'url': video_url,
3416                         'uploader': None,
3417                         'upload_date': None,
3418                         'title': video_title,
3419                         'stitle': _simplify_title(video_title),
3420                         'ext': 'flv',
3421                         'format': 'flv',
3422                         'thumbnail': video_thumbnail,
3423                         'description': None,
3424                         'player_url': None,
3425                 }
3426
3427                 try:
3428                         self._downloader.process_info(info)
3429                 except UnavailableVideoError, err:
3430                         self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3431
3432
3433 class SoundcloudIE(InfoExtractor):
3434         """Information extractor for soundcloud.com
3435            To access the media, the uid of the song and a stream token
3436            must be extracted from the page source and the script must make
3437            a request to media.soundcloud.com/crossdomain.xml. Then
3438            the media can be grabbed by requesting from an url composed
3439            of the stream token and uid
3440          """
3441
3442         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3443         IE_NAME = u'soundcloud'
3444
3445         def __init__(self, downloader=None):
3446                 InfoExtractor.__init__(self, downloader)
3447
3448         def report_webpage(self, video_id):
3449                 """Report information extraction."""
3450                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3451
3452         def report_extraction(self, video_id):
3453                 """Report information extraction."""
3454                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3455
3456         def _real_extract(self, url):
3457                 htmlParser = HTMLParser.HTMLParser()
3458
3459                 mobj = re.match(self._VALID_URL, url)
3460                 if mobj is None:
3461                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3462                         return
3463
3464                 # extract uploader (which is in the url)
3465                 uploader = mobj.group(1).decode('utf-8')
3466                 # extract simple title (uploader + slug of song title)
3467                 slug_title =  mobj.group(2).decode('utf-8')
3468                 simple_title = uploader + '-' + slug_title
3469
3470                 self.report_webpage('%s/%s' % (uploader, slug_title))
3471
3472                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3473                 try:
3474                         webpage = urllib2.urlopen(request).read()
3475                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3476                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3477                         return
3478
3479                 self.report_extraction('%s/%s' % (uploader, slug_title))
3480
3481                 # extract uid and stream token that soundcloud hands out for access
3482                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3483                 if mobj:
3484                         video_id = mobj.group(1)
3485                         stream_token = mobj.group(2)
3486
3487                 # extract unsimplified title
3488                 mobj = re.search('"title":"(.*?)",', webpage)
3489                 if mobj:
3490                         title = mobj.group(1)
3491
3492                 # construct media url (with uid/token)
3493                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3494                 mediaURL = mediaURL % (video_id, stream_token)
3495
3496                 # description
3497                 description = u'No description available'
3498                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3499                 if mobj:
3500                         description = mobj.group(1)
3501                 
3502                 # upload date
3503                 upload_date = None
3504                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3505                 if mobj:
3506                         try:
3507                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3508                         except Exception as e:
3509                                 print str(e)
3510
3511                 # for soundcloud, a request to a cross domain is required for cookies
3512                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3513
3514                 try:
3515                         self._downloader.process_info({
3516                                 'id':           video_id.decode('utf-8'),
3517                                 'url':          mediaURL,
3518                                 'uploader':     uploader.decode('utf-8'),
3519                                 'upload_date':  upload_date,
3520                                 'title':        simple_title.decode('utf-8'),
3521                                 'stitle':       simple_title.decode('utf-8'),
3522                                 'ext':          u'mp3',
3523                                 'format':       u'NA',
3524                                 'player_url':   None,
3525                                 'description': description.decode('utf-8')
3526                         })
3527                 except UnavailableVideoError:
3528                         self._downloader.trouble(u'\nERROR: unable to download video')
3529
3530
3531 class InfoQIE(InfoExtractor):
3532         """Information extractor for infoq.com"""
3533
3534         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3535         IE_NAME = u'infoq'
3536
3537         def report_webpage(self, video_id):
3538                 """Report information extraction."""
3539                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3540
3541         def report_extraction(self, video_id):
3542                 """Report information extraction."""
3543                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3544
3545         def _real_extract(self, url):
3546                 htmlParser = HTMLParser.HTMLParser()
3547
3548                 mobj = re.match(self._VALID_URL, url)
3549                 if mobj is None:
3550                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3551                         return
3552
3553                 self.report_webpage(url)
3554
3555                 request = urllib2.Request(url)
3556                 try:
3557                         webpage = urllib2.urlopen(request).read()
3558                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3559                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3560                         return
3561
3562                 self.report_extraction(url)
3563
3564
3565                 # Extract video URL
3566                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3567                 if mobj is None:
3568                         self._downloader.trouble(u'ERROR: unable to extract video url')
3569                         return
3570                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3571
3572
3573                 # Extract title
3574                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3575                 if mobj is None:
3576                         self._downloader.trouble(u'ERROR: unable to extract video title')
3577                         return
3578                 video_title = mobj.group(1).decode('utf-8')
3579
3580                 # Extract description
3581                 video_description = u'No description available.'
3582                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3583                 if mobj is not None:
3584                         video_description = mobj.group(1).decode('utf-8')
3585
3586                 video_filename = video_url.split('/')[-1]
3587                 video_id, extension = video_filename.split('.')
3588
3589                 self._downloader.increment_downloads()
3590                 info = {
3591                         'id': video_id,
3592                         'url': video_url,
3593                         'uploader': None,
3594                         'upload_date': None,
3595                         'title': video_title,
3596                         'stitle': _simplify_title(video_title),
3597                         'ext': extension,
3598                         'format': extension, # Extension is always(?) mp4, but seems to be flv
3599                         'thumbnail': None,
3600                         'description': video_description,
3601                         'player_url': None,
3602                 }
3603
3604                 try:
3605                         self._downloader.process_info(info)
3606                 except UnavailableVideoError, err:
3607                         self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3608
3609
3610
class PostProcessor(object):
	"""Base class for all post-processing steps.

	A PostProcessor is attached to a downloader via the downloader's
	add_post_processor() method.  After every successful download the
	downloader walks its chain of post processors, feeding the first one
	the download information and each subsequent one the value returned
	by its predecessor.  The chain stops when a post processor returns
	None or the end of the chain is reached.

	Like InfoExtractor objects, post processors follow a "mutual
	registration" scheme and keep a reference back to the downloader
	they are registered with.
	"""

	# Downloader instance this post processor reports to.
	_downloader = None

	def __init__(self, downloader=None):
		self.set_downloader(downloader)

	def set_downloader(self, downloader):
		"""Attach this post processor to *downloader*."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file.

		*information* is a dictionary like the ones composed by
		InfoExtractors, with one extra key, "filepath", pointing at the
		downloaded file.

		Return None to stop the post-processing chain, or an (optionally
		modified) information dictionary to pass to the next post
		processor.  May raise a PostProcessingError, which the calling
		downloader takes into account.
		"""
		return information # default implementation: pass through unchanged
3656
3657
class FFmpegExtractAudioPP(PostProcessor):
	"""Post processor that converts a downloaded video into an audio-only file.

	Requires the external ffmpeg and ffprobe binaries to be on PATH.
	"""

	def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec # 'best', 'aac', 'mp3' or 'vorbis'
		self._preferredquality = preferredquality # ffmpeg bitrate spec, e.g. '128K'
		self._keepvideo = keepvideo # keep the source video file after extraction

	@staticmethod
	def get_audio_codec(path):
		"""Return the audio codec name of the file at *path*, or None on failure.

		Runs 'ffprobe -show_streams' and scans its output.  Within each
		stream block codec_name precedes codec_type, so the last seen
		codec_name is returned once a 'codec_type=audio' line confirms it.
		"""
		dev_null = None
		try:
			try:
				# open() instead of the py2-only file(); close it afterwards
				# so repeated runs do not leak file descriptors.
				dev_null = open(os.path.devnull, 'w')
				cmd = ['ffprobe', '-show_streams', '--', path]
				handle = subprocess.Popen(cmd, stderr=dev_null, stdout=subprocess.PIPE)
				output = handle.communicate()[0]
				if handle.wait() != 0:
					return None
			except (IOError, OSError):
				return None
		finally:
			if dev_null is not None:
				dev_null.close()
		audio_codec = None
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Transcode *path* to *out_path* with the given audio codec.

		Returns True on success, False if ffmpeg failed or could not be run.
		"""
		dev_null = None
		try:
			try:
				# See get_audio_codec: open/close devnull explicitly to
				# avoid leaking a file descriptor per invocation.
				dev_null = open(os.path.devnull, 'w')
				cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
				ret = subprocess.call(cmd, stdout=dev_null, stderr=subprocess.STDOUT)
				return (ret == 0)
			except (IOError, OSError):
				return False
		finally:
			if dev_null is not None:
				dev_null.close()

	def run(self, information):
		"""Extract the audio track from information['filepath'].

		Copies the stream losslessly when the source codec already matches
		the preference (or preference is 'best'), otherwise transcodes.
		Returns the updated information dict, or None on failure.
		"""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
			if filecodec in ['aac', 'mp3', 'vorbis']:
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					more_opts = ['-f', 'adts']
				if filecodec == 'vorbis':
					extension = 'ogg'
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = []
				if self._preferredquality is not None:
					more_opts += ['-ab', self._preferredquality]
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'vorbis': 'libvorbis'}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = []
			if self._preferredquality is not None:
				more_opts += ['-ab', self._preferredquality]
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']
			if self._preferredcodec == 'vorbis':
				extension = 'ogg'

		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		status = self.run_ffmpeg(path, new_path, acodec, more_opts)

		if not status:
			self._downloader.to_stderr(u'WARNING: error running ffmpeg')
			return None

		# Try to update the date time for extracted audio file.
		if information.get('filetime') is not None:
			try:
				os.utime(new_path, (time.time(), information['filetime']))
			except:
				self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

		if not self._keepvideo:
			try:
				os.remove(path)
			except (IOError, OSError):
				self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
				return None

		information['filepath'] = new_path
		return information
3757
3758
def updateSelf(downloader, filename):
	'''Update the program file with the latest version from the repository.

	downloader is only used for its screen-output options; filename is the
	path of the running script, which must be writable.  Exits the process
	on any failure.
	'''
	if not os.access(filename, os.W_OK):
		sys.exit('ERROR: no write permissions on %s' % filename)

	downloader.to_screen('Updating to latest version...')

	urlh = None
	try:
		try:
			urlh = urllib.urlopen(UPDATE_URL)
			newcontent = urlh.read()

			vmatch = re.search("__version__ = '([^']+)'", newcontent)
			if vmatch is not None and vmatch.group(1) == __version__:
				downloader.to_screen('youtube-dl is up-to-date (' + __version__ + ')')
				return
		finally:
			# Guard the close: if urlopen itself raised, urlh was never
			# bound and an unguarded close() would raise NameError here,
			# masking the original download error.
			if urlh is not None:
				urlh.close()
	except (IOError, OSError):
		sys.exit('ERROR: unable to download latest version')

	try:
		outf = open(filename, 'wb')
		try:
			outf.write(newcontent)
		finally:
			outf.close()
	except (IOError, OSError):
		sys.exit('ERROR: unable to overwrite current version')

	downloader.to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.')
3791
def parseOpts():
	"""Build the option parser, parse sys.argv and return (parser, opts, args).

	The parser itself is returned so callers can report option errors
	through parser.error().
	"""
	# Deferred imports
	import getpass
	import optparse

	def _format_option_string(option):
		''' ('-o', '--option') -> -o, --format METAVAR'''

		opts = []

		if option._short_opts: opts.append(option._short_opts[0])
		if option._long_opts: opts.append(option._long_opts[0])
		if len(opts) > 1: opts.insert(1, ', ')

		if option.takes_value(): opts.append(' %s' % option.metavar)

		return "".join(opts)

	def _find_term_columns():
		'''Best-effort terminal width: COLUMNS env var, then `stty size`, else None.'''
		columns = os.environ.get('COLUMNS', None)
		if columns:
			return int(columns)

		try:
			sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			out,err = sp.communicate()
			return int(out.split()[1])
		except:
			pass
		return None

	max_width = 80
	max_help_position = 80

	# No need to wrap help messages if we're on a wide console
	columns = _find_term_columns()
	if columns: max_width = columns

	fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
	fmt.format_option_strings = _format_option_string

	kw = {
		'version'   : __version__,
		'formatter' : fmt,
		'usage' : '%prog [options] url [url...]',
		'conflict_handler' : 'resolve',
	}

	parser = optparse.OptionParser(**kw)

	# option groups
	general        = optparse.OptionGroup(parser, 'General Options')
	selection      = optparse.OptionGroup(parser, 'Video Selection')
	authentication = optparse.OptionGroup(parser, 'Authentication Options')
	video_format   = optparse.OptionGroup(parser, 'Video Format Options')
	postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
	filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
	verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

	general.add_option('-h', '--help',
			action='help', help='print this help text and exit')
	general.add_option('-v', '--version',
			action='version', help='print program version and exit')
	general.add_option('-U', '--update',
			action='store_true', dest='update_self', help='update this program to latest version')
	general.add_option('-i', '--ignore-errors',
			action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
	general.add_option('-r', '--rate-limit',
			dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
	general.add_option('-R', '--retries',
			dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
	general.add_option('--dump-user-agent',
			action='store_true', dest='dump_user_agent',
			help='display the current browser identification', default=False)
	general.add_option('--list-extractors',
			action='store_true', dest='list_extractors',
			help='List all supported extractors and the URLs they would handle', default=False)

	selection.add_option('--playlist-start',
			dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
	selection.add_option('--playlist-end',
			dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
	selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
	selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')

	authentication.add_option('-u', '--username',
			dest='username', metavar='USERNAME', help='account username')
	authentication.add_option('-p', '--password',
			dest='password', metavar='PASSWORD', help='account password')
	authentication.add_option('-n', '--netrc',
			action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)


	video_format.add_option('-f', '--format',
			action='store', dest='format', metavar='FORMAT', help='video format code')
	video_format.add_option('--all-formats',
			action='store_const', dest='format', help='download all available video formats', const='all')
	video_format.add_option('--max-quality',
			action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
	video_format.add_option('-F', '--list-formats',
			action='store_true', dest='listformats', help='list all available formats (currently youtube only)')


	verbosity.add_option('-q', '--quiet',
			action='store_true', dest='quiet', help='activates quiet mode', default=False)
	verbosity.add_option('-s', '--simulate',
			action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
	verbosity.add_option('--skip-download',
			action='store_true', dest='skip_download', help='do not download the video', default=False)
	verbosity.add_option('-g', '--get-url',
			action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
	verbosity.add_option('-e', '--get-title',
			action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
	verbosity.add_option('--get-thumbnail',
			action='store_true', dest='getthumbnail',
			help='simulate, quiet but print thumbnail URL', default=False)
	verbosity.add_option('--get-description',
			action='store_true', dest='getdescription',
			help='simulate, quiet but print video description', default=False)
	verbosity.add_option('--get-filename',
			action='store_true', dest='getfilename',
			help='simulate, quiet but print output filename', default=False)
	verbosity.add_option('--get-format',
			action='store_true', dest='getformat',
			help='simulate, quiet but print output format', default=False)
	verbosity.add_option('--no-progress',
			action='store_true', dest='noprogress', help='do not print progress bar', default=False)
	verbosity.add_option('--console-title',
			action='store_true', dest='consoletitle',
			help='display progress in console titlebar', default=False)


	filesystem.add_option('-t', '--title',
			action='store_true', dest='usetitle', help='use title in file name', default=False)
	filesystem.add_option('-l', '--literal',
			action='store_true', dest='useliteral', help='use literal title in file name', default=False)
	filesystem.add_option('-A', '--auto-number',
			action='store_true', dest='autonumber',
			help='number downloaded files starting from 00000', default=False)
	filesystem.add_option('-o', '--output',
			dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, and %% for a literal percent')
	filesystem.add_option('-a', '--batch-file',
			dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
	filesystem.add_option('-w', '--no-overwrites',
			action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
	filesystem.add_option('-c', '--continue',
			action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
	filesystem.add_option('--no-continue',
			action='store_false', dest='continue_dl',
			help='do not resume partially downloaded files (restart from beginning)')
	filesystem.add_option('--cookies',
			dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
	filesystem.add_option('--no-part',
			action='store_true', dest='nopart', help='do not use .part files', default=False)
	filesystem.add_option('--no-mtime',
			action='store_false', dest='updatetime',
			help='do not use the Last-modified header to set the file modification time', default=True)
	filesystem.add_option('--write-description',
			action='store_true', dest='writedescription',
			help='write video description to a .description file', default=False)
	filesystem.add_option('--write-info-json',
			action='store_true', dest='writeinfojson',
			help='write video metadata to a .info.json file', default=False)


	postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
			help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
	postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
			help='"best", "aac", "vorbis" or "mp3"; best by default')
	postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
			help='ffmpeg audio bitrate specification, 128k by default')
	postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
			help='keeps the video file on disk after the post-processing; the video is erased by default')


	parser.add_option_group(general)
	parser.add_option_group(selection)
	parser.add_option_group(filesystem)
	parser.add_option_group(verbosity)
	parser.add_option_group(video_format)
	parser.add_option_group(authentication)
	parser.add_option_group(postproc)

	opts, args = parser.parse_args()

	return parser, opts, args
3978
def gen_extractors():
	"""Instantiate one of every supported information extractor.

	Order matters: the first extractor whose suitable() accepts a URL is
	the one that handles it.
	"""
	# These instances are shared: the playlist/user/search extractors
	# delegate individual videos to the plain extractor they wrap.
	youtube = YoutubeIE()
	google = GoogleIE()
	yahoo = YahooIE()

	extractors = [
		YoutubePlaylistIE(youtube),
		YoutubeUserIE(youtube),
		YoutubeSearchIE(youtube),
		youtube,
		MetacafeIE(youtube),
		DailymotionIE(),
		google,
		GoogleSearchIE(google),
		PhotobucketIE(),
		yahoo,
		YahooSearchIE(yahoo),
		DepositFilesIE(),
		FacebookIE(),
		BlipTVIE(),
		VimeoIE(),
		MyVideoIE(),
		ComedyCentralIE(),
		EscapistIE(),
		CollegeHumorIE(),
		XVideosIE(),
		SoundcloudIE(),
		InfoQIE(),
	]
	# The generic extractor must come last: it accepts nearly any URL.
	extractors.append(GenericIE())
	return extractors
4012
4013 def _real_main():
4014         parser, opts, args = parseOpts()
4015
4016         # Open appropriate CookieJar
4017         if opts.cookiefile is None:
4018                 jar = cookielib.CookieJar()
4019         else:
4020                 try:
4021                         jar = cookielib.MozillaCookieJar(opts.cookiefile)
4022                         if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4023                                 jar.load()
4024                 except (IOError, OSError), err:
4025                         sys.exit(u'ERROR: unable to open cookie file')
4026
4027         # Dump user agent
4028         if opts.dump_user_agent:
4029                 print std_headers['User-Agent']
4030                 sys.exit(0)
4031
4032         # Batch file verification
4033         batchurls = []
4034         if opts.batchfile is not None:
4035                 try:
4036                         if opts.batchfile == '-':
4037                                 batchfd = sys.stdin
4038                         else:
4039                                 batchfd = open(opts.batchfile, 'r')
4040                         batchurls = batchfd.readlines()
4041                         batchurls = [x.strip() for x in batchurls]
4042                         batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4043                 except IOError:
4044                         sys.exit(u'ERROR: batch file could not be read')
4045         all_urls = batchurls + args
4046
4047         # General configuration
4048         cookie_processor = urllib2.HTTPCookieProcessor(jar)
4049         opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
4050         urllib2.install_opener(opener)
4051         socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
4052
4053         extractors = gen_extractors()
4054
4055         if opts.list_extractors:
4056                 for ie in extractors:
4057                         print(ie.IE_NAME)
4058                         matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
4059                         all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4060                         for mu in matchedUrls:
4061                                 print(u'  ' + mu)
4062                 sys.exit(0)
4063
4064         # Conflicting, missing and erroneous options
4065         if opts.usenetrc and (opts.username is not None or opts.password is not None):
4066                 parser.error(u'using .netrc conflicts with giving username/password')
4067         if opts.password is not None and opts.username is None:
4068                 parser.error(u'account username missing')
4069         if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4070                 parser.error(u'using output template conflicts with using title, literal title or auto number')
4071         if opts.usetitle and opts.useliteral:
4072                 parser.error(u'using title conflicts with using literal title')
4073         if opts.username is not None and opts.password is None:
4074                 opts.password = getpass.getpass(u'Type account password and press return:')
4075         if opts.ratelimit is not None:
4076                 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4077                 if numeric_limit is None:
4078                         parser.error(u'invalid rate limit specified')
4079                 opts.ratelimit = numeric_limit
4080         if opts.retries is not None:
4081                 try:
4082                         opts.retries = long(opts.retries)
4083                 except (TypeError, ValueError), err:
4084                         parser.error(u'invalid retry count specified')
4085         try:
4086                 opts.playliststart = int(opts.playliststart)
4087                 if opts.playliststart <= 0:
4088                         raise ValueError(u'Playlist start must be positive')
4089         except (TypeError, ValueError), err:
4090                 parser.error(u'invalid playlist start number specified')
4091         try:
4092                 opts.playlistend = int(opts.playlistend)
4093                 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
4094                         raise ValueError(u'Playlist end must be greater than playlist start')
4095         except (TypeError, ValueError), err:
4096                 parser.error(u'invalid playlist end number specified')
4097         if opts.extractaudio:
4098                 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis']:
4099                         parser.error(u'invalid audio format specified')
4100
4101         # File downloader
4102         fd = FileDownloader({
4103                 'usenetrc': opts.usenetrc,
4104                 'username': opts.username,
4105                 'password': opts.password,
4106                 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4107                 'forceurl': opts.geturl,
4108                 'forcetitle': opts.gettitle,
4109                 'forcethumbnail': opts.getthumbnail,
4110                 'forcedescription': opts.getdescription,
4111                 'forcefilename': opts.getfilename,
4112                 'forceformat': opts.getformat,
4113                 'simulate': opts.simulate,
4114                 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4115                 'format': opts.format,
4116                 'format_limit': opts.format_limit,
4117                 'listformats': opts.listformats,
4118                 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
4119                         or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4120                         or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
4121                         or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
4122                         or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4123                         or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4124                         or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
4125                         or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
4126                         or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
4127                         or u'%(id)s.%(ext)s'),
4128                 'ignoreerrors': opts.ignoreerrors,
4129                 'ratelimit': opts.ratelimit,
4130                 'nooverwrites': opts.nooverwrites,
4131                 'retries': opts.retries,
4132                 'continuedl': opts.continue_dl,
4133                 'noprogress': opts.noprogress,
4134                 'playliststart': opts.playliststart,
4135                 'playlistend': opts.playlistend,
4136                 'logtostderr': opts.outtmpl == '-',
4137                 'consoletitle': opts.consoletitle,
4138                 'nopart': opts.nopart,
4139                 'updatetime': opts.updatetime,
4140                 'writedescription': opts.writedescription,
4141                 'writeinfojson': opts.writeinfojson,
4142                 'matchtitle': opts.matchtitle,
4143                 'rejecttitle': opts.rejecttitle,
4144                 })
4145         for extractor in extractors:
4146                 fd.add_info_extractor(extractor)
4147
4148         # PostProcessors
4149         if opts.extractaudio:
4150                 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
4151
4152         # Update version
4153         if opts.update_self:
4154                 updateSelf(fd, sys.argv[0])
4155
4156         # Maybe do nothing
4157         if len(all_urls) < 1:
4158                 if not opts.update_self:
4159                         parser.error(u'you must provide at least one URL')
4160                 else:
4161                         sys.exit()
4162         retcode = fd.download(all_urls)
4163
4164         # Dump cookie jar if requested
4165         if opts.cookiefile is not None:
4166                 try:
4167                         jar.save()
4168                 except (IOError, OSError), err:
4169                         sys.exit(u'ERROR: unable to save cookie jar')
4170
4171         sys.exit(retcode)
4172
def main():
	"""Entry point: run _real_main() and translate known failures into exits.

	DownloadError exits with status 1 (the downloader already printed the
	details); SameFileError and Ctrl-C exit with an explanatory message.
	"""
	try:
		_real_main()
	except DownloadError:
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')
4182
# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
	main()

# vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: