Add a --max-quality flag to limit the highest quality (fixes issue #145)
[youtube-dl.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
7 import htmlentitydefs
8 import httplib
9 import locale
10 import math
11 import netrc
12 import os
13 import os.path
14 import re
15 import socket
16 import string
17 import subprocess
18 import sys
19 import time
20 import urllib
21 import urllib2
22
# parse_qs was moved from the cgi module to the urlparse module recently.
try:
	from urlparse import parse_qs
except ImportError:
	from cgi import parse_qs

# Default HTTP headers sent with every request; a desktop-browser
# User-Agent avoids being served mobile or degraded pages.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
	'Accept-Language': 'en-us,en;q=0.5',
}

# Characters considered "safe" when building simplified titles (letters and digits)
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
37
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	Falls back to UTF-8 when the reported encoding is unusable.
	"""
	try:
		pref = locale.getpreferredencoding()
		# Verify the codec actually exists and can encode text; some
		# platforms report bogus or unsupported encodings.
		u'TEST'.encode(pref)
	except Exception:
		# Narrowed from a bare "except:" so KeyboardInterrupt and
		# SystemExit are no longer swallowed. The previous version also
		# wrapped this logic in a needless one-shot generator
		# (yield_preferredencoding().next()); a plain return is equivalent.
		pref = 'UTF-8'
	return pref
53
54 def htmlentity_transform(matchobj):
55         """Transforms an HTML entity to a Unicode character.
56         
57         This function receives a match object and is intended to be used with
58         the re.sub() function.
59         """
60         entity = matchobj.group(1)
61
62         # Known non-numeric HTML entity
63         if entity in htmlentitydefs.name2codepoint:
64                 return unichr(htmlentitydefs.name2codepoint[entity])
65
66         # Unicode character
67         mobj = re.match(ur'(?u)#(x?\d+)', entity)
68         if mobj is not None:
69                 numstr = mobj.group(1)
70                 if numstr.startswith(u'x'):
71                         base = 16
72                         numstr = u'0%s' % numstr
73                 else:
74                         base = 10
75                 return unichr(long(numstr, base))
76
77         # Unknown entity in name, return its literal representation
78         return (u'&%s;' % entity)
79
80 def sanitize_title(utitle):
81         """Sanitizes a video title so it could be used as part of a filename."""
82         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
83         return utitle.replace(unicode(os.sep), u'%')
84
def sanitize_open(filename, open_mode):
	"""Try to open the given filename, and slightly tweak it if this fails.

	Attempts to open the given filename. If this fails, it tries to change
	the filename slightly, step by step, until it's either able to open it
	or it fails and raises a final exception, like the standard open()
	function.

	It returns the tuple (stream, definitive_file_name).
	"""
	# A lone dash means "write to standard output".
	if filename == u'-':
		return (sys.stdout, filename)
	try:
		return (open(filename, open_mode), filename)
	except (IOError, OSError):
		# In case of error, try to remove win32 forbidden chars
		cleaned = re.sub(u'[/<>:"\\|\\?\\*]', u'#', filename)
		# An exception here should be caught in the caller
		return (open(cleaned, open_mode), cleaned)
107
108
class DownloadError(Exception):
	"""Download Error exception.

	Raised by FileDownloader objects when a download problem occurs and
	they have not been configured to ignore errors. The exception message
	carries the relevant error description.
	"""
	pass
117
class SameFileError(Exception):
	"""Same File exception.

	Raised by FileDownloader objects upon detecting that several files
	would end up being written to the very same path on disk.
	"""
	pass
125
class PostProcessingError(Exception):
	"""Post Processing exception.

	A PostProcessor's .run() method may raise this exception to signal
	that something went wrong during the postprocessing task.
	"""
	pass
133
class UnavailableFormatError(Exception):
	"""Unavailable Format exception.

	Raised when the user requests a video in a format that the site does
	not offer for that particular video.
	"""
	pass
141
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	Raised by FileDownloader objects when the amount of data received is
	smaller than what the server announced first, which usually means the
	connection was interrupted.
	"""
	# Both counters are expressed in bytes
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		self.downloaded, self.expected = downloaded, expected
156
157 class FileDownloader(object):
158         """File Downloader class.
159
160         File downloader objects are the ones responsible of downloading the
161         actual video file and writing it to disk if the user has requested
162         it, among some other tasks. In most cases there should be one per
163         program. As, given a video URL, the downloader doesn't know how to
164         extract all the needed information, task that InfoExtractors do, it
165         has to pass the URL to one of them.
166
167         For this, file downloader objects have a method that allows
168         InfoExtractors to be registered in a given order. When it is passed
169         a URL, the file downloader handles it to the first InfoExtractor it
170         finds that reports being able to handle it. The InfoExtractor extracts
171         all the information about the video or videos the URL refers to, and
172         asks the FileDownloader to process the video information, possibly
173         downloading the video.
174
175         File downloaders accept a lot of parameters. In order not to saturate
176         the object constructor with arguments, it receives a dictionary of
177         options instead. These options are available through the params
178         attribute for the InfoExtractors to use. The FileDownloader also
179         registers itself as the downloader in charge for the InfoExtractors
180         that are added to it, so this is a "mutual registration".
181
182         Available options:
183
184         username:       Username for authentication purposes.
185         password:       Password for authentication purposes.
186         usenetrc:       Use netrc for authentication instead.
187         quiet:          Do not print messages to stdout.
188         forceurl:       Force printing final URL.
189         forcetitle:     Force printing title.
190         simulate:       Do not download the video files.
191         format:         Video format code.
192         format_limit:   Highest quality format to try.
193         outtmpl:        Template for output names.
194         ignoreerrors:   Do not stop on download errors.
195         ratelimit:      Download speed limit, in bytes/sec.
196         nooverwrites:   Prevent overwriting files.
197         retries:        Number of times to retry for HTTP error 503
198         continuedl:     Try to continue downloads if possible.
199         noprogress:     Do not print the progress bar.
200         """
201
202         params = None
203         _ies = []
204         _pps = []
205         _download_retcode = None
206         _num_downloads = None
207
208         def __init__(self, params):
209                 """Create a FileDownloader object with the given options."""
210                 self._ies = []
211                 self._pps = []
212                 self._download_retcode = 0
213                 self._num_downloads = 0
214                 self.params = params
215         
216         @staticmethod
217         def pmkdir(filename):
218                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
219                 components = filename.split(os.sep)
220                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
221                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
222                 for dir in aggregate:
223                         if not os.path.exists(dir):
224                                 os.mkdir(dir)
225         
226         @staticmethod
227         def format_bytes(bytes):
228                 if bytes is None:
229                         return 'N/A'
230                 if type(bytes) is str:
231                         bytes = float(bytes)
232                 if bytes == 0.0:
233                         exponent = 0
234                 else:
235                         exponent = long(math.log(bytes, 1024.0))
236                 suffix = 'bkMGTPEZY'[exponent]
237                 converted = float(bytes) / float(1024**exponent)
238                 return '%.2f%s' % (converted, suffix)
239
240         @staticmethod
241         def calc_percent(byte_counter, data_len):
242                 if data_len is None:
243                         return '---.-%'
244                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
245
246         @staticmethod
247         def calc_eta(start, now, total, current):
248                 if total is None:
249                         return '--:--'
250                 dif = now - start
251                 if current == 0 or dif < 0.001: # One millisecond
252                         return '--:--'
253                 rate = float(current) / dif
254                 eta = long((float(total) - float(current)) / rate)
255                 (eta_mins, eta_secs) = divmod(eta, 60)
256                 if eta_mins > 99:
257                         return '--:--'
258                 return '%02d:%02d' % (eta_mins, eta_secs)
259
260         @staticmethod
261         def calc_speed(start, now, bytes):
262                 dif = now - start
263                 if bytes == 0 or dif < 0.001: # One millisecond
264                         return '%10s' % '---b/s'
265                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
266
267         @staticmethod
268         def best_block_size(elapsed_time, bytes):
269                 new_min = max(bytes / 2.0, 1.0)
270                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
271                 if elapsed_time < 0.001:
272                         return long(new_max)
273                 rate = bytes / elapsed_time
274                 if rate > new_max:
275                         return long(new_max)
276                 if rate < new_min:
277                         return long(new_min)
278                 return long(rate)
279
280         @staticmethod
281         def parse_bytes(bytestr):
282                 """Parse a string indicating a byte quantity into a long integer."""
283                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
284                 if matchobj is None:
285                         return None
286                 number = float(matchobj.group(1))
287                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
288                 return long(round(number * multiplier))
289
290         @staticmethod
291         def verify_url(url):
292                 """Verify a URL is valid and data could be downloaded. Return real data URL."""
293                 request = urllib2.Request(url, None, std_headers)
294                 data = urllib2.urlopen(request)
295                 data.read(1)
296                 url = data.geturl()
297                 data.close()
298                 return url
299
300         def add_info_extractor(self, ie):
301                 """Add an InfoExtractor object to the end of the list."""
302                 self._ies.append(ie)
303                 ie.set_downloader(self)
304         
305         def add_post_processor(self, pp):
306                 """Add a PostProcessor object to the end of the chain."""
307                 self._pps.append(pp)
308                 pp.set_downloader(self)
309         
310         def to_stdout(self, message, skip_eol=False, ignore_encoding_errors=False):
311                 """Print message to stdout if not in quiet mode."""
312                 try:
313                         if not self.params.get('quiet', False):
314                                 print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
315                         sys.stdout.flush()
316                 except (UnicodeEncodeError), err:
317                         if not ignore_encoding_errors:
318                                 raise
319         
320         def to_stderr(self, message):
321                 """Print message to stderr."""
322                 print >>sys.stderr, message.encode(preferredencoding())
323         
324         def fixed_template(self):
325                 """Checks if the output template is fixed."""
326                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
327
328         def trouble(self, message=None):
329                 """Determine action to take when a download problem appears.
330
331                 Depending on if the downloader has been configured to ignore
332                 download errors or not, this method may throw an exception or
333                 not when errors are found, after printing the message.
334                 """
335                 if message is not None:
336                         self.to_stderr(message)
337                 if not self.params.get('ignoreerrors', False):
338                         raise DownloadError(message)
339                 self._download_retcode = 1
340
341         def slow_down(self, start_time, byte_counter):
342                 """Sleep if the download speed is over the rate limit."""
343                 rate_limit = self.params.get('ratelimit', None)
344                 if rate_limit is None or byte_counter == 0:
345                         return
346                 now = time.time()
347                 elapsed = now - start_time
348                 if elapsed <= 0.0:
349                         return
350                 speed = float(byte_counter) / elapsed
351                 if speed > rate_limit:
352                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
353
354         def report_destination(self, filename):
355                 """Report destination filename."""
356                 self.to_stdout(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
357         
358         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
359                 """Report download progress."""
360                 if self.params.get('noprogress', False):
361                         return
362                 self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
363                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
364
365         def report_resuming_byte(self, resume_len):
366                 """Report attemtp to resume at given byte."""
367                 self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)
368         
369         def report_retry(self, count, retries):
370                 """Report retry in case of HTTP error 503"""
371                 self.to_stdout(u'[download] Got HTTP error 503. Retrying (attempt %d of %d)...' % (count, retries))
372         
373         def report_file_already_downloaded(self, file_name):
374                 """Report file has already been fully downloaded."""
375                 try:
376                         self.to_stdout(u'[download] %s has already been downloaded' % file_name)
377                 except (UnicodeEncodeError), err:
378                         self.to_stdout(u'[download] The file has already been downloaded')
379         
380         def report_unable_to_resume(self):
381                 """Report it was impossible to resume download."""
382                 self.to_stdout(u'[download] Unable to resume')
383         
384         def report_finish(self):
385                 """Report download finished."""
386                 if self.params.get('noprogress', False):
387                         self.to_stdout(u'[download] Download completed')
388                 else:
389                         self.to_stdout(u'')
390
391         def process_info(self, info_dict):
392                 """Process a single dictionary returned by an InfoExtractor."""
393                 # Do nothing else if in simulate mode
394                 if self.params.get('simulate', False):
395                         # Verify URL if it's an HTTP one
396                         if info_dict['url'].startswith('http'):
397                                 try:
398                                         self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
399                                 except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
400                                         raise UnavailableFormatError
401
402                         # Forced printings
403                         if self.params.get('forcetitle', False):
404                                 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
405                         if self.params.get('forceurl', False):
406                                 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
407                         if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
408                                 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
409                         if self.params.get('forcedescription', False) and 'description' in info_dict:
410                                 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
411
412                         return
413                         
414                 try:
415                         template_dict = dict(info_dict)
416                         template_dict['epoch'] = unicode(long(time.time()))
417                         template_dict['ord'] = unicode('%05d' % self._num_downloads)
418                         filename = self.params['outtmpl'] % template_dict
419                 except (ValueError, KeyError), err:
420                         self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
421                 if self.params.get('nooverwrites', False) and os.path.exists(filename):
422                         self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
423                         return
424
425                 try:
426                         self.pmkdir(filename)
427                 except (OSError, IOError), err:
428                         self.trouble('ERROR: unable to create directories: %s' % str(err))
429                         return
430
431                 try:
432                         success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
433                 except (OSError, IOError), err:
434                         raise UnavailableFormatError
435                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
436                         self.trouble('ERROR: unable to download video data: %s' % str(err))
437                         return
438                 except (ContentTooShortError, ), err:
439                         self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
440                         return
441
442                 if success:
443                         try:
444                                 self.post_process(filename, info_dict)
445                         except (PostProcessingError), err:
446                                 self.trouble('ERROR: postprocessing: %s' % str(err))
447                                 return
448
449         def download(self, url_list):
450                 """Download a given list of URLs."""
451                 if len(url_list) > 1 and self.fixed_template():
452                         raise SameFileError(self.params['outtmpl'])
453
454                 for url in url_list:
455                         suitable_found = False
456                         for ie in self._ies:
457                                 # Go to next InfoExtractor if not suitable
458                                 if not ie.suitable(url):
459                                         continue
460
461                                 # Suitable InfoExtractor found
462                                 suitable_found = True
463
464                                 # Extract information from URL and process it
465                                 ie.extract(url)
466
467                                 # Suitable InfoExtractor had been found; go to next URL
468                                 break
469
470                         if not suitable_found:
471                                 self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
472
473                 return self._download_retcode
474
475         def post_process(self, filename, ie_info):
476                 """Run the postprocessing chain on the given file."""
477                 info = dict(ie_info)
478                 info['filepath'] = filename
479                 for pp in self._pps:
480                         info = pp.run(info)
481                         if info is None:
482                                 break
483         
484         def _download_with_rtmpdump(self, filename, url, player_url):
485                 self.report_destination(filename)
486
487                 # Check for rtmpdump first
488                 try:
489                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
490                 except (OSError, IOError):
491                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
492                         return False
493
494                 # Download using rtmpdump. rtmpdump returns exit code 2 when
495                 # the connection was interrumpted and resuming appears to be
496                 # possible. This is part of rtmpdump's normal usage, AFAIK.
497                 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', filename]
498                 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
499                 while retval == 2 or retval == 1:
500                         prevsize = os.path.getsize(filename)
501                         self.to_stdout(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
502                         time.sleep(5.0) # This seems to be needed
503                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
504                         cursize = os.path.getsize(filename)
505                         if prevsize == cursize and retval == 1:
506                                 break
507                 if retval == 0:
508                         self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
509                         return True
510                 else:
511                         self.trouble('\nERROR: rtmpdump exited with code %d' % retval)
512                         return False
513
514         def _do_download(self, filename, url, player_url):
515                 # Attempt to download using rtmpdump
516                 if url.startswith('rtmp'):
517                         return self._download_with_rtmpdump(filename, url, player_url)
518
519                 stream = None
520                 open_mode = 'wb'
521                 basic_request = urllib2.Request(url, None, std_headers)
522                 request = urllib2.Request(url, None, std_headers)
523
524                 # Establish possible resume length
525                 if os.path.isfile(filename):
526                         resume_len = os.path.getsize(filename)
527                 else:
528                         resume_len = 0
529
530                 # Request parameters in case of being able to resume
531                 if self.params.get('continuedl', False) and resume_len != 0:
532                         self.report_resuming_byte(resume_len)
533                         request.add_header('Range','bytes=%d-' % resume_len)
534                         open_mode = 'ab'
535
536                 count = 0
537                 retries = self.params.get('retries', 0)
538                 while True:
539                         # Establish connection
540                         try:
541                                 data = urllib2.urlopen(request)
542                                 break
543                         except (urllib2.HTTPError, ), err:
544                                 if err.code == 503:
545                                         # Retry in case of HTTP error 503
546                                         count += 1
547                                         if count <= retries:
548                                                 self.report_retry(count, retries)
549                                                 continue
550                                 if err.code != 416: #  416 is 'Requested range not satisfiable'
551                                         raise
552                                 # Unable to resume
553                                 data = urllib2.urlopen(basic_request)
554                                 content_length = data.info()['Content-Length']
555
556                                 if content_length is not None and long(content_length) == resume_len:
557                                         # Because the file had already been fully downloaded
558                                         self.report_file_already_downloaded(filename)
559                                         return True
560                                 else:
561                                         # Because the server didn't let us
562                                         self.report_unable_to_resume()
563                                         open_mode = 'wb'
564
565                 data_len = data.info().get('Content-length', None)
566                 data_len_str = self.format_bytes(data_len)
567                 byte_counter = 0
568                 block_size = 1024
569                 start = time.time()
570                 while True:
571                         # Download and write
572                         before = time.time()
573                         data_block = data.read(block_size)
574                         after = time.time()
575                         data_block_len = len(data_block)
576                         if data_block_len == 0:
577                                 break
578                         byte_counter += data_block_len
579
580                         # Open file just in time
581                         if stream is None:
582                                 try:
583                                         (stream, filename) = sanitize_open(filename, open_mode)
584                                         self.report_destination(filename)
585                                         self._num_downloads += 1
586                                 except (OSError, IOError), err:
587                                         self.trouble('ERROR: unable to open for writing: %s' % str(err))
588                                         return False
589                         try:
590                                 stream.write(data_block)
591                         except (IOError, OSError), err:
592                                 self.trouble('\nERROR: unable to write data: %s' % str(err))
593                         block_size = self.best_block_size(after - before, data_block_len)
594
595                         # Progress message
596                         percent_str = self.calc_percent(byte_counter, data_len)
597                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
598                         speed_str = self.calc_speed(start, time.time(), byte_counter)
599                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
600
601                         # Apply rate limit
602                         self.slow_down(start, byte_counter)
603
604                 self.report_finish()
605                 if data_len is not None and str(byte_counter) != data_len:
606                         raise ContentTooShortError(byte_counter, long(data_len))
607                 return True
608
class InfoExtractor(object):
	"""Information Extractor class.

	Information extractors are the classes that, given a URL, extract
	information from the video (or videos) the URL refers to. This
	information includes the real video URL, the video title and simplified
	title, author and others. The information is stored in a dictionary
	which is then passed to the FileDownloader. The FileDownloader
	processes this information possibly downloading the video to the file
	system, among other possible outcomes. The dictionaries must include
	the following fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	The following fields are optional. Their primary purpose is to allow
	youtube-dl to serve as the backend for a video search function, such
	as the one in youtube2mp3.  They are only used when their respective
	forced printing functions are called:

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Subclasses of this one should re-define the _real_initialize() and
	_real_extract() methods, as well as the suitable() static method.
	Probably, they should also be instantiated and added to the main
	downloader.
	"""

	_ready = False		# True once _real_initialize() has run
	_downloader = None	# FileDownloader in charge of this IE

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	@staticmethod
	def suitable(url):
		"""Receives a URL and returns True if suitable for this IE."""
		return False

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def initialize(self):
		"""Initializes an instance (authentication, etc)."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
679
680 class YoutubeIE(InfoExtractor):
681         """Information extractor for youtube.com."""
682
683         _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?[\?#](?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
684         _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
685         _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
686         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
687         _NETRC_MACHINE = 'youtube'
688         # Listed in order of priority for the -b option
689         _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13', None]
690         _video_extensions = {
691                 '13': '3gp',
692                 '17': 'mp4',
693                 '18': 'mp4',
694                 '22': 'mp4',
695                 '37': 'mp4',
696                 '38': 'video',
697                 '43': 'webm',
698                 '45': 'webm',
699         }
700
701         @staticmethod
702         def suitable(url):
703                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
704
705         def report_lang(self):
706                 """Report attempt to set language."""
707                 self._downloader.to_stdout(u'[youtube] Setting language')
708
709         def report_login(self):
710                 """Report attempt to log in."""
711                 self._downloader.to_stdout(u'[youtube] Logging in')
712         
713         def report_age_confirmation(self):
714                 """Report attempt to confirm age."""
715                 self._downloader.to_stdout(u'[youtube] Confirming age')
716         
717         def report_video_webpage_download(self, video_id):
718                 """Report attempt to download video webpage."""
719                 self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)
720         
721         def report_video_info_webpage_download(self, video_id):
722                 """Report attempt to download video info webpage."""
723                 self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)
724         
725         def report_information_extraction(self, video_id):
726                 """Report attempt to extract video information."""
727                 self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
728         
729         def report_unavailable_format(self, video_id, format):
730                 """Report extracted video URL."""
731                 self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))
732         
733         def report_rtmp_download(self):
734                 """Indicate the download will use the RTMP protocol."""
735                 self._downloader.to_stdout(u'[youtube] RTMP download detected')
736         
737         def _real_initialize(self):
738                 if self._downloader is None:
739                         return
740
741                 username = None
742                 password = None
743                 downloader_params = self._downloader.params
744
745                 # Attempt to use provided username and password or .netrc data
746                 if downloader_params.get('username', None) is not None:
747                         username = downloader_params['username']
748                         password = downloader_params['password']
749                 elif downloader_params.get('usenetrc', False):
750                         try:
751                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
752                                 if info is not None:
753                                         username = info[0]
754                                         password = info[2]
755                                 else:
756                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
757                         except (IOError, netrc.NetrcParseError), err:
758                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
759                                 return
760
761                 # Set language
762                 request = urllib2.Request(self._LANG_URL, None, std_headers)
763                 try:
764                         self.report_lang()
765                         urllib2.urlopen(request).read()
766                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
767                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
768                         return
769
770                 # No authentication to be performed
771                 if username is None:
772                         return
773
774                 # Log in
775                 login_form = {
776                                 'current_form': 'loginForm',
777                                 'next':         '/',
778                                 'action_login': 'Log In',
779                                 'username':     username,
780                                 'password':     password,
781                                 }
782                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
783                 try:
784                         self.report_login()
785                         login_results = urllib2.urlopen(request).read()
786                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
787                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
788                                 return
789                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
790                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
791                         return
792         
793                 # Confirm age
794                 age_form = {
795                                 'next_url':             '/',
796                                 'action_confirm':       'Confirm',
797                                 }
798                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
799                 try:
800                         self.report_age_confirmation()
801                         age_results = urllib2.urlopen(request).read()
802                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
803                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
804                         return
805
806         def _real_extract(self, url):
807                 # Extract video id from URL
808                 mobj = re.match(self._VALID_URL, url)
809                 if mobj is None:
810                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
811                         return
812                 video_id = mobj.group(2)
813
814                 # Downloader parameters
815                 best_quality = False
816                 all_formats = False
817                 format_param = None
818                 quality_index = 0
819                 if self._downloader is not None:
820                         params = self._downloader.params
821                         format_param = params.get('format', None)
822                         if format_param == '0':
823                                 format_limit = params.get('format_limit', None)
824                                 if format_limit is not None:
825                                         try:
826                                                 # Start at a different format if the user has limited the maximum quality
827                                                 quality_index = self._available_formats.index(format_limit)
828                                         except ValueError:
829                                                 pass
830                                 format_param = self._available_formats[quality_index]
831                                 best_quality = True
832                         elif format_param == '-1':
833                                 format_param = self._available_formats[quality_index]
834                                 all_formats = True
835
836                 while True:
837                         # Extension
838                         video_extension = self._video_extensions.get(format_param, 'flv')
839
840                         # Get video webpage
841                         self.report_video_webpage_download(video_id)
842                         request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id, None, std_headers)
843                         try:
844                                 video_webpage = urllib2.urlopen(request).read()
845                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
846                                 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
847                                 return
848
849                         # Attempt to extract SWF player URL
850                         mobj = re.search(r'swfConfig.*"(http://.*?watch-.*?\.swf)"', video_webpage)
851                         if mobj is not None:
852                                 player_url = mobj.group(1)
853                         else:
854                                 player_url = None
855
856                         # Get video info
857                         self.report_video_info_webpage_download(video_id)
858                         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
859                                 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
860                                                    % (video_id, el_type))
861                                 request = urllib2.Request(video_info_url, None, std_headers)
862                                 try:
863                                         video_info_webpage = urllib2.urlopen(request).read()
864                                         video_info = parse_qs(video_info_webpage)
865                                         if 'token' in video_info:
866                                                 break
867                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
868                                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
869                                         return
870                         self.report_information_extraction(video_id)
871
872                         # "t" param
873                         if 'token' not in video_info:
874                                 # Attempt to see if YouTube has issued an error message
875                                 if 'reason' not in video_info:
876                                         self._downloader.trouble(u'ERROR: unable to extract "t" parameter for unknown reason')
877                                         stream = open('reportme-ydl-%s.dat' % time.time(), 'wb')
878                                         stream.write(video_info_webpage)
879                                         stream.close()
880                                 else:
881                                         reason = urllib.unquote_plus(video_info['reason'][0])
882                                         self._downloader.trouble(u'ERROR: YouTube said: %s' % reason.decode('utf-8'))
883                                 return
884                         token = urllib.unquote_plus(video_info['token'][0])
885                         video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=detailpage&ps=default&gl=US&hl=en' % (video_id, token)
886                         if format_param is not None:
887                                 video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
888
889                         # Check possible RTMP download
890                         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
891                                 self.report_rtmp_download()
892                                 video_real_url = video_info['conn'][0]
893
894                         # uploader
895                         if 'author' not in video_info:
896                                 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
897                                 return
898                         video_uploader = urllib.unquote_plus(video_info['author'][0])
899
900                         # title
901                         if 'title' not in video_info:
902                                 self._downloader.trouble(u'ERROR: unable to extract video title')
903                                 return
904                         video_title = urllib.unquote_plus(video_info['title'][0])
905                         video_title = video_title.decode('utf-8')
906                         video_title = sanitize_title(video_title)
907
908                         # simplified title
909                         simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
910                         simple_title = simple_title.strip(ur'_')
911
912                         # thumbnail image
913                         if 'thumbnail_url' not in video_info:
914                                 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
915                                 video_thumbnail = ''
916                         else:   # don't panic if we can't find it
917                                 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
918
919                         # description
920                         video_description = 'No description available.'
921                         if self._downloader.params.get('forcedescription', False):
922                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
923                                 if mobj is not None:
924                                         video_description = mobj.group(1)
925
926                         try:
927                                 # Process video information
928                                 self._downloader.process_info({
929                                         'id':           video_id.decode('utf-8'),
930                                         'url':          video_real_url.decode('utf-8'),
931                                         'uploader':     video_uploader.decode('utf-8'),
932                                         'title':        video_title,
933                                         'stitle':       simple_title,
934                                         'ext':          video_extension.decode('utf-8'),
935                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
936                                         'thumbnail':    video_thumbnail.decode('utf-8'),
937                                         'description':  video_description.decode('utf-8'),
938                                         'player_url':   player_url,
939                                 })
940
941                                 if all_formats:
942                                         quality_index += 1
943                                         if quality_index == len(self._available_formats):
944                                                 # None left to get
945                                                 return
946                                         else:
947                                                 format_param = self._available_formats[quality_index]
948                                                 continue
949                                 return
950
951                         except UnavailableFormatError, err:
952                                 if best_quality or all_formats:
953                                         quality_index += 1
954                                         if quality_index == len(self._available_formats):
955                                                 # I don't ever expect this to happen
956                                                 if not all_formats:
957                                                         self._downloader.trouble(u'ERROR: no known formats available for video')
958                                                 return
959                                         else:
960                                                 self.report_unavailable_format(video_id, format_param)
961                                                 format_param = self._available_formats[quality_index]
962                                                 continue
963                                 else: 
964                                         self._downloader.trouble('ERROR: format not available for video')
965                                         return
966
967
968 class MetacafeIE(InfoExtractor):
969         """Information Extractor for metacafe.com."""
970
971         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
972         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
973         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
974         _youtube_ie = None
975
976         def __init__(self, youtube_ie, downloader=None):
977                 InfoExtractor.__init__(self, downloader)
978                 self._youtube_ie = youtube_ie
979
980         @staticmethod
981         def suitable(url):
982                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
983
984         def report_disclaimer(self):
985                 """Report disclaimer retrieval."""
986                 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
987
988         def report_age_confirmation(self):
989                 """Report attempt to confirm age."""
990                 self._downloader.to_stdout(u'[metacafe] Confirming age')
991         
992         def report_download_webpage(self, video_id):
993                 """Report webpage download."""
994                 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
995         
996         def report_extraction(self, video_id):
997                 """Report information extraction."""
998                 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
999
1000         def _real_initialize(self):
1001                 # Retrieve disclaimer
1002                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
1003                 try:
1004                         self.report_disclaimer()
1005                         disclaimer = urllib2.urlopen(request).read()
1006                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1007                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1008                         return
1009
1010                 # Confirm age
1011                 disclaimer_form = {
1012                         'filters': '0',
1013                         'submit': "Continue - I'm over 18",
1014                         }
1015                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
1016                 try:
1017                         self.report_age_confirmation()
1018                         disclaimer = urllib2.urlopen(request).read()
1019                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1020                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1021                         return
1022         
1023         def _real_extract(self, url):
1024                 # Extract id and simplified title from URL
1025                 mobj = re.match(self._VALID_URL, url)
1026                 if mobj is None:
1027                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1028                         return
1029
1030                 video_id = mobj.group(1)
1031
1032                 # Check if video comes from YouTube
1033                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1034                 if mobj2 is not None:
1035                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1036                         return
1037
1038                 simple_title = mobj.group(2).decode('utf-8')
1039                 video_extension = 'flv'
1040
1041                 # Retrieve video webpage to extract further information
1042                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1043                 try:
1044                         self.report_download_webpage(video_id)
1045                         webpage = urllib2.urlopen(request).read()
1046                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1047                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1048                         return
1049
1050                 # Extract URL, uploader and title from webpage
1051                 self.report_extraction(video_id)
1052                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1053                 if mobj is None:
1054                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1055                         return
1056                 mediaURL = urllib.unquote(mobj.group(1))
1057
1058                 #mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1059                 #if mobj is None:
1060                 #       self._downloader.trouble(u'ERROR: unable to extract gdaKey')
1061                 #       return
1062                 #gdaKey = mobj.group(1)
1063                 #
1064                 #video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1065
1066                 video_url = mediaURL
1067
1068                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1069                 if mobj is None:
1070                         self._downloader.trouble(u'ERROR: unable to extract title')
1071                         return
1072                 video_title = mobj.group(1).decode('utf-8')
1073                 video_title = sanitize_title(video_title)
1074
1075                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1076                 if mobj is None:
1077                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1078                         return
1079                 video_uploader = mobj.group(1)
1080
1081                 try:
1082                         # Process video information
1083                         self._downloader.process_info({
1084                                 'id':           video_id.decode('utf-8'),
1085                                 'url':          video_url.decode('utf-8'),
1086                                 'uploader':     video_uploader.decode('utf-8'),
1087                                 'title':        video_title,
1088                                 'stitle':       simple_title,
1089                                 'ext':          video_extension.decode('utf-8'),
1090                                 'format':       u'NA',
1091                                 'player_url':   None,
1092                         })
1093                 except UnavailableFormatError:
1094                         self._downloader.trouble(u'ERROR: format not available for video')
1095
1096
1097 class DailymotionIE(InfoExtractor):
1098         """Information Extractor for Dailymotion"""
1099
1100         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1101
1102         def __init__(self, downloader=None):
1103                 InfoExtractor.__init__(self, downloader)
1104
1105         @staticmethod
1106         def suitable(url):
1107                 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1108
1109         def report_download_webpage(self, video_id):
1110                 """Report webpage download."""
1111                 self._downloader.to_stdout(u'[dailymotion] %s: Downloading webpage' % video_id)
1112         
1113         def report_extraction(self, video_id):
1114                 """Report information extraction."""
1115                 self._downloader.to_stdout(u'[dailymotion] %s: Extracting information' % video_id)
1116
1117         def _real_initialize(self):
1118                 return
1119
1120         def _real_extract(self, url):
1121                 # Extract id and simplified title from URL
1122                 mobj = re.match(self._VALID_URL, url)
1123                 if mobj is None:
1124                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1125                         return
1126
1127                 video_id = mobj.group(1)
1128
1129                 simple_title = mobj.group(2).decode('utf-8')
1130                 video_extension = 'flv'
1131
1132                 # Retrieve video webpage to extract further information
1133                 request = urllib2.Request(url)
1134                 try:
1135                         self.report_download_webpage(video_id)
1136                         webpage = urllib2.urlopen(request).read()
1137                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1138                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1139                         return
1140
1141                 # Extract URL, uploader and title from webpage
1142                 self.report_extraction(video_id)
1143                 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1144                 if mobj is None:
1145                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1146                         return
1147                 mediaURL = urllib.unquote(mobj.group(1))
1148
1149                 # if needed add http://www.dailymotion.com/ if relative URL
1150
1151                 video_url = mediaURL
1152
1153                 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1154                 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1155                 if mobj is None:
1156                         self._downloader.trouble(u'ERROR: unable to extract title')
1157                         return
1158                 video_title = mobj.group(1).decode('utf-8')
1159                 video_title = sanitize_title(video_title)
1160
1161                 mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a></div>', webpage)
1162                 if mobj is None:
1163                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1164                         return
1165                 video_uploader = mobj.group(1)
1166
1167                 try:
1168                         # Process video information
1169                         self._downloader.process_info({
1170                                 'id':           video_id.decode('utf-8'),
1171                                 'url':          video_url.decode('utf-8'),
1172                                 'uploader':     video_uploader.decode('utf-8'),
1173                                 'title':        video_title,
1174                                 'stitle':       simple_title,
1175                                 'ext':          video_extension.decode('utf-8'),
1176                                 'format':       u'NA',
1177                                 'player_url':   None,
1178                         })
1179                 except UnavailableFormatError:
1180                         self._downloader.trouble(u'ERROR: format not available for video')
1181
1182 class GoogleIE(InfoExtractor):
1183         """Information extractor for video.google.com."""
1184
1185         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1186
1187         def __init__(self, downloader=None):
1188                 InfoExtractor.__init__(self, downloader)
1189
1190         @staticmethod
1191         def suitable(url):
1192                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1193
1194         def report_download_webpage(self, video_id):
1195                 """Report webpage download."""
1196                 self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)
1197
1198         def report_extraction(self, video_id):
1199                 """Report information extraction."""
1200                 self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)
1201
1202         def _real_initialize(self):
1203                 return
1204
1205         def _real_extract(self, url):
1206                 # Extract id from URL
1207                 mobj = re.match(self._VALID_URL, url)
1208                 if mobj is None:
1209                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1210                         return
1211
1212                 video_id = mobj.group(1)
1213
1214                 video_extension = 'mp4'
1215
1216                 # Retrieve video webpage to extract further information
1217                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1218                 try:
1219                         self.report_download_webpage(video_id)
1220                         webpage = urllib2.urlopen(request).read()
1221                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1222                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1223                         return
1224
1225                 # Extract URL, uploader, and title from webpage
1226                 self.report_extraction(video_id)
1227                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1228                 if mobj is None:
1229                         video_extension = 'flv'
1230                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1231                 if mobj is None:
1232                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1233                         return
1234                 mediaURL = urllib.unquote(mobj.group(1))
1235                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1236                 mediaURL = mediaURL.replace('\\x26', '\x26')
1237
1238                 video_url = mediaURL
1239
1240                 mobj = re.search(r'<title>(.*)</title>', webpage)
1241                 if mobj is None:
1242                         self._downloader.trouble(u'ERROR: unable to extract title')
1243                         return
1244                 video_title = mobj.group(1).decode('utf-8')
1245                 video_title = sanitize_title(video_title)
1246                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1247
1248                 # Extract video description
1249                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1250                 if mobj is None:
1251                         self._downloader.trouble(u'ERROR: unable to extract video description')
1252                         return
1253                 video_description = mobj.group(1).decode('utf-8')
1254                 if not video_description:
1255                         video_description = 'No description available.'
1256
1257                 # Extract video thumbnail
1258                 if self._downloader.params.get('forcethumbnail', False):
1259                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1260                         try:
1261                                 webpage = urllib2.urlopen(request).read()
1262                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1263                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1264                                 return
1265                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1266                         if mobj is None:
1267                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1268                                 return
1269                         video_thumbnail = mobj.group(1)
1270                 else:   # we need something to pass to process_info
1271                         video_thumbnail = ''
1272
1273
1274                 try:
1275                         # Process video information
1276                         self._downloader.process_info({
1277                                 'id':           video_id.decode('utf-8'),
1278                                 'url':          video_url.decode('utf-8'),
1279                                 'uploader':     u'NA',
1280                                 'title':        video_title,
1281                                 'stitle':       simple_title,
1282                                 'ext':          video_extension.decode('utf-8'),
1283                                 'format':       u'NA',
1284                                 'player_url':   None,
1285                         })
1286                 except UnavailableFormatError:
1287                         self._downloader.trouble(u'ERROR: format not available for video')
1288
1289
1290 class PhotobucketIE(InfoExtractor):
1291         """Information extractor for photobucket.com."""
1292
1293         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1294
1295         def __init__(self, downloader=None):
1296                 InfoExtractor.__init__(self, downloader)
1297
1298         @staticmethod
1299         def suitable(url):
1300                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1301
1302         def report_download_webpage(self, video_id):
1303                 """Report webpage download."""
1304                 self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)
1305
1306         def report_extraction(self, video_id):
1307                 """Report information extraction."""
1308                 self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)
1309
1310         def _real_initialize(self):
1311                 return
1312
1313         def _real_extract(self, url):
1314                 # Extract id from URL
1315                 mobj = re.match(self._VALID_URL, url)
1316                 if mobj is None:
1317                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1318                         return
1319
1320                 video_id = mobj.group(1)
1321
1322                 video_extension = 'flv'
1323
1324                 # Retrieve video webpage to extract further information
1325                 request = urllib2.Request(url)
1326                 try:
1327                         self.report_download_webpage(video_id)
1328                         webpage = urllib2.urlopen(request).read()
1329                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1330                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1331                         return
1332
1333                 # Extract URL, uploader, and title from webpage
1334                 self.report_extraction(video_id)
1335                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1336                 if mobj is None:
1337                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1338                         return
1339                 mediaURL = urllib.unquote(mobj.group(1))
1340
1341                 video_url = mediaURL
1342
1343                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1344                 if mobj is None:
1345                         self._downloader.trouble(u'ERROR: unable to extract title')
1346                         return
1347                 video_title = mobj.group(1).decode('utf-8')
1348                 video_title = sanitize_title(video_title)
1349                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1350
1351                 video_uploader = mobj.group(2).decode('utf-8')
1352
1353                 try:
1354                         # Process video information
1355                         self._downloader.process_info({
1356                                 'id':           video_id.decode('utf-8'),
1357                                 'url':          video_url.decode('utf-8'),
1358                                 'uploader':     video_uploader,
1359                                 'title':        video_title,
1360                                 'stitle':       simple_title,
1361                                 'ext':          video_extension.decode('utf-8'),
1362                                 'format':       u'NA',
1363                                 'player_url':   None,
1364                         })
1365                 except UnavailableFormatError:
1366                         self._downloader.trouble(u'ERROR: format not available for video')
1367
1368
1369 class YahooIE(InfoExtractor):
1370         """Information extractor for video.yahoo.com."""
1371
1372         # _VALID_URL matches all Yahoo! Video URLs
1373         # _VPAGE_URL matches only the extractable '/watch/' URLs
1374         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1375         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1376
1377         def __init__(self, downloader=None):
1378                 InfoExtractor.__init__(self, downloader)
1379
1380         @staticmethod
1381         def suitable(url):
1382                 return (re.match(YahooIE._VALID_URL, url) is not None)
1383
1384         def report_download_webpage(self, video_id):
1385                 """Report webpage download."""
1386                 self._downloader.to_stdout(u'[video.yahoo] %s: Downloading webpage' % video_id)
1387
1388         def report_extraction(self, video_id):
1389                 """Report information extraction."""
1390                 self._downloader.to_stdout(u'[video.yahoo] %s: Extracting information' % video_id)
1391
1392         def _real_initialize(self):
1393                 return
1394
1395         def _real_extract(self, url):
1396                 # Extract ID from URL
1397                 mobj = re.match(self._VALID_URL, url)
1398                 if mobj is None:
1399                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1400                         return
1401
1402                 video_id = mobj.group(2)
1403                 video_extension = 'flv'
1404
1405                 # Rewrite valid but non-extractable URLs as
1406                 # extractable English language /watch/ URLs
1407                 if re.match(self._VPAGE_URL, url) is None:
1408                         request = urllib2.Request(url)
1409                         try:
1410                                 webpage = urllib2.urlopen(request).read()
1411                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1412                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1413                                 return
1414
1415                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1416                         if mobj is None:
1417                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1418                                 return
1419                         yahoo_id = mobj.group(1)
1420
1421                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1422                         if mobj is None:
1423                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1424                                 return
1425                         yahoo_vid = mobj.group(1)
1426
1427                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1428                         return self._real_extract(url)
1429
1430                 # Retrieve video webpage to extract further information
1431                 request = urllib2.Request(url)
1432                 try:
1433                         self.report_download_webpage(video_id)
1434                         webpage = urllib2.urlopen(request).read()
1435                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1436                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1437                         return
1438
1439                 # Extract uploader and title from webpage
1440                 self.report_extraction(video_id)
1441                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1442                 if mobj is None:
1443                         self._downloader.trouble(u'ERROR: unable to extract video title')
1444                         return
1445                 video_title = mobj.group(1).decode('utf-8')
1446                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1447
1448                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1449                 if mobj is None:
1450                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1451                         return
1452                 video_uploader = mobj.group(1).decode('utf-8')
1453
1454                 # Extract video thumbnail
1455                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1456                 if mobj is None:
1457                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1458                         return
1459                 video_thumbnail = mobj.group(1).decode('utf-8')
1460
1461                 # Extract video description
1462                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1463                 if mobj is None:
1464                         self._downloader.trouble(u'ERROR: unable to extract video description')
1465                         return
1466                 video_description = mobj.group(1).decode('utf-8')
1467                 if not video_description: video_description = 'No description available.'
1468
1469                 # Extract video height and width
1470                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1471                 if mobj is None:
1472                         self._downloader.trouble(u'ERROR: unable to extract video height')
1473                         return
1474                 yv_video_height = mobj.group(1)
1475
1476                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1477                 if mobj is None:
1478                         self._downloader.trouble(u'ERROR: unable to extract video width')
1479                         return
1480                 yv_video_width = mobj.group(1)
1481
1482                 # Retrieve video playlist to extract media URL
1483                 # I'm not completely sure what all these options are, but we
1484                 # seem to need most of them, otherwise the server sends a 401.
1485                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1486                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1487                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1488                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1489                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1490                 try:
1491                         self.report_download_webpage(video_id)
1492                         webpage = urllib2.urlopen(request).read()
1493                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1494                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1495                         return
1496
1497                 # Extract media URL from playlist XML
1498                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1499                 if mobj is None:
1500                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1501                         return
1502                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1503                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1504
1505                 try:
1506                         # Process video information
1507                         self._downloader.process_info({
1508                                 'id':           video_id.decode('utf-8'),
1509                                 'url':          video_url,
1510                                 'uploader':     video_uploader,
1511                                 'title':        video_title,
1512                                 'stitle':       simple_title,
1513                                 'ext':          video_extension.decode('utf-8'),
1514                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1515                                 'description':  video_description,
1516                                 'thumbnail':    video_thumbnail,
1517                                 'description':  video_description,
1518                                 'player_url':   None,
1519                         })
1520                 except UnavailableFormatError:
1521                         self._downloader.trouble(u'ERROR: format not available for video')
1522
1523
1524 class GenericIE(InfoExtractor):
1525         """Generic last-resort information extractor."""
1526
1527         def __init__(self, downloader=None):
1528                 InfoExtractor.__init__(self, downloader)
1529
1530         @staticmethod
1531         def suitable(url):
1532                 return True
1533
1534         def report_download_webpage(self, video_id):
1535                 """Report webpage download."""
1536                 self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
1537                 self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)
1538
1539         def report_extraction(self, video_id):
1540                 """Report information extraction."""
1541                 self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)
1542
1543         def _real_initialize(self):
1544                 return
1545
1546         def _real_extract(self, url):
1547                 video_id = url.split('/')[-1]
1548                 request = urllib2.Request(url)
1549                 try:
1550                         self.report_download_webpage(video_id)
1551                         webpage = urllib2.urlopen(request).read()
1552                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1553                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1554                         return
1555                 except ValueError, err:
1556                         # since this is the last-resort InfoExtractor, if
1557                         # this error is thrown, it'll be thrown here
1558                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1559                         return
1560
1561                 # Start with something easy: JW Player in SWFObject
1562                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1563                 if mobj is None:
1564                         # Broaden the search a little bit
1565                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1566                 if mobj is None:
1567                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1568                         return
1569
1570                 # It's possible that one of the regexes
1571                 # matched, but returned an empty group:
1572                 if mobj.group(1) is None:
1573                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1574                         return
1575
1576                 video_url = urllib.unquote(mobj.group(1))
1577                 video_id  = os.path.basename(video_url)
1578
1579                 # here's a fun little line of code for you:
1580                 video_extension = os.path.splitext(video_id)[1][1:]
1581                 video_id        = os.path.splitext(video_id)[0]
1582
1583                 # it's tempting to parse this further, but you would
1584                 # have to take into account all the variations like
1585                 #   Video Title - Site Name
1586                 #   Site Name | Video Title
1587                 #   Video Title - Tagline | Site Name
1588                 # and so on and so forth; it's just not practical
1589                 mobj = re.search(r'<title>(.*)</title>', webpage)
1590                 if mobj is None:
1591                         self._downloader.trouble(u'ERROR: unable to extract title')
1592                         return
1593                 video_title = mobj.group(1).decode('utf-8')
1594                 video_title = sanitize_title(video_title)
1595                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1596
1597                 # video uploader is domain name
1598                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1599                 if mobj is None:
1600                         self._downloader.trouble(u'ERROR: unable to extract title')
1601                         return
1602                 video_uploader = mobj.group(1).decode('utf-8')
1603
1604                 try:
1605                         # Process video information
1606                         self._downloader.process_info({
1607                                 'id':           video_id.decode('utf-8'),
1608                                 'url':          video_url.decode('utf-8'),
1609                                 'uploader':     video_uploader,
1610                                 'title':        video_title,
1611                                 'stitle':       simple_title,
1612                                 'ext':          video_extension.decode('utf-8'),
1613                                 'format':       u'NA',
1614                                 'player_url':   None,
1615                         })
1616                 except UnavailableFormatError:
1617                         self._downloader.trouble(u'ERROR: format not available for video')
1618
1619
1620 class YoutubeSearchIE(InfoExtractor):
1621         """Information Extractor for YouTube search queries."""
1622         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1623         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1624         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1625         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1626         _youtube_ie = None
1627         _max_youtube_results = 1000
1628
1629         def __init__(self, youtube_ie, downloader=None):
1630                 InfoExtractor.__init__(self, downloader)
1631                 self._youtube_ie = youtube_ie
1632         
1633         @staticmethod
1634         def suitable(url):
1635                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1636
1637         def report_download_page(self, query, pagenum):
1638                 """Report attempt to download playlist page with given number."""
1639                 query = query.decode(preferredencoding())
1640                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1641
1642         def _real_initialize(self):
1643                 self._youtube_ie.initialize()
1644         
1645         def _real_extract(self, query):
1646                 mobj = re.match(self._VALID_QUERY, query)
1647                 if mobj is None:
1648                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1649                         return
1650
1651                 prefix, query = query.split(':')
1652                 prefix = prefix[8:]
1653                 query  = query.encode('utf-8')
1654                 if prefix == '':
1655                         self._download_n_results(query, 1)
1656                         return
1657                 elif prefix == 'all':
1658                         self._download_n_results(query, self._max_youtube_results)
1659                         return
1660                 else:
1661                         try:
1662                                 n = long(prefix)
1663                                 if n <= 0:
1664                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1665                                         return
1666                                 elif n > self._max_youtube_results:
1667                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
1668                                         n = self._max_youtube_results
1669                                 self._download_n_results(query, n)
1670                                 return
1671                         except ValueError: # parsing prefix as integer fails
1672                                 self._download_n_results(query, 1)
1673                                 return
1674
1675         def _download_n_results(self, query, n):
1676                 """Downloads a specified number of results for a query"""
1677
1678                 video_ids = []
1679                 already_seen = set()
1680                 pagenum = 1
1681
1682                 while True:
1683                         self.report_download_page(query, pagenum)
1684                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1685                         request = urllib2.Request(result_url, None, std_headers)
1686                         try:
1687                                 page = urllib2.urlopen(request).read()
1688                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1689                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1690                                 return
1691
1692                         # Extract video identifiers
1693                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1694                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1695                                 if video_id not in already_seen:
1696                                         video_ids.append(video_id)
1697                                         already_seen.add(video_id)
1698                                         if len(video_ids) == n:
1699                                                 # Specified n videos reached
1700                                                 for id in video_ids:
1701                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1702                                                 return
1703
1704                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1705                                 for id in video_ids:
1706                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1707                                 return
1708
1709                         pagenum = pagenum + 1
1710
1711 class GoogleSearchIE(InfoExtractor):
1712         """Information Extractor for Google Video search queries."""
1713         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1714         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1715         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1716         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1717         _google_ie = None
1718         _max_google_results = 1000
1719
1720         def __init__(self, google_ie, downloader=None):
1721                 InfoExtractor.__init__(self, downloader)
1722                 self._google_ie = google_ie
1723         
1724         @staticmethod
1725         def suitable(url):
1726                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1727
1728         def report_download_page(self, query, pagenum):
1729                 """Report attempt to download playlist page with given number."""
1730                 query = query.decode(preferredencoding())
1731                 self._downloader.to_stdout(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1732
1733         def _real_initialize(self):
1734                 self._google_ie.initialize()
1735         
1736         def _real_extract(self, query):
1737                 mobj = re.match(self._VALID_QUERY, query)
1738                 if mobj is None:
1739                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1740                         return
1741
1742                 prefix, query = query.split(':')
1743                 prefix = prefix[8:]
1744                 query  = query.encode('utf-8')
1745                 if prefix == '':
1746                         self._download_n_results(query, 1)
1747                         return
1748                 elif prefix == 'all':
1749                         self._download_n_results(query, self._max_google_results)
1750                         return
1751                 else:
1752                         try:
1753                                 n = long(prefix)
1754                                 if n <= 0:
1755                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1756                                         return
1757                                 elif n > self._max_google_results:
1758                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
1759                                         n = self._max_google_results
1760                                 self._download_n_results(query, n)
1761                                 return
1762                         except ValueError: # parsing prefix as integer fails
1763                                 self._download_n_results(query, 1)
1764                                 return
1765
1766         def _download_n_results(self, query, n):
1767                 """Downloads a specified number of results for a query"""
1768
1769                 video_ids = []
1770                 already_seen = set()
1771                 pagenum = 1
1772
1773                 while True:
1774                         self.report_download_page(query, pagenum)
1775                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1776                         request = urllib2.Request(result_url, None, std_headers)
1777                         try:
1778                                 page = urllib2.urlopen(request).read()
1779                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1780                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1781                                 return
1782
1783                         # Extract video identifiers
1784                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1785                                 video_id = mobj.group(1)
1786                                 if video_id not in already_seen:
1787                                         video_ids.append(video_id)
1788                                         already_seen.add(video_id)
1789                                         if len(video_ids) == n:
1790                                                 # Specified n videos reached
1791                                                 for id in video_ids:
1792                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1793                                                 return
1794
1795                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1796                                 for id in video_ids:
1797                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1798                                 return
1799
1800                         pagenum = pagenum + 1
1801
1802 class YahooSearchIE(InfoExtractor):
1803         """Information Extractor for Yahoo! Video search queries."""
1804         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1805         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1806         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1807         _MORE_PAGES_INDICATOR = r'\s*Next'
1808         _yahoo_ie = None
1809         _max_yahoo_results = 1000
1810
1811         def __init__(self, yahoo_ie, downloader=None):
1812                 InfoExtractor.__init__(self, downloader)
1813                 self._yahoo_ie = yahoo_ie
1814         
1815         @staticmethod
1816         def suitable(url):
1817                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1818
1819         def report_download_page(self, query, pagenum):
1820                 """Report attempt to download playlist page with given number."""
1821                 query = query.decode(preferredencoding())
1822                 self._downloader.to_stdout(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1823
1824         def _real_initialize(self):
1825                 self._yahoo_ie.initialize()
1826         
1827         def _real_extract(self, query):
1828                 mobj = re.match(self._VALID_QUERY, query)
1829                 if mobj is None:
1830                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1831                         return
1832
1833                 prefix, query = query.split(':')
1834                 prefix = prefix[8:]
1835                 query  = query.encode('utf-8')
1836                 if prefix == '':
1837                         self._download_n_results(query, 1)
1838                         return
1839                 elif prefix == 'all':
1840                         self._download_n_results(query, self._max_yahoo_results)
1841                         return
1842                 else:
1843                         try:
1844                                 n = long(prefix)
1845                                 if n <= 0:
1846                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1847                                         return
1848                                 elif n > self._max_yahoo_results:
1849                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
1850                                         n = self._max_yahoo_results
1851                                 self._download_n_results(query, n)
1852                                 return
1853                         except ValueError: # parsing prefix as integer fails
1854                                 self._download_n_results(query, 1)
1855                                 return
1856
1857         def _download_n_results(self, query, n):
1858                 """Downloads a specified number of results for a query"""
1859
1860                 video_ids = []
1861                 already_seen = set()
1862                 pagenum = 1
1863
1864                 while True:
1865                         self.report_download_page(query, pagenum)
1866                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1867                         request = urllib2.Request(result_url, None, std_headers)
1868                         try:
1869                                 page = urllib2.urlopen(request).read()
1870                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1871                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1872                                 return
1873
1874                         # Extract video identifiers
1875                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1876                                 video_id = mobj.group(1)
1877                                 if video_id not in already_seen:
1878                                         video_ids.append(video_id)
1879                                         already_seen.add(video_id)
1880                                         if len(video_ids) == n:
1881                                                 # Specified n videos reached
1882                                                 for id in video_ids:
1883                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1884                                                 return
1885
1886                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1887                                 for id in video_ids:
1888                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1889                                 return
1890
1891                         pagenum = pagenum + 1
1892
1893 class YoutubePlaylistIE(InfoExtractor):
1894         """Information Extractor for YouTube playlists."""
1895
1896         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1897         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1898         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1899         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1900         _youtube_ie = None
1901
1902         def __init__(self, youtube_ie, downloader=None):
1903                 InfoExtractor.__init__(self, downloader)
1904                 self._youtube_ie = youtube_ie
1905         
1906         @staticmethod
1907         def suitable(url):
1908                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1909
1910         def report_download_page(self, playlist_id, pagenum):
1911                 """Report attempt to download playlist page with given number."""
1912                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1913
1914         def _real_initialize(self):
1915                 self._youtube_ie.initialize()
1916         
1917         def _real_extract(self, url):
1918                 # Extract playlist id
1919                 mobj = re.match(self._VALID_URL, url)
1920                 if mobj is None:
1921                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1922                         return
1923
1924                 # Download playlist pages
1925                 playlist_id = mobj.group(1)
1926                 video_ids = []
1927                 pagenum = 1
1928
1929                 while True:
1930                         self.report_download_page(playlist_id, pagenum)
1931                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1932                         try:
1933                                 page = urllib2.urlopen(request).read()
1934                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1935                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1936                                 return
1937
1938                         # Extract video identifiers
1939                         ids_in_page = []
1940                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1941                                 if mobj.group(1) not in ids_in_page:
1942                                         ids_in_page.append(mobj.group(1))
1943                         video_ids.extend(ids_in_page)
1944
1945                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1946                                 break
1947                         pagenum = pagenum + 1
1948
1949                 for id in video_ids:
1950                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1951                 return
1952
1953 class YoutubeUserIE(InfoExtractor):
1954         """Information Extractor for YouTube users."""
1955
1956         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
1957         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1958         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
1959         _youtube_ie = None
1960
1961         def __init__(self, youtube_ie, downloader=None):
1962                 InfoExtractor.__init__(self, downloader)
1963                 self._youtube_ie = youtube_ie
1964         
1965         @staticmethod
1966         def suitable(url):
1967                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1968
1969         def report_download_page(self, username):
1970                 """Report attempt to download user page."""
1971                 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
1972
1973         def _real_initialize(self):
1974                 self._youtube_ie.initialize()
1975         
1976         def _real_extract(self, url):
1977                 # Extract username
1978                 mobj = re.match(self._VALID_URL, url)
1979                 if mobj is None:
1980                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1981                         return
1982
1983                 # Download user page
1984                 username = mobj.group(1)
1985                 video_ids = []
1986                 pagenum = 1
1987
1988                 self.report_download_page(username)
1989                 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
1990                 try:
1991                         page = urllib2.urlopen(request).read()
1992                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1993                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1994                         return
1995
1996                 # Extract video identifiers
1997                 ids_in_page = []
1998
1999                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2000                         if mobj.group(1) not in ids_in_page:
2001                                 ids_in_page.append(mobj.group(1))
2002                 video_ids.extend(ids_in_page)
2003
2004                 for id in video_ids:
2005                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2006                 return
2007
class PostProcessor(object):
	"""Base class for postprocessing steps.

	Instances are registered on a downloader through its
	add_post_processor() method. After every successful download the
	downloader calls run() on each registered object in order, feeding
	the dictionary returned by one processor into the next one.
	Processing stops as soon as a processor returns None or the end of
	the chain is reached.

	Like InfoExtractor objects, PostProcessors keep a reference back to
	the downloader that owns them ("mutual registration").
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach this postprocessor to the given downloader."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file.

		"information" is an InfoExtractor-style dictionary carrying an
		extra "filepath" key that names the downloaded file. Returning
		None aborts the remaining chain; returning a (possibly
		modified) dictionary passes it along to the next processor in
		the chain. A PostProcessingError may also be raised and will be
		handled by the downloader this object was called from.
		"""
		return information # the base class is a pass-through no-op
2053         
### MAIN PROGRAM ###
if __name__ == '__main__':
	try:
		# Modules needed only when running the main program
		import getpass
		import optparse

		# Function to update the program file with the latest version from bitbucket.org
		def update_self(downloader, filename):
			# Note: downloader only used for options
			if not os.access (filename, os.W_OK):
				sys.exit('ERROR: no write permissions on %s' % filename)

			downloader.to_stdout('Updating to latest stable version...')
			latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
			latest_version = urllib.urlopen(latest_url).read().strip()
			prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
			newcontent = urllib.urlopen(prog_url).read()
			# NOTE(review): the script is overwritten in place with no
			# integrity check on the downloaded content
			stream = open(filename, 'w')
			stream.write(newcontent)
			stream.close()
			downloader.to_stdout('Updated to version %s' % latest_version)

		# General configuration
		# NOTE(review): the second install_opener() call replaces the first,
		# so the ProxyHandler-based opener appears to be discarded --
		# confirm whether both handlers were meant to share one opener
		urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
		urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
		socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

		# Parse command line
		parser = optparse.OptionParser(
			usage='Usage: %prog [options] url...',
			version='2010.06.06',
			conflict_handler='resolve',
		)

		parser.add_option('-h', '--help',
				action='help', help='print this help text and exit')
		parser.add_option('-v', '--version',
				action='version', help='print program version and exit')
		parser.add_option('-U', '--update',
				action='store_true', dest='update_self', help='update this program to latest stable version')
		parser.add_option('-i', '--ignore-errors',
				action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
		parser.add_option('-r', '--rate-limit',
				dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
		parser.add_option('-R', '--retries',
				dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)

		authentication = optparse.OptionGroup(parser, 'Authentication Options')
		authentication.add_option('-u', '--username',
				dest='username', metavar='USERNAME', help='account username')
		authentication.add_option('-p', '--password',
				dest='password', metavar='PASSWORD', help='account password')
		authentication.add_option('-n', '--netrc',
				action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
		parser.add_option_group(authentication)

		# Note that -b/-m/-d/--all-formats all share dest='format', so the
		# last one given on the command line wins.
		video_format = optparse.OptionGroup(parser, 'Video Format Options')
		video_format.add_option('-f', '--format',
				action='store', dest='format', metavar='FORMAT', help='video format code')
		video_format.add_option('-b', '--best-quality',
				action='store_const', dest='format', help='download the best quality video possible', const='0')
		video_format.add_option('-m', '--mobile-version',
				action='store_const', dest='format', help='alias for -f 17', const='17')
		video_format.add_option('-d', '--high-def',
				action='store_const', dest='format', help='alias for -f 22', const='22')
		video_format.add_option('--all-formats',
				action='store_const', dest='format', help='download all available video formats', const='-1')
		# --max-quality caps the quality chosen by -b/--best-quality; the
		# value is handed to the FileDownloader as 'format_limit' below
		video_format.add_option('--max-quality',
				action='store', dest='format_limit', metavar='FORMAT', help='highest quality format limit for -b')
		parser.add_option_group(video_format)

		verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
		verbosity.add_option('-q', '--quiet',
				action='store_true', dest='quiet', help='activates quiet mode', default=False)
		verbosity.add_option('-s', '--simulate',
				action='store_true', dest='simulate', help='do not download video', default=False)
		verbosity.add_option('-g', '--get-url',
				action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
		verbosity.add_option('-e', '--get-title',
				action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
		verbosity.add_option('--get-thumbnail',
				action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
		verbosity.add_option('--get-description',
				action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
		verbosity.add_option('--no-progress',
				action='store_true', dest='noprogress', help='do not print progress bar', default=False)
		parser.add_option_group(verbosity)

		filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
		filesystem.add_option('-t', '--title',
				action='store_true', dest='usetitle', help='use title in file name', default=False)
		filesystem.add_option('-l', '--literal',
				action='store_true', dest='useliteral', help='use literal title in file name', default=False)
		filesystem.add_option('-o', '--output',
				dest='outtmpl', metavar='TEMPLATE', help='output filename template')
		filesystem.add_option('-a', '--batch-file',
				dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
		filesystem.add_option('-w', '--no-overwrites',
				action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
		filesystem.add_option('-c', '--continue',
				action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
		parser.add_option_group(filesystem)

		(opts, args) = parser.parse_args()

		# Batch file verification: read one URL per line, ignoring blanks
		batchurls = []
		if opts.batchfile is not None:
			try:
				if opts.batchfile == '-':
					batchfd = sys.stdin
				else:
					batchfd = open(opts.batchfile, 'r')
				batchurls = batchfd.readlines()
				batchurls = [x.strip() for x in batchurls]
				batchurls = [x for x in batchurls if len(x) > 0]
			except IOError:
				sys.exit(u'ERROR: batch file could not be read')
		all_urls = batchurls + args

		# Conflicting, missing and erroneous options
		if opts.usenetrc and (opts.username is not None or opts.password is not None):
			parser.error(u'using .netrc conflicts with giving username/password')
		if opts.password is not None and opts.username is None:
			parser.error(u'account username missing')
		if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
			parser.error(u'using output template conflicts with using title or literal title')
		if opts.usetitle and opts.useliteral:
			parser.error(u'using title conflicts with using literal title')
		if opts.username is not None and opts.password is None:
			opts.password = getpass.getpass(u'Type account password and press return:')
		if opts.ratelimit is not None:
			# Normalize "50k"/"44.6m"-style limits to a byte count
			numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
			if numeric_limit is None:
				parser.error(u'invalid rate limit specified')
			opts.ratelimit = numeric_limit
		if opts.retries is not None:
			try:
				opts.retries = long(opts.retries)
			except (TypeError, ValueError), err:
				parser.error(u'invalid retry count specified')

		# Information extractors
		youtube_ie = YoutubeIE()
		metacafe_ie = MetacafeIE(youtube_ie)
		dailymotion_ie = DailymotionIE()
		youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
		youtube_user_ie = YoutubeUserIE(youtube_ie)
		youtube_search_ie = YoutubeSearchIE(youtube_ie)
		google_ie = GoogleIE()
		google_search_ie = GoogleSearchIE(google_ie)
		photobucket_ie = PhotobucketIE()
		yahoo_ie = YahooIE()
		yahoo_search_ie = YahooSearchIE(yahoo_ie)
		generic_ie = GenericIE()

		# File downloader
		fd = FileDownloader({
			'usenetrc': opts.usenetrc,
			'username': opts.username,
			'password': opts.password,
			# any "get-*" option implies quiet + simulate
			'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
			'forceurl': opts.geturl,
			'forcetitle': opts.gettitle,
			'forcethumbnail': opts.getthumbnail,
			'forcedescription': opts.getdescription,
			'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
			'format': opts.format,
			'format_limit': opts.format_limit, # from the new --max-quality flag
			# Pick the first template that applies; the --all-formats (-1)
			# variants embed %(format)s so files do not overwrite each other
			'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
				or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
				or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
				or u'%(id)s.%(ext)s'),
			'ignoreerrors': opts.ignoreerrors,
			'ratelimit': opts.ratelimit,
			'nooverwrites': opts.nooverwrites,
			'retries': opts.retries,
			'continuedl': opts.continue_dl,
			'noprogress': opts.noprogress,
			})
		fd.add_info_extractor(youtube_search_ie)
		fd.add_info_extractor(youtube_pl_ie)
		fd.add_info_extractor(youtube_user_ie)
		fd.add_info_extractor(metacafe_ie)
		fd.add_info_extractor(dailymotion_ie)
		fd.add_info_extractor(youtube_ie)
		fd.add_info_extractor(google_ie)
		fd.add_info_extractor(google_search_ie)
		fd.add_info_extractor(photobucket_ie)
		fd.add_info_extractor(yahoo_ie)
		fd.add_info_extractor(yahoo_search_ie)

		# This must come last since it's the
		# fallback if none of the others work
		fd.add_info_extractor(generic_ie)

		# Update version
		if opts.update_self:
			update_self(fd, sys.argv[0])

		# Maybe do nothing
		if len(all_urls) < 1:
			if not opts.update_self:
				parser.error(u'you must provide at least one URL')
			else:
				sys.exit()
		retcode = fd.download(all_urls)
		sys.exit(retcode)

	except DownloadError:
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')