Include format 43 in best quality list (fixes issue #150)
[youtube-dl.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
7 import htmlentitydefs
8 import httplib
9 import locale
10 import math
11 import netrc
12 import os
13 import os.path
14 import re
15 import socket
16 import string
17 import subprocess
18 import sys
19 import time
20 import urllib
21 import urllib2
22
23 # parse_qs was moved from the cgi module to the urlparse module recently.
24 try:
25         from urlparse import parse_qs
26 except ImportError:
27         from cgi import parse_qs
28
# HTTP headers sent with every request. The User-Agent advertises a
# desktop Firefox build — presumably so servers serve the regular
# desktop pages rather than mobile/degraded ones (TODO confirm).
std_headers = {
	'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
	'Accept-Language': 'en-us,en;q=0.5',
}

# Characters kept in "simplified" titles: ASCII letters and digits, as a
# unicode string (Python 2, hence the .decode('ascii') calls).
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
37
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	Falls back to 'UTF-8' when the locale reports an encoding that
	cannot actually encode text.
	"""
	try:
		pref = locale.getpreferredencoding()
		# Sanity-check that the reported codec really exists/works.
		u'TEST'.encode(pref)
	except Exception:
		# Narrowed from a bare 'except:' so SystemExit/KeyboardInterrupt
		# are not swallowed. The original also routed the value through a
		# one-shot generator and .next() for no benefit; a plain return
		# is equivalent (and not Python-2-only).
		pref = 'UTF-8'
	return pref
53
54 def htmlentity_transform(matchobj):
55         """Transforms an HTML entity to a Unicode character.
56         
57         This function receives a match object and is intended to be used with
58         the re.sub() function.
59         """
60         entity = matchobj.group(1)
61
62         # Known non-numeric HTML entity
63         if entity in htmlentitydefs.name2codepoint:
64                 return unichr(htmlentitydefs.name2codepoint[entity])
65
66         # Unicode character
67         mobj = re.match(ur'(?u)#(x?\d+)', entity)
68         if mobj is not None:
69                 numstr = mobj.group(1)
70                 if numstr.startswith(u'x'):
71                         base = 16
72                         numstr = u'0%s' % numstr
73                 else:
74                         base = 10
75                 return unichr(long(numstr, base))
76
77         # Unknown entity in name, return its literal representation
78         return (u'&%s;' % entity)
79
80 def sanitize_title(utitle):
81         """Sanitizes a video title so it could be used as part of a filename."""
82         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
83         return utitle.replace(unicode(os.sep), u'%')
84
def sanitize_open(filename, open_mode):
	"""Try to open the given filename, and slightly tweak it if this fails.

	Attempts to open the given filename. If this fails, it tries to change
	the filename slightly, step by step, until it's either able to open it
	or it fails and raises a final exception, like the standard open()
	function.

	It returns the tuple (stream, definitive_file_name).
	"""
	try:
		if filename == u'-':
			# Special case: '-' means standard output, as in many Unix tools.
			return (sys.stdout, filename)
		stream = open(filename, open_mode)
		return (stream, filename)
	except (IOError, OSError), err:
		# In case of error, try to remove win32 forbidden chars
		# (replacing each with '#') and retry once.
		filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

		# An exception here should be caught in the caller
		stream = open(filename, open_mode)
		return (stream, filename)
107
108
class DownloadError(Exception):
	"""Download Error exception.

	Thrown by FileDownloader objects that are not configured to continue
	on errors; carries the appropriate error message.
	"""
	pass
117
class SameFileError(Exception):
	"""Same File exception.

	Thrown by FileDownloader objects when they detect that several of the
	requested downloads would end up in the same file on disk.
	"""
	pass
125
class PostProcessingError(Exception):
	"""Post Processing exception.

	May be raised from a PostProcessor's .run() method to signal that the
	postprocessing task failed.
	"""
	pass
133
class UnavailableFormatError(Exception):
	"""Unavailable Format exception.

	Thrown when a video is requested in a format that is not offered for
	that particular video.
	"""
	pass
141
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	Raised by FileDownloader objects when a downloaded file turns out to
	be smaller than what the server announced first, which usually means
	the connection was interrupted.
	"""
	# Byte counts: what was actually received vs. what was announced.
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		self.downloaded = downloaded
		self.expected = expected
156
class FileDownloader(object):
	"""File Downloader class.

	File downloader objects are the ones responsible of downloading the
	actual video file and writing it to disk if the user has requested
	it, among some other tasks. In most cases there should be one per
	program. As, given a video URL, the downloader doesn't know how to
	extract all the needed information, task that InfoExtractors do, it
	has to pass the URL to one of them.

	For this, file downloader objects have a method that allows
	InfoExtractors to be registered in a given order. When it is passed
	a URL, the file downloader handles it to the first InfoExtractor it
	finds that reports being able to handle it. The InfoExtractor extracts
	all the information about the video or videos the URL refers to, and
	asks the FileDownloader to process the video information, possibly
	downloading the video.

	File downloaders accept a lot of parameters. In order not to saturate
	the object constructor with arguments, it receives a dictionary of
	options instead. These options are available through the params
	attribute for the InfoExtractors to use. The FileDownloader also
	registers itself as the downloader in charge for the InfoExtractors
	that are added to it, so this is a "mutual registration".

	Available options:

	username:	Username for authentication purposes.
	password:	Password for authentication purposes.
	usenetrc:	Use netrc for authentication instead.
	quiet:		Do not print messages to stdout.
	forceurl:	Force printing final URL.
	forcetitle:	Force printing title.
	simulate:	Do not download the video files.
	format:		Video format code.
	outtmpl:	Template for output names.
	ignoreerrors:	Do not stop on download errors.
	ratelimit:	Download speed limit, in bytes/sec.
	nooverwrites:	Prevent overwriting files.
	retries:	Number of times to retry for HTTP error 503
	continuedl:	Try to continue downloads if possible.
	noprogress:	Do not print the progress bar.
	"""

	# Class-level placeholders; the real values are (re)set per instance
	# in __init__.
	params = None
	_ies = []
	_pps = []
	_download_retcode = None
	_num_downloads = None

	def __init__(self, params):
		"""Create a FileDownloader object with the given options."""
		self._ies = []
		self._pps = []
		self._download_retcode = 0
		self._num_downloads = 0
		self.params = params

	@staticmethod
	def pmkdir(filename):
		"""Create directory components in filename. Similar to Unix "mkdir -p"."""
		components = filename.split(os.sep)
		# Build the list of ancestor directories, shortest first, each
		# terminated with the separator; the last component (the file
		# itself) is deliberately excluded by the xrange upper bound.
		aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
		aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
		for dir in aggregate:
			if not os.path.exists(dir):
				os.mkdir(dir)

	@staticmethod
	def format_bytes(bytes):
		"""Return a human-readable string for a byte count, e.g. '1.25M'."""
		if bytes is None:
			return 'N/A'
		if type(bytes) is str:
			bytes = float(bytes)
		if bytes == 0.0:
			exponent = 0
		else:
			# Largest power of 1024 not exceeding the value.
			exponent = long(math.log(bytes, 1024.0))
		suffix = 'bkMGTPEZY'[exponent]
		converted = float(bytes) / float(1024**exponent)
		return '%.2f%s' % (converted, suffix)

	@staticmethod
	def calc_percent(byte_counter, data_len):
		"""Format download percentage; '---.-%' when the total is unknown."""
		if data_len is None:
			return '---.-%'
		return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

	@staticmethod
	def calc_eta(start, now, total, current):
		"""Estimate remaining time as 'MM:SS'; '--:--' when unknown or >99min."""
		if total is None:
			return '--:--'
		dif = now - start
		if current == 0 or dif < 0.001: # One millisecond
			return '--:--'
		rate = float(current) / dif
		eta = long((float(total) - float(current)) / rate)
		(eta_mins, eta_secs) = divmod(eta, 60)
		if eta_mins > 99:
			return '--:--'
		return '%02d:%02d' % (eta_mins, eta_secs)

	@staticmethod
	def calc_speed(start, now, bytes):
		"""Format the average download speed since 'start'."""
		dif = now - start
		if bytes == 0 or dif < 0.001: # One millisecond
			return '%10s' % '---b/s'
		return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

	@staticmethod
	def best_block_size(elapsed_time, bytes):
		"""Choose the next read block size, adapting to the measured rate."""
		# Never shrink/grow by more than a factor of 2 per read.
		new_min = max(bytes / 2.0, 1.0)
		new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
		if elapsed_time < 0.001:
			return long(new_max)
		rate = bytes / elapsed_time
		if rate > new_max:
			return long(new_max)
		if rate < new_min:
			return long(new_min)
		return long(rate)

	@staticmethod
	def parse_bytes(bytestr):
		"""Parse a string indicating a byte quantity into a long integer."""
		matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
		if matchobj is None:
			return None
		number = float(matchobj.group(1))
		# An empty suffix yields index 0 ('b'), i.e. a multiplier of 1,
		# because str.index('') == 0.
		multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
		return long(round(number * multiplier))

	@staticmethod
	def verify_url(url):
		"""Verify a URL is valid and data could be downloaded. Return real data URL."""
		request = urllib2.Request(url, None, std_headers)
		data = urllib2.urlopen(request)
		# Reading a single byte forces the transfer to actually start.
		data.read(1)
		url = data.geturl()
		data.close()
		return url

	def add_info_extractor(self, ie):
		"""Add an InfoExtractor object to the end of the list."""
		self._ies.append(ie)
		ie.set_downloader(self)

	def add_post_processor(self, pp):
		"""Add a PostProcessor object to the end of the chain."""
		self._pps.append(pp)
		pp.set_downloader(self)

	def to_stdout(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode."""
		try:
			if not self.params.get('quiet', False):
				# The trailing comma suppresses print's own newline; one is
				# appended manually unless skip_eol is set.
				print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
			sys.stdout.flush()
		except (UnicodeEncodeError), err:
			if not ignore_encoding_errors:
				raise

	def to_stderr(self, message):
		"""Print message to stderr."""
		print >>sys.stderr, message.encode(preferredencoding())

	def fixed_template(self):
		"""Checks if the output template is fixed (has no %(field)s holes)."""
		return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

	def trouble(self, message=None):
		"""Determine action to take when a download problem appears.

		Depending on if the downloader has been configured to ignore
		download errors or not, this method may throw an exception or
		not when errors are found, after printing the message.
		"""
		if message is not None:
			self.to_stderr(message)
		if not self.params.get('ignoreerrors', False):
			raise DownloadError(message)
		self._download_retcode = 1

	def slow_down(self, start_time, byte_counter):
		"""Sleep if the download speed is over the rate limit."""
		rate_limit = self.params.get('ratelimit', None)
		if rate_limit is None or byte_counter == 0:
			return
		now = time.time()
		elapsed = now - start_time
		if elapsed <= 0.0:
			return
		speed = float(byte_counter) / elapsed
		if speed > rate_limit:
			# Sleep just long enough for the average speed since
			# start_time to fall back to the configured limit.
			time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

	def report_destination(self, filename):
		"""Report destination filename."""
		self.to_stdout(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)

	def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
		"""Report download progress."""
		if self.params.get('noprogress', False):
			return
		self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
				(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

	def report_resuming_byte(self, resume_len):
		"""Report attempt to resume at given byte."""
		self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)

	def report_retry(self, count, retries):
		"""Report retry in case of HTTP error 503"""
		self.to_stdout(u'[download] Got HTTP error 503. Retrying (attempt %d of %d)...' % (count, retries))

	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		try:
			self.to_stdout(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			# Fall back to a generic message when the filename cannot be
			# encoded in the preferred encoding.
			self.to_stdout(u'[download] The file has already been downloaded')

	def report_unable_to_resume(self):
		"""Report it was impossible to resume download."""
		self.to_stdout(u'[download] Unable to resume')

	def report_finish(self):
		"""Report download finished."""
		if self.params.get('noprogress', False):
			self.to_stdout(u'[download] Download completed')
		else:
			# A bare newline terminates the \r-based progress line.
			self.to_stdout(u'')

	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor."""
		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			# Verify URL if it's an HTTP one
			if info_dict['url'].startswith('http'):
				try:
					self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
				except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
					raise UnavailableFormatError

			# Forced printings
			if self.params.get('forcetitle', False):
				print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forceurl', False):
				print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
				print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcedescription', False) and 'description' in info_dict:
				print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')

			return

		try:
			template_dict = dict(info_dict)
			template_dict['epoch'] = unicode(long(time.time()))
			template_dict['ord'] = unicode('%05d' % self._num_downloads)
			filename = self.params['outtmpl'] % template_dict
		except (ValueError, KeyError), err:
			self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
			# NOTE(review): if trouble() returns (ignoreerrors set),
			# 'filename' is unbound here and the code below raises
			# NameError — confirm and consider returning early.
		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
			return

		try:
			self.pmkdir(filename)
		except (OSError, IOError), err:
			self.trouble('ERROR: unable to create directories: %s' % str(err))
			return

		try:
			success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
		except (OSError, IOError), err:
			# Treated as "this format did not work"; the caller may try
			# another format.
			raise UnavailableFormatError
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self.trouble('ERROR: unable to download video data: %s' % str(err))
			return
		except (ContentTooShortError, ), err:
			self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
			return

		if success:
			try:
				self.post_process(filename, info_dict)
			except (PostProcessingError), err:
				self.trouble('ERROR: postprocessing: %s' % str(err))
				return

	def download(self, url_list):
		"""Download a given list of URLs."""
		if len(url_list) > 1 and self.fixed_template():
			# A fixed template would write every URL to the same file.
			raise SameFileError(self.params['outtmpl'])

		for url in url_list:
			suitable_found = False
			for ie in self._ies:
				# Go to next InfoExtractor if not suitable
				if not ie.suitable(url):
					continue

				# Suitable InfoExtractor found
				suitable_found = True

				# Extract information from URL and process it
				ie.extract(url)

				# Suitable InfoExtractor had been found; go to next URL
				break

			if not suitable_found:
				self.trouble('ERROR: no suitable InfoExtractor: %s' % url)

		return self._download_retcode

	def post_process(self, filename, ie_info):
		"""Run the postprocessing chain on the given file."""
		info = dict(ie_info)
		info['filepath'] = filename
		for pp in self._pps:
			info = pp.run(info)
			if info is None:
				# A postprocessor returning None stops the chain.
				break

	def _download_with_rtmpdump(self, filename, url, player_url):
		"""Download an RTMP stream by driving the external rtmpdump tool."""
		self.report_destination(filename)

		# Check for rtmpdump first
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrumpted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		# The [[], [...]][condition] idiom selects the extra arguments
		# only when the condition holds.
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', filename]
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(filename)
			self.to_stdout(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(filename)
			if prevsize == cursize and retval == 1:
				# No progress was made and rtmpdump still reports an
				# error: give up instead of looping forever.
				break
		if retval == 0:
			self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
			return True
		else:
			self.trouble('\nERROR: rtmpdump exited with code %d' % retval)
			return False

	def _do_download(self, filename, url, player_url):
		"""Download 'url' into 'filename'; return True on success.

		RTMP URLs are delegated to rtmpdump; everything else is fetched
		over HTTP with support for resuming partial files and retrying
		on HTTP error 503.
		"""
		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url, player_url)

		stream = None
		open_mode = 'wb'
		basic_request = urllib2.Request(url, None, std_headers)
		request = urllib2.Request(url, None, std_headers)

		# Establish possible resume length
		if os.path.isfile(filename):
			resume_len = os.path.getsize(filename)
		else:
			resume_len = 0

		# Request parameters in case of being able to resume
		if self.params.get('continuedl', False) and resume_len != 0:
			self.report_resuming_byte(resume_len)
			request.add_header('Range','bytes=%d-' % resume_len)
			open_mode = 'ab'

		count = 0
		retries = self.params.get('retries', 0)
		while True:
			# Establish connection
			try:
				data = urllib2.urlopen(request)
				break
			except (urllib2.HTTPError, ), err:
				if err.code == 503:
					# Retry in case of HTTP error 503
					count += 1
					if count <= retries:
						self.report_retry(count, retries)
						continue
				if err.code != 416: #  416 is 'Requested range not satisfiable'
					raise
				# Unable to resume
				data = urllib2.urlopen(basic_request)
				content_length = data.info()['Content-Length']

				if content_length is not None and long(content_length) == resume_len:
					# Because the file had already been fully downloaded
					self.report_file_already_downloaded(filename)
					return True
				else:
					# Because the server didn't let us
					self.report_unable_to_resume()
					open_mode = 'wb'
					# NOTE(review): no 'break' here, so the loop re-sends
					# the ranged request, which will presumably hit 416
					# again — this looks like a potential endless loop.
					# Confirm; breaking out with the basic_request
					# response may be the intended behaviour.

		data_len = data.info().get('Content-length', None)
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			data_block_len = len(data_block)
			if data_block_len == 0:
				break
			byte_counter += data_block_len

			# Open file just in time
			if stream is None:
				try:
					(stream, filename) = sanitize_open(filename, open_mode)
					self.report_destination(filename)
					self._num_downloads += 1
				except (OSError, IOError), err:
					self.trouble('ERROR: unable to open for writing: %s' % str(err))
					return False
			try:
				stream.write(data_block)
			except (IOError, OSError), err:
				self.trouble('\nERROR: unable to write data: %s' % str(err))
			# Adapt the block size to the observed transfer rate.
			block_size = self.best_block_size(after - before, data_block_len)

			# Progress message
			percent_str = self.calc_percent(byte_counter, data_len)
			eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
			speed_str = self.calc_speed(start, time.time(), byte_counter)
			self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter)

		self.report_finish()
		# data_len is the header value (a string); compare against the
		# stringified byte counter.
		if data_len is not None and str(byte_counter) != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		return True
607
608 class InfoExtractor(object):
609         """Information Extractor class.
610
611         Information extractors are the classes that, given a URL, extract
612         information from the video (or videos) the URL refers to. This
613         information includes the real video URL, the video title and simplified
614         title, author and others. The information is stored in a dictionary
615         which is then passed to the FileDownloader. The FileDownloader
616         processes this information possibly downloading the video to the file
617         system, among other possible outcomes. The dictionaries must include
618         the following fields:
619
620         id:             Video identifier.
621         url:            Final video URL.
622         uploader:       Nickname of the video uploader.
623         title:          Literal title.
624         stitle:         Simplified title.
625         ext:            Video filename extension.
626         format:         Video format.
627         player_url:     SWF Player URL (may be None).
628
629         The following fields are optional. Their primary purpose is to allow
630         youtube-dl to serve as the backend for a video search function, such
631         as the one in youtube2mp3.  They are only used when their respective
632         forced printing functions are called:
633
634         thumbnail:      Full URL to a video thumbnail image.
635         description:    One-line video description.
636
637         Subclasses of this one should re-define the _real_initialize() and
638         _real_extract() methods, as well as the suitable() static method.
639         Probably, they should also be instantiated and added to the main
640         downloader.
641         """
642
643         _ready = False
644         _downloader = None
645
646         def __init__(self, downloader=None):
647                 """Constructor. Receives an optional downloader."""
648                 self._ready = False
649                 self.set_downloader(downloader)
650
651         @staticmethod
652         def suitable(url):
653                 """Receives a URL and returns True if suitable for this IE."""
654                 return False
655
656         def initialize(self):
657                 """Initializes an instance (authentication, etc)."""
658                 if not self._ready:
659                         self._real_initialize()
660                         self._ready = True
661
662         def extract(self, url):
663                 """Extracts URL information and returns it in list of dicts."""
664                 self.initialize()
665                 return self._real_extract(url)
666
667         def set_downloader(self, downloader):
668                 """Sets the downloader for this IE."""
669                 self._downloader = downloader
670         
	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		# Intentionally a no-op in the base class; subclasses override this
		# hook when they need setup (e.g. YoutubeIE logs in here).
		pass
674
	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		# Intentionally a no-op in the base class; subclasses implement the
		# actual scraping and call self._downloader.process_info() here.
		pass
678
679 class YoutubeIE(InfoExtractor):
680         """Information extractor for youtube.com."""
681
682         _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?[\?#](?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
683         _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
684         _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
685         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
686         _NETRC_MACHINE = 'youtube'
687         # Listed in order of priority for the -b option
688         _available_formats = ['37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13', None]
689         _video_extensions = {
690                 '13': '3gp',
691                 '17': 'mp4',
692                 '18': 'mp4',
693                 '22': 'mp4',
694                 '37': 'mp4',
695                 '43': 'webm',
696                 '45': 'webm',
697         }
698
699         @staticmethod
700         def suitable(url):
701                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
702
703         def report_lang(self):
704                 """Report attempt to set language."""
705                 self._downloader.to_stdout(u'[youtube] Setting language')
706
707         def report_login(self):
708                 """Report attempt to log in."""
709                 self._downloader.to_stdout(u'[youtube] Logging in')
710         
711         def report_age_confirmation(self):
712                 """Report attempt to confirm age."""
713                 self._downloader.to_stdout(u'[youtube] Confirming age')
714         
715         def report_video_webpage_download(self, video_id):
716                 """Report attempt to download video webpage."""
717                 self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)
718         
719         def report_video_info_webpage_download(self, video_id):
720                 """Report attempt to download video info webpage."""
721                 self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)
722         
723         def report_information_extraction(self, video_id):
724                 """Report attempt to extract video information."""
725                 self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
726         
727         def report_unavailable_format(self, video_id, format):
728                 """Report extracted video URL."""
729                 self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))
730         
731         def report_rtmp_download(self):
732                 """Indicate the download will use the RTMP protocol."""
733                 self._downloader.to_stdout(u'[youtube] RTMP download detected')
734         
735         def _real_initialize(self):
736                 if self._downloader is None:
737                         return
738
739                 username = None
740                 password = None
741                 downloader_params = self._downloader.params
742
743                 # Attempt to use provided username and password or .netrc data
744                 if downloader_params.get('username', None) is not None:
745                         username = downloader_params['username']
746                         password = downloader_params['password']
747                 elif downloader_params.get('usenetrc', False):
748                         try:
749                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
750                                 if info is not None:
751                                         username = info[0]
752                                         password = info[2]
753                                 else:
754                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
755                         except (IOError, netrc.NetrcParseError), err:
756                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
757                                 return
758
759                 # Set language
760                 request = urllib2.Request(self._LANG_URL, None, std_headers)
761                 try:
762                         self.report_lang()
763                         urllib2.urlopen(request).read()
764                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
765                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
766                         return
767
768                 # No authentication to be performed
769                 if username is None:
770                         return
771
772                 # Log in
773                 login_form = {
774                                 'current_form': 'loginForm',
775                                 'next':         '/',
776                                 'action_login': 'Log In',
777                                 'username':     username,
778                                 'password':     password,
779                                 }
780                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
781                 try:
782                         self.report_login()
783                         login_results = urllib2.urlopen(request).read()
784                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
785                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
786                                 return
787                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
788                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
789                         return
790         
791                 # Confirm age
792                 age_form = {
793                                 'next_url':             '/',
794                                 'action_confirm':       'Confirm',
795                                 }
796                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
797                 try:
798                         self.report_age_confirmation()
799                         age_results = urllib2.urlopen(request).read()
800                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
801                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
802                         return
803
804         def _real_extract(self, url):
805                 # Extract video id from URL
806                 mobj = re.match(self._VALID_URL, url)
807                 if mobj is None:
808                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
809                         return
810                 video_id = mobj.group(2)
811
812                 # Downloader parameters
813                 best_quality = False
814                 all_formats = False
815                 format_param = None
816                 quality_index = 0
817                 if self._downloader is not None:
818                         params = self._downloader.params
819                         format_param = params.get('format', None)
820                         if format_param == '0':
821                                 format_param = self._available_formats[quality_index]
822                                 best_quality = True
823                         elif format_param == '-1':
824                                 format_param = self._available_formats[quality_index]
825                                 all_formats = True
826
827                 while True:
828                         # Extension
829                         video_extension = self._video_extensions.get(format_param, 'flv')
830
831                         # Get video webpage
832                         self.report_video_webpage_download(video_id)
833                         request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id, None, std_headers)
834                         try:
835                                 video_webpage = urllib2.urlopen(request).read()
836                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
837                                 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
838                                 return
839
840                         # Attempt to extract SWF player URL
841                         mobj = re.search(r'swfConfig.*"(http://.*?watch-.*?\.swf)"', video_webpage)
842                         if mobj is not None:
843                                 player_url = mobj.group(1)
844                         else:
845                                 player_url = None
846
847                         # Get video info
848                         self.report_video_info_webpage_download(video_id)
849                         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
850                                 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
851                                                    % (video_id, el_type))
852                                 request = urllib2.Request(video_info_url, None, std_headers)
853                                 try:
854                                         video_info_webpage = urllib2.urlopen(request).read()
855                                         video_info = parse_qs(video_info_webpage)
856                                         if 'token' in video_info:
857                                                 break
858                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
859                                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
860                                         return
861                         self.report_information_extraction(video_id)
862
863                         # "t" param
864                         if 'token' not in video_info:
865                                 # Attempt to see if YouTube has issued an error message
866                                 if 'reason' not in video_info:
867                                         self._downloader.trouble(u'ERROR: unable to extract "t" parameter for unknown reason')
868                                         stream = open('reportme-ydl-%s.dat' % time.time(), 'wb')
869                                         stream.write(video_info_webpage)
870                                         stream.close()
871                                 else:
872                                         reason = urllib.unquote_plus(video_info['reason'][0])
873                                         self._downloader.trouble(u'ERROR: YouTube said: %s' % reason.decode('utf-8'))
874                                 return
875                         token = urllib.unquote_plus(video_info['token'][0])
876                         video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=detailpage&ps=default&gl=US&hl=en' % (video_id, token)
877                         if format_param is not None:
878                                 video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
879
880                         # Check possible RTMP download
881                         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
882                                 self.report_rtmp_download()
883                                 video_real_url = video_info['conn'][0]
884
885                         # uploader
886                         if 'author' not in video_info:
887                                 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
888                                 return
889                         video_uploader = urllib.unquote_plus(video_info['author'][0])
890
891                         # title
892                         if 'title' not in video_info:
893                                 self._downloader.trouble(u'ERROR: unable to extract video title')
894                                 return
895                         video_title = urllib.unquote_plus(video_info['title'][0])
896                         video_title = video_title.decode('utf-8')
897                         video_title = sanitize_title(video_title)
898
899                         # simplified title
900                         simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
901                         simple_title = simple_title.strip(ur'_')
902
903                         # thumbnail image
904                         if 'thumbnail_url' not in video_info:
905                                 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
906                                 video_thumbnail = ''
907                         else:   # don't panic if we can't find it
908                                 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
909
910                         # description
911                         video_description = 'No description available.'
912                         if self._downloader.params.get('forcedescription', False):
913                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
914                                 if mobj is not None:
915                                         video_description = mobj.group(1)
916
917                         try:
918                                 # Process video information
919                                 self._downloader.process_info({
920                                         'id':           video_id.decode('utf-8'),
921                                         'url':          video_real_url.decode('utf-8'),
922                                         'uploader':     video_uploader.decode('utf-8'),
923                                         'title':        video_title,
924                                         'stitle':       simple_title,
925                                         'ext':          video_extension.decode('utf-8'),
926                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
927                                         'thumbnail':    video_thumbnail.decode('utf-8'),
928                                         'description':  video_description.decode('utf-8'),
929                                         'player_url':   player_url,
930                                 })
931
932                                 if all_formats:
933                                         quality_index += 1
934                                         if quality_index == len(self._available_formats):
935                                                 # None left to get
936                                                 return
937                                         else:
938                                                 format_param = self._available_formats[quality_index]
939                                                 continue
940                                 return
941
942                         except UnavailableFormatError, err:
943                                 if best_quality or all_formats:
944                                         quality_index += 1
945                                         if quality_index == len(self._available_formats):
946                                                 # I don't ever expect this to happen
947                                                 if not all_formats:
948                                                         self._downloader.trouble(u'ERROR: no known formats available for video')
949                                                 return
950                                         else:
951                                                 self.report_unavailable_format(video_id, format_param)
952                                                 format_param = self._available_formats[quality_index]
953                                                 continue
954                                 else: 
955                                         self._downloader.trouble('ERROR: format not available for video')
956                                         return
957
958
959 class MetacafeIE(InfoExtractor):
960         """Information Extractor for metacafe.com."""
961
962         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
963         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
964         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
965         _youtube_ie = None
966
967         def __init__(self, youtube_ie, downloader=None):
968                 InfoExtractor.__init__(self, downloader)
969                 self._youtube_ie = youtube_ie
970
971         @staticmethod
972         def suitable(url):
973                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
974
975         def report_disclaimer(self):
976                 """Report disclaimer retrieval."""
977                 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
978
979         def report_age_confirmation(self):
980                 """Report attempt to confirm age."""
981                 self._downloader.to_stdout(u'[metacafe] Confirming age')
982         
983         def report_download_webpage(self, video_id):
984                 """Report webpage download."""
985                 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
986         
987         def report_extraction(self, video_id):
988                 """Report information extraction."""
989                 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
990
991         def _real_initialize(self):
992                 # Retrieve disclaimer
993                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
994                 try:
995                         self.report_disclaimer()
996                         disclaimer = urllib2.urlopen(request).read()
997                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
998                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
999                         return
1000
1001                 # Confirm age
1002                 disclaimer_form = {
1003                         'filters': '0',
1004                         'submit': "Continue - I'm over 18",
1005                         }
1006                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
1007                 try:
1008                         self.report_age_confirmation()
1009                         disclaimer = urllib2.urlopen(request).read()
1010                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1011                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1012                         return
1013         
1014         def _real_extract(self, url):
1015                 # Extract id and simplified title from URL
1016                 mobj = re.match(self._VALID_URL, url)
1017                 if mobj is None:
1018                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1019                         return
1020
1021                 video_id = mobj.group(1)
1022
1023                 # Check if video comes from YouTube
1024                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1025                 if mobj2 is not None:
1026                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1027                         return
1028
1029                 simple_title = mobj.group(2).decode('utf-8')
1030                 video_extension = 'flv'
1031
1032                 # Retrieve video webpage to extract further information
1033                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1034                 try:
1035                         self.report_download_webpage(video_id)
1036                         webpage = urllib2.urlopen(request).read()
1037                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1038                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1039                         return
1040
1041                 # Extract URL, uploader and title from webpage
1042                 self.report_extraction(video_id)
1043                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1044                 if mobj is None:
1045                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1046                         return
1047                 mediaURL = urllib.unquote(mobj.group(1))
1048
1049                 #mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1050                 #if mobj is None:
1051                 #       self._downloader.trouble(u'ERROR: unable to extract gdaKey')
1052                 #       return
1053                 #gdaKey = mobj.group(1)
1054                 #
1055                 #video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1056
1057                 video_url = mediaURL
1058
1059                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1060                 if mobj is None:
1061                         self._downloader.trouble(u'ERROR: unable to extract title')
1062                         return
1063                 video_title = mobj.group(1).decode('utf-8')
1064                 video_title = sanitize_title(video_title)
1065
1066                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1067                 if mobj is None:
1068                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1069                         return
1070                 video_uploader = mobj.group(1)
1071
1072                 try:
1073                         # Process video information
1074                         self._downloader.process_info({
1075                                 'id':           video_id.decode('utf-8'),
1076                                 'url':          video_url.decode('utf-8'),
1077                                 'uploader':     video_uploader.decode('utf-8'),
1078                                 'title':        video_title,
1079                                 'stitle':       simple_title,
1080                                 'ext':          video_extension.decode('utf-8'),
1081                                 'format':       u'NA',
1082                                 'player_url':   None,
1083                         })
1084                 except UnavailableFormatError:
1085                         self._downloader.trouble(u'ERROR: format not available for video')
1086
1087
1088 class GoogleIE(InfoExtractor):
1089         """Information extractor for video.google.com."""
1090
1091         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1092
1093         def __init__(self, downloader=None):
1094                 InfoExtractor.__init__(self, downloader)
1095
1096         @staticmethod
1097         def suitable(url):
1098                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1099
1100         def report_download_webpage(self, video_id):
1101                 """Report webpage download."""
1102                 self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)
1103
1104         def report_extraction(self, video_id):
1105                 """Report information extraction."""
1106                 self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)
1107
1108         def _real_initialize(self):
1109                 return
1110
1111         def _real_extract(self, url):
1112                 # Extract id from URL
1113                 mobj = re.match(self._VALID_URL, url)
1114                 if mobj is None:
1115                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1116                         return
1117
1118                 video_id = mobj.group(1)
1119
1120                 video_extension = 'mp4'
1121
1122                 # Retrieve video webpage to extract further information
1123                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1124                 try:
1125                         self.report_download_webpage(video_id)
1126                         webpage = urllib2.urlopen(request).read()
1127                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1128                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1129                         return
1130
1131                 # Extract URL, uploader, and title from webpage
1132                 self.report_extraction(video_id)
1133                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1134                 if mobj is None:
1135                         video_extension = 'flv'
1136                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1137                 if mobj is None:
1138                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1139                         return
1140                 mediaURL = urllib.unquote(mobj.group(1))
1141                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1142                 mediaURL = mediaURL.replace('\\x26', '\x26')
1143
1144                 video_url = mediaURL
1145
1146                 mobj = re.search(r'<title>(.*)</title>', webpage)
1147                 if mobj is None:
1148                         self._downloader.trouble(u'ERROR: unable to extract title')
1149                         return
1150                 video_title = mobj.group(1).decode('utf-8')
1151                 video_title = sanitize_title(video_title)
1152                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1153
1154                 # Extract video description
1155                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1156                 if mobj is None:
1157                         self._downloader.trouble(u'ERROR: unable to extract video description')
1158                         return
1159                 video_description = mobj.group(1).decode('utf-8')
1160                 if not video_description:
1161                         video_description = 'No description available.'
1162
1163                 # Extract video thumbnail
1164                 if self._downloader.params.get('forcethumbnail', False):
1165                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1166                         try:
1167                                 webpage = urllib2.urlopen(request).read()
1168                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1169                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1170                                 return
1171                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1172                         if mobj is None:
1173                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1174                                 return
1175                         video_thumbnail = mobj.group(1)
1176                 else:   # we need something to pass to process_info
1177                         video_thumbnail = ''
1178
1179
1180                 try:
1181                         # Process video information
1182                         self._downloader.process_info({
1183                                 'id':           video_id.decode('utf-8'),
1184                                 'url':          video_url.decode('utf-8'),
1185                                 'uploader':     u'NA',
1186                                 'title':        video_title,
1187                                 'stitle':       simple_title,
1188                                 'ext':          video_extension.decode('utf-8'),
1189                                 'format':       u'NA',
1190                                 'player_url':   None,
1191                         })
1192                 except UnavailableFormatError:
1193                         self._downloader.trouble(u'ERROR: format not available for video')
1194
1195
class PhotobucketIE(InfoExtractor):
	"""Information extractor for photobucket.com."""

	# The "current" query argument names the .flv file; it is captured as the video id.
	_VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1200
	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		InfoExtractor.__init__(self, downloader)
1203
	@staticmethod
	def suitable(url):
		"""Receives a URL and returns True if suitable for this IE."""
		return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1207
	def report_download_webpage(self, video_id):
		"""Report webpage download (logged with the [photobucket] prefix)."""
		self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)
1211
	def report_extraction(self, video_id):
		"""Report information extraction (logged with the [photobucket] prefix)."""
		self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)
1215
	def _real_initialize(self):
		# No authentication or other setup is needed for Photobucket.
		return
1218
1219         def _real_extract(self, url):
1220                 # Extract id from URL
1221                 mobj = re.match(self._VALID_URL, url)
1222                 if mobj is None:
1223                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1224                         return
1225
1226                 video_id = mobj.group(1)
1227
1228                 video_extension = 'flv'
1229
1230                 # Retrieve video webpage to extract further information
1231                 request = urllib2.Request(url)
1232                 try:
1233                         self.report_download_webpage(video_id)
1234                         webpage = urllib2.urlopen(request).read()
1235                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1236                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1237                         return
1238
1239                 # Extract URL, uploader, and title from webpage
1240                 self.report_extraction(video_id)
1241                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1242                 if mobj is None:
1243                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1244                         return
1245                 mediaURL = urllib.unquote(mobj.group(1))
1246
1247                 video_url = mediaURL
1248
1249                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1250                 if mobj is None:
1251                         self._downloader.trouble(u'ERROR: unable to extract title')
1252                         return
1253                 video_title = mobj.group(1).decode('utf-8')
1254                 video_title = sanitize_title(video_title)
1255                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1256
1257                 video_uploader = mobj.group(2).decode('utf-8')
1258
1259                 try:
1260                         # Process video information
1261                         self._downloader.process_info({
1262                                 'id':           video_id.decode('utf-8'),
1263                                 'url':          video_url.decode('utf-8'),
1264                                 'uploader':     video_uploader,
1265                                 'title':        video_title,
1266                                 'stitle':       simple_title,
1267                                 'ext':          video_extension.decode('utf-8'),
1268                                 'format':       u'NA',
1269                                 'player_url':   None,
1270                         })
1271                 except UnavailableFormatError:
1272                         self._downloader.trouble(u'ERROR: format not available for video')
1273
1274
1275 class YahooIE(InfoExtractor):
1276         """Information extractor for video.yahoo.com."""
1277
1278         # _VALID_URL matches all Yahoo! Video URLs
1279         # _VPAGE_URL matches only the extractable '/watch/' URLs
1280         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1281         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1282
1283         def __init__(self, downloader=None):
1284                 InfoExtractor.__init__(self, downloader)
1285
1286         @staticmethod
1287         def suitable(url):
1288                 return (re.match(YahooIE._VALID_URL, url) is not None)
1289
1290         def report_download_webpage(self, video_id):
1291                 """Report webpage download."""
1292                 self._downloader.to_stdout(u'[video.yahoo] %s: Downloading webpage' % video_id)
1293
1294         def report_extraction(self, video_id):
1295                 """Report information extraction."""
1296                 self._downloader.to_stdout(u'[video.yahoo] %s: Extracting information' % video_id)
1297
1298         def _real_initialize(self):
1299                 return
1300
1301         def _real_extract(self, url):
1302                 # Extract ID from URL
1303                 mobj = re.match(self._VALID_URL, url)
1304                 if mobj is None:
1305                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1306                         return
1307
1308                 video_id = mobj.group(2)
1309                 video_extension = 'flv'
1310
1311                 # Rewrite valid but non-extractable URLs as
1312                 # extractable English language /watch/ URLs
1313                 if re.match(self._VPAGE_URL, url) is None:
1314                         request = urllib2.Request(url)
1315                         try:
1316                                 webpage = urllib2.urlopen(request).read()
1317                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1318                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1319                                 return
1320
1321                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1322                         if mobj is None:
1323                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1324                                 return
1325                         yahoo_id = mobj.group(1)
1326
1327                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1328                         if mobj is None:
1329                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1330                                 return
1331                         yahoo_vid = mobj.group(1)
1332
1333                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1334                         return self._real_extract(url)
1335
1336                 # Retrieve video webpage to extract further information
1337                 request = urllib2.Request(url)
1338                 try:
1339                         self.report_download_webpage(video_id)
1340                         webpage = urllib2.urlopen(request).read()
1341                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1342                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1343                         return
1344
1345                 # Extract uploader and title from webpage
1346                 self.report_extraction(video_id)
1347                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1348                 if mobj is None:
1349                         self._downloader.trouble(u'ERROR: unable to extract video title')
1350                         return
1351                 video_title = mobj.group(1).decode('utf-8')
1352                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1353
1354                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1355                 if mobj is None:
1356                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1357                         return
1358                 video_uploader = mobj.group(1).decode('utf-8')
1359
1360                 # Extract video thumbnail
1361                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1362                 if mobj is None:
1363                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1364                         return
1365                 video_thumbnail = mobj.group(1).decode('utf-8')
1366
1367                 # Extract video description
1368                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1369                 if mobj is None:
1370                         self._downloader.trouble(u'ERROR: unable to extract video description')
1371                         return
1372                 video_description = mobj.group(1).decode('utf-8')
1373                 if not video_description: video_description = 'No description available.'
1374
1375                 # Extract video height and width
1376                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1377                 if mobj is None:
1378                         self._downloader.trouble(u'ERROR: unable to extract video height')
1379                         return
1380                 yv_video_height = mobj.group(1)
1381
1382                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1383                 if mobj is None:
1384                         self._downloader.trouble(u'ERROR: unable to extract video width')
1385                         return
1386                 yv_video_width = mobj.group(1)
1387
1388                 # Retrieve video playlist to extract media URL
1389                 # I'm not completely sure what all these options are, but we
1390                 # seem to need most of them, otherwise the server sends a 401.
1391                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1392                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1393                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1394                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1395                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1396                 try:
1397                         self.report_download_webpage(video_id)
1398                         webpage = urllib2.urlopen(request).read()
1399                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1400                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1401                         return
1402
1403                 # Extract media URL from playlist XML
1404                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1405                 if mobj is None:
1406                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1407                         return
1408                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1409                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1410
1411                 try:
1412                         # Process video information
1413                         self._downloader.process_info({
1414                                 'id':           video_id.decode('utf-8'),
1415                                 'url':          video_url,
1416                                 'uploader':     video_uploader,
1417                                 'title':        video_title,
1418                                 'stitle':       simple_title,
1419                                 'ext':          video_extension.decode('utf-8'),
1420                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1421                                 'description':  video_description,
1422                                 'thumbnail':    video_thumbnail,
1423                                 'description':  video_description,
1424                                 'player_url':   None,
1425                         })
1426                 except UnavailableFormatError:
1427                         self._downloader.trouble(u'ERROR: format not available for video')
1428
1429
1430 class GenericIE(InfoExtractor):
1431         """Generic last-resort information extractor."""
1432
1433         def __init__(self, downloader=None):
1434                 InfoExtractor.__init__(self, downloader)
1435
1436         @staticmethod
1437         def suitable(url):
1438                 return True
1439
1440         def report_download_webpage(self, video_id):
1441                 """Report webpage download."""
1442                 self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
1443                 self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)
1444
1445         def report_extraction(self, video_id):
1446                 """Report information extraction."""
1447                 self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)
1448
1449         def _real_initialize(self):
1450                 return
1451
1452         def _real_extract(self, url):
1453                 video_id = url.split('/')[-1]
1454                 request = urllib2.Request(url)
1455                 try:
1456                         self.report_download_webpage(video_id)
1457                         webpage = urllib2.urlopen(request).read()
1458                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1459                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1460                         return
1461                 except ValueError, err:
1462                         # since this is the last-resort InfoExtractor, if
1463                         # this error is thrown, it'll be thrown here
1464                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1465                         return
1466
1467                 # Start with something easy: JW Player in SWFObject
1468                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1469                 if mobj is None:
1470                         # Broaden the search a little bit
1471                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1472                 if mobj is None:
1473                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1474                         return
1475
1476                 # It's possible that one of the regexes
1477                 # matched, but returned an empty group:
1478                 if mobj.group(1) is None:
1479                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1480                         return
1481
1482                 video_url = urllib.unquote(mobj.group(1))
1483                 video_id  = os.path.basename(video_url)
1484
1485                 # here's a fun little line of code for you:
1486                 video_extension = os.path.splitext(video_id)[1][1:]
1487                 video_id        = os.path.splitext(video_id)[0]
1488
1489                 # it's tempting to parse this further, but you would
1490                 # have to take into account all the variations like
1491                 #   Video Title - Site Name
1492                 #   Site Name | Video Title
1493                 #   Video Title - Tagline | Site Name
1494                 # and so on and so forth; it's just not practical
1495                 mobj = re.search(r'<title>(.*)</title>', webpage)
1496                 if mobj is None:
1497                         self._downloader.trouble(u'ERROR: unable to extract title')
1498                         return
1499                 video_title = mobj.group(1).decode('utf-8')
1500                 video_title = sanitize_title(video_title)
1501                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1502
1503                 # video uploader is domain name
1504                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1505                 if mobj is None:
1506                         self._downloader.trouble(u'ERROR: unable to extract title')
1507                         return
1508                 video_uploader = mobj.group(1).decode('utf-8')
1509
1510                 try:
1511                         # Process video information
1512                         self._downloader.process_info({
1513                                 'id':           video_id.decode('utf-8'),
1514                                 'url':          video_url.decode('utf-8'),
1515                                 'uploader':     video_uploader,
1516                                 'title':        video_title,
1517                                 'stitle':       simple_title,
1518                                 'ext':          video_extension.decode('utf-8'),
1519                                 'format':       u'NA',
1520                                 'player_url':   None,
1521                         })
1522                 except UnavailableFormatError:
1523                         self._downloader.trouble(u'ERROR: format not available for video')
1524
1525
1526 class YoutubeSearchIE(InfoExtractor):
1527         """Information Extractor for YouTube search queries."""
1528         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1529         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1530         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1531         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1532         _youtube_ie = None
1533         _max_youtube_results = 1000
1534
1535         def __init__(self, youtube_ie, downloader=None):
1536                 InfoExtractor.__init__(self, downloader)
1537                 self._youtube_ie = youtube_ie
1538         
1539         @staticmethod
1540         def suitable(url):
1541                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1542
1543         def report_download_page(self, query, pagenum):
1544                 """Report attempt to download playlist page with given number."""
1545                 query = query.decode(preferredencoding())
1546                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1547
1548         def _real_initialize(self):
1549                 self._youtube_ie.initialize()
1550         
1551         def _real_extract(self, query):
1552                 mobj = re.match(self._VALID_QUERY, query)
1553                 if mobj is None:
1554                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1555                         return
1556
1557                 prefix, query = query.split(':')
1558                 prefix = prefix[8:]
1559                 query  = query.encode('utf-8')
1560                 if prefix == '':
1561                         self._download_n_results(query, 1)
1562                         return
1563                 elif prefix == 'all':
1564                         self._download_n_results(query, self._max_youtube_results)
1565                         return
1566                 else:
1567                         try:
1568                                 n = long(prefix)
1569                                 if n <= 0:
1570                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1571                                         return
1572                                 elif n > self._max_youtube_results:
1573                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
1574                                         n = self._max_youtube_results
1575                                 self._download_n_results(query, n)
1576                                 return
1577                         except ValueError: # parsing prefix as integer fails
1578                                 self._download_n_results(query, 1)
1579                                 return
1580
1581         def _download_n_results(self, query, n):
1582                 """Downloads a specified number of results for a query"""
1583
1584                 video_ids = []
1585                 already_seen = set()
1586                 pagenum = 1
1587
1588                 while True:
1589                         self.report_download_page(query, pagenum)
1590                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1591                         request = urllib2.Request(result_url, None, std_headers)
1592                         try:
1593                                 page = urllib2.urlopen(request).read()
1594                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1595                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1596                                 return
1597
1598                         # Extract video identifiers
1599                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1600                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1601                                 if video_id not in already_seen:
1602                                         video_ids.append(video_id)
1603                                         already_seen.add(video_id)
1604                                         if len(video_ids) == n:
1605                                                 # Specified n videos reached
1606                                                 for id in video_ids:
1607                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1608                                                 return
1609
1610                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1611                                 for id in video_ids:
1612                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1613                                 return
1614
1615                         pagenum = pagenum + 1
1616
1617 class GoogleSearchIE(InfoExtractor):
1618         """Information Extractor for Google Video search queries."""
1619         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1620         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1621         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1622         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1623         _google_ie = None
1624         _max_google_results = 1000
1625
1626         def __init__(self, google_ie, downloader=None):
1627                 InfoExtractor.__init__(self, downloader)
1628                 self._google_ie = google_ie
1629         
1630         @staticmethod
1631         def suitable(url):
1632                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1633
1634         def report_download_page(self, query, pagenum):
1635                 """Report attempt to download playlist page with given number."""
1636                 query = query.decode(preferredencoding())
1637                 self._downloader.to_stdout(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1638
1639         def _real_initialize(self):
1640                 self._google_ie.initialize()
1641         
1642         def _real_extract(self, query):
1643                 mobj = re.match(self._VALID_QUERY, query)
1644                 if mobj is None:
1645                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1646                         return
1647
1648                 prefix, query = query.split(':')
1649                 prefix = prefix[8:]
1650                 query  = query.encode('utf-8')
1651                 if prefix == '':
1652                         self._download_n_results(query, 1)
1653                         return
1654                 elif prefix == 'all':
1655                         self._download_n_results(query, self._max_google_results)
1656                         return
1657                 else:
1658                         try:
1659                                 n = long(prefix)
1660                                 if n <= 0:
1661                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1662                                         return
1663                                 elif n > self._max_google_results:
1664                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
1665                                         n = self._max_google_results
1666                                 self._download_n_results(query, n)
1667                                 return
1668                         except ValueError: # parsing prefix as integer fails
1669                                 self._download_n_results(query, 1)
1670                                 return
1671
1672         def _download_n_results(self, query, n):
1673                 """Downloads a specified number of results for a query"""
1674
1675                 video_ids = []
1676                 already_seen = set()
1677                 pagenum = 1
1678
1679                 while True:
1680                         self.report_download_page(query, pagenum)
1681                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1682                         request = urllib2.Request(result_url, None, std_headers)
1683                         try:
1684                                 page = urllib2.urlopen(request).read()
1685                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1686                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1687                                 return
1688
1689                         # Extract video identifiers
1690                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1691                                 video_id = mobj.group(1)
1692                                 if video_id not in already_seen:
1693                                         video_ids.append(video_id)
1694                                         already_seen.add(video_id)
1695                                         if len(video_ids) == n:
1696                                                 # Specified n videos reached
1697                                                 for id in video_ids:
1698                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1699                                                 return
1700
1701                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1702                                 for id in video_ids:
1703                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1704                                 return
1705
1706                         pagenum = pagenum + 1
1707
1708 class YahooSearchIE(InfoExtractor):
1709         """Information Extractor for Yahoo! Video search queries."""
1710         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1711         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1712         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1713         _MORE_PAGES_INDICATOR = r'\s*Next'
1714         _yahoo_ie = None
1715         _max_yahoo_results = 1000
1716
1717         def __init__(self, yahoo_ie, downloader=None):
1718                 InfoExtractor.__init__(self, downloader)
1719                 self._yahoo_ie = yahoo_ie
1720         
1721         @staticmethod
1722         def suitable(url):
1723                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1724
1725         def report_download_page(self, query, pagenum):
1726                 """Report attempt to download playlist page with given number."""
1727                 query = query.decode(preferredencoding())
1728                 self._downloader.to_stdout(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1729
1730         def _real_initialize(self):
1731                 self._yahoo_ie.initialize()
1732         
1733         def _real_extract(self, query):
1734                 mobj = re.match(self._VALID_QUERY, query)
1735                 if mobj is None:
1736                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1737                         return
1738
1739                 prefix, query = query.split(':')
1740                 prefix = prefix[8:]
1741                 query  = query.encode('utf-8')
1742                 if prefix == '':
1743                         self._download_n_results(query, 1)
1744                         return
1745                 elif prefix == 'all':
1746                         self._download_n_results(query, self._max_yahoo_results)
1747                         return
1748                 else:
1749                         try:
1750                                 n = long(prefix)
1751                                 if n <= 0:
1752                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1753                                         return
1754                                 elif n > self._max_yahoo_results:
1755                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
1756                                         n = self._max_yahoo_results
1757                                 self._download_n_results(query, n)
1758                                 return
1759                         except ValueError: # parsing prefix as integer fails
1760                                 self._download_n_results(query, 1)
1761                                 return
1762
1763         def _download_n_results(self, query, n):
1764                 """Downloads a specified number of results for a query"""
1765
1766                 video_ids = []
1767                 already_seen = set()
1768                 pagenum = 1
1769
1770                 while True:
1771                         self.report_download_page(query, pagenum)
1772                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1773                         request = urllib2.Request(result_url, None, std_headers)
1774                         try:
1775                                 page = urllib2.urlopen(request).read()
1776                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1777                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1778                                 return
1779
1780                         # Extract video identifiers
1781                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1782                                 video_id = mobj.group(1)
1783                                 if video_id not in already_seen:
1784                                         video_ids.append(video_id)
1785                                         already_seen.add(video_id)
1786                                         if len(video_ids) == n:
1787                                                 # Specified n videos reached
1788                                                 for id in video_ids:
1789                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1790                                                 return
1791
1792                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1793                                 for id in video_ids:
1794                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1795                                 return
1796
1797                         pagenum = pagenum + 1
1798
1799 class YoutubePlaylistIE(InfoExtractor):
1800         """Information Extractor for YouTube playlists."""
1801
1802         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1803         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1804         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1805         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1806         _youtube_ie = None
1807
1808         def __init__(self, youtube_ie, downloader=None):
1809                 InfoExtractor.__init__(self, downloader)
1810                 self._youtube_ie = youtube_ie
1811         
1812         @staticmethod
1813         def suitable(url):
1814                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1815
1816         def report_download_page(self, playlist_id, pagenum):
1817                 """Report attempt to download playlist page with given number."""
1818                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1819
1820         def _real_initialize(self):
1821                 self._youtube_ie.initialize()
1822         
1823         def _real_extract(self, url):
1824                 # Extract playlist id
1825                 mobj = re.match(self._VALID_URL, url)
1826                 if mobj is None:
1827                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1828                         return
1829
1830                 # Download playlist pages
1831                 playlist_id = mobj.group(1)
1832                 video_ids = []
1833                 pagenum = 1
1834
1835                 while True:
1836                         self.report_download_page(playlist_id, pagenum)
1837                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1838                         try:
1839                                 page = urllib2.urlopen(request).read()
1840                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1841                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1842                                 return
1843
1844                         # Extract video identifiers
1845                         ids_in_page = []
1846                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1847                                 if mobj.group(1) not in ids_in_page:
1848                                         ids_in_page.append(mobj.group(1))
1849                         video_ids.extend(ids_in_page)
1850
1851                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1852                                 break
1853                         pagenum = pagenum + 1
1854
1855                 for id in video_ids:
1856                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1857                 return
1858
1859 class YoutubeUserIE(InfoExtractor):
1860         """Information Extractor for YouTube users."""
1861
1862         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
1863         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1864         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
1865         _youtube_ie = None
1866
1867         def __init__(self, youtube_ie, downloader=None):
1868                 InfoExtractor.__init__(self, downloader)
1869                 self._youtube_ie = youtube_ie
1870         
1871         @staticmethod
1872         def suitable(url):
1873                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1874
1875         def report_download_page(self, username):
1876                 """Report attempt to download user page."""
1877                 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
1878
1879         def _real_initialize(self):
1880                 self._youtube_ie.initialize()
1881         
1882         def _real_extract(self, url):
1883                 # Extract username
1884                 mobj = re.match(self._VALID_URL, url)
1885                 if mobj is None:
1886                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1887                         return
1888
1889                 # Download user page
1890                 username = mobj.group(1)
1891                 video_ids = []
1892                 pagenum = 1
1893
1894                 self.report_download_page(username)
1895                 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
1896                 try:
1897                         page = urllib2.urlopen(request).read()
1898                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1899                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1900                         return
1901
1902                 # Extract video identifiers
1903                 ids_in_page = []
1904
1905                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1906                         if mobj.group(1) not in ids_in_page:
1907                                 ids_in_page.append(mobj.group(1))
1908                 video_ids.extend(ids_in_page)
1909
1910                 for id in video_ids:
1911                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1912                 return
1913
class PostProcessor(object):
	"""Base class for download post-processing steps.

	Instances are registered on a downloader via its
	add_post_processor() method. After each successful download the
	downloader walks its chain of PostProcessors, calling run() on
	each one: the first receives the initial information dictionary,
	and every subsequent one receives whatever the previous run()
	returned. A return value of None stops the chain early.

	Like InfoExtractor objects, PostProcessors use a "mutual
	registration" scheme with their downloader.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file; return info for the next PP.

		The "information" argument is a dictionary of the kind built
		by InfoExtractors, extended with a "filepath" entry pointing
		at the file that was just downloaded.

		Returning None halts the postprocessing chain; returning a
		dictionary (possibly the received one, with some fields
		altered) passes it on to the next PostProcessor. The method
		may also raise PostProcessingError, which the calling
		downloader takes into account.
		"""
		# Default behaviour: pass the information through untouched.
		return information
1959         
1960 ### MAIN PROGRAM ###
1961 if __name__ == '__main__':
1962         try:
1963                 # Modules needed only when running the main program
1964                 import getpass
1965                 import optparse
1966
1967                 # Function to update the program file with the latest version from bitbucket.org
1968                 def update_self(downloader, filename):
1969                         # Note: downloader only used for options
1970                         if not os.access (filename, os.W_OK):
1971                                 sys.exit('ERROR: no write permissions on %s' % filename)
1972
1973                         downloader.to_stdout('Updating to latest stable version...')
1974                         latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
1975                         latest_version = urllib.urlopen(latest_url).read().strip()
1976                         prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
1977                         newcontent = urllib.urlopen(prog_url).read()
1978                         stream = open(filename, 'w')
1979                         stream.write(newcontent)
1980                         stream.close()
1981                         downloader.to_stdout('Updated to version %s' % latest_version)
1982
1983                 # General configuration
1984                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
1985                 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
1986                 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
1987
1988                 # Parse command line
1989                 parser = optparse.OptionParser(
1990                         usage='Usage: %prog [options] url...',
1991                         version='2010.06.06',
1992                         conflict_handler='resolve',
1993                 )
1994
1995                 parser.add_option('-h', '--help',
1996                                 action='help', help='print this help text and exit')
1997                 parser.add_option('-v', '--version',
1998                                 action='version', help='print program version and exit')
1999                 parser.add_option('-U', '--update',
2000                                 action='store_true', dest='update_self', help='update this program to latest stable version')
2001                 parser.add_option('-i', '--ignore-errors',
2002                                 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2003                 parser.add_option('-r', '--rate-limit',
2004                                 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2005                 parser.add_option('-R', '--retries',
2006                                 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2007
2008                 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2009                 authentication.add_option('-u', '--username',
2010                                 dest='username', metavar='USERNAME', help='account username')
2011                 authentication.add_option('-p', '--password',
2012                                 dest='password', metavar='PASSWORD', help='account password')
2013                 authentication.add_option('-n', '--netrc',
2014                                 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2015                 parser.add_option_group(authentication)
2016
2017                 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2018                 video_format.add_option('-f', '--format',
2019                                 action='store', dest='format', metavar='FORMAT', help='video format code')
2020                 video_format.add_option('-b', '--best-quality',
2021                                 action='store_const', dest='format', help='download the best quality video possible', const='0')
2022                 video_format.add_option('-m', '--mobile-version',
2023                                 action='store_const', dest='format', help='alias for -f 17', const='17')
2024                 video_format.add_option('-d', '--high-def',
2025                                 action='store_const', dest='format', help='alias for -f 22', const='22')
2026                 video_format.add_option('--all-formats',
2027                                 action='store_const', dest='format', help='download all available video formats', const='-1')
2028                 parser.add_option_group(video_format)
2029
2030                 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2031                 verbosity.add_option('-q', '--quiet',
2032                                 action='store_true', dest='quiet', help='activates quiet mode', default=False)
2033                 verbosity.add_option('-s', '--simulate',
2034                                 action='store_true', dest='simulate', help='do not download video', default=False)
2035                 verbosity.add_option('-g', '--get-url',
2036                                 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2037                 verbosity.add_option('-e', '--get-title',
2038                                 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
2039                 verbosity.add_option('--get-thumbnail',
2040                                 action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
2041                 verbosity.add_option('--get-description',
2042                                 action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
2043                 verbosity.add_option('--no-progress',
2044                                 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
2045                 parser.add_option_group(verbosity)
2046
2047                 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
2048                 filesystem.add_option('-t', '--title',
2049                                 action='store_true', dest='usetitle', help='use title in file name', default=False)
2050                 filesystem.add_option('-l', '--literal',
2051                                 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
2052                 filesystem.add_option('-o', '--output',
2053                                 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
2054                 filesystem.add_option('-a', '--batch-file',
2055                                 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
2056                 filesystem.add_option('-w', '--no-overwrites',
2057                                 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
2058                 filesystem.add_option('-c', '--continue',
2059                                 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
2060                 parser.add_option_group(filesystem)
2061
2062                 (opts, args) = parser.parse_args()
2063
2064                 # Batch file verification
2065                 batchurls = []
2066                 if opts.batchfile is not None:
2067                         try:
2068                                 if opts.batchfile == '-':
2069                                         batchfd = sys.stdin
2070                                 else:
2071                                         batchfd = open(opts.batchfile, 'r')
2072                                 batchurls = batchfd.readlines()
2073                                 batchurls = [x.strip() for x in batchurls]
2074                                 batchurls = [x for x in batchurls if len(x) > 0]
2075                         except IOError:
2076                                 sys.exit(u'ERROR: batch file could not be read')
2077                 all_urls = batchurls + args
2078
2079                 # Conflicting, missing and erroneous options
2080                 if opts.usenetrc and (opts.username is not None or opts.password is not None):
2081                         parser.error(u'using .netrc conflicts with giving username/password')
2082                 if opts.password is not None and opts.username is None:
2083                         parser.error(u'account username missing')
2084                 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
2085                         parser.error(u'using output template conflicts with using title or literal title')
2086                 if opts.usetitle and opts.useliteral:
2087                         parser.error(u'using title conflicts with using literal title')
2088                 if opts.username is not None and opts.password is None:
2089                         opts.password = getpass.getpass(u'Type account password and press return:')
2090                 if opts.ratelimit is not None:
2091                         numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
2092                         if numeric_limit is None:
2093                                 parser.error(u'invalid rate limit specified')
2094                         opts.ratelimit = numeric_limit
2095                 if opts.retries is not None:
2096                         try:
2097                                 opts.retries = long(opts.retries)
2098                         except (TypeError, ValueError), err:
2099                                 parser.error(u'invalid retry count specified')
2100
2101                 # Information extractors
2102                 youtube_ie = YoutubeIE()
2103                 metacafe_ie = MetacafeIE(youtube_ie)
2104                 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
2105                 youtube_user_ie = YoutubeUserIE(youtube_ie)
2106                 youtube_search_ie = YoutubeSearchIE(youtube_ie)
2107                 google_ie = GoogleIE()
2108                 google_search_ie = GoogleSearchIE(google_ie)
2109                 photobucket_ie = PhotobucketIE()
2110                 yahoo_ie = YahooIE()
2111                 yahoo_search_ie = YahooSearchIE(yahoo_ie)
2112                 generic_ie = GenericIE()
2113
2114                 # File downloader
2115                 fd = FileDownloader({
2116                         'usenetrc': opts.usenetrc,
2117                         'username': opts.username,
2118                         'password': opts.password,
2119                         'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2120                         'forceurl': opts.geturl,
2121                         'forcetitle': opts.gettitle,
2122                         'forcethumbnail': opts.getthumbnail,
2123                         'forcedescription': opts.getdescription,
2124                         'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2125                         'format': opts.format,
2126                         'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
2127                                 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
2128                                 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
2129                                 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
2130                                 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
2131                                 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
2132                                 or u'%(id)s.%(ext)s'),
2133                         'ignoreerrors': opts.ignoreerrors,
2134                         'ratelimit': opts.ratelimit,
2135                         'nooverwrites': opts.nooverwrites,
2136                         'retries': opts.retries,
2137                         'continuedl': opts.continue_dl,
2138                         'noprogress': opts.noprogress,
2139                         })
2140                 fd.add_info_extractor(youtube_search_ie)
2141                 fd.add_info_extractor(youtube_pl_ie)
2142                 fd.add_info_extractor(youtube_user_ie)
2143                 fd.add_info_extractor(metacafe_ie)
2144                 fd.add_info_extractor(youtube_ie)
2145                 fd.add_info_extractor(google_ie)
2146                 fd.add_info_extractor(google_search_ie)
2147                 fd.add_info_extractor(photobucket_ie)
2148                 fd.add_info_extractor(yahoo_ie)
2149                 fd.add_info_extractor(yahoo_search_ie)
2150
2151                 # This must come last since it's the
2152                 # fallback if none of the others work
2153                 fd.add_info_extractor(generic_ie)
2154
2155                 # Update version
2156                 if opts.update_self:
2157                         update_self(fd, sys.argv[0])
2158
2159                 # Maybe do nothing
2160                 if len(all_urls) < 1:
2161                         if not opts.update_self:
2162                                 parser.error(u'you must provide at least one URL')
2163                         else:
2164                                 sys.exit()
2165                 retcode = fd.download(all_urls)
2166                 sys.exit(retcode)
2167
2168         except DownloadError:
2169                 sys.exit(1)
2170         except SameFileError:
2171                 sys.exit(u'ERROR: fixed output name but more than one file to download')
2172         except KeyboardInterrupt:
2173                 sys.exit(u'\nERROR: Interrupted by user')