--writedescription option
[youtube-dl.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # Author: Vasyl' Vavrychuk
7 # Author: Witold Baryluk
8 # Author: Paweł Paprota
9 # Author: Gergely Imreh
10 # License: Public domain code
11
12 from __future__ import with_statement
13
14 import contextlib
15 import cookielib
16 import ctypes
17 import datetime
18 import email.utils
19 import gzip
20 import htmlentitydefs
21 import httplib
22 import locale
23 import math
24 import netrc
25 import os
26 import os.path
27 import re
28 import socket
29 import string
30 import subprocess
31 import sys
32 import time
33 import urllib
34 import urllib2
35 import warnings
36 import zlib
37
38 try:
39         import json
40 except ImportError:
41         warnings.warn('No JSON support (TODO: insert trivialjson here)')
42
43 try:
44         import cStringIO as StringIO
45 except ImportError:
46         import StringIO
47
48 # parse_qs was moved from the cgi module to the urlparse module recently.
49 try:
50         from urlparse import parse_qs
51 except ImportError:
52         from cgi import parse_qs
53
54 try:
55         import lxml.etree
56 except ImportError: # Python < 2.6
57         pass # Handled below
58
# HTTP headers attached to every request by YoutubeDLHandler.http_request,
# mimicking a desktop Firefox browser.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:2.0b11) Gecko/20100101 Firefox/4.0b11',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	'Accept-Encoding': 'gzip, deflate',
	'Accept-Language': 'en-us,en;q=0.5',
}

# Unicode string of characters considered safe for "simple" titles:
# ASCII letters and digits only.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
68
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	# The original wrapped this in a one-shot generator for no benefit;
	# a plain try/except is equivalent. The bare "except:" is narrowed to
	# Exception so KeyboardInterrupt/SystemExit are no longer swallowed.
	try:
		pref = locale.getpreferredencoding()
		# Verify the reported codec actually exists and can encode text;
		# otherwise fall back to UTF-8.
		u'TEST'.encode(pref)
	except Exception:
		pref = 'UTF-8'
	return pref
84
85 def htmlentity_transform(matchobj):
86         """Transforms an HTML entity to a Unicode character.
87
88         This function receives a match object and is intended to be used with
89         the re.sub() function.
90         """
91         entity = matchobj.group(1)
92
93         # Known non-numeric HTML entity
94         if entity in htmlentitydefs.name2codepoint:
95                 return unichr(htmlentitydefs.name2codepoint[entity])
96
97         # Unicode character
98         mobj = re.match(ur'(?u)#(x?\d+)', entity)
99         if mobj is not None:
100                 numstr = mobj.group(1)
101                 if numstr.startswith(u'x'):
102                         base = 16
103                         numstr = u'0%s' % numstr
104                 else:
105                         base = 10
106                 return unichr(long(numstr, base))
107
108         # Unknown entity in name, return its literal representation
109         return (u'&%s;' % entity)
110
111 def sanitize_title(utitle):
112         """Sanitizes a video title so it could be used as part of a filename."""
113         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
114         return utitle.replace(unicode(os.sep), u'%')
115
116 def sanitize_open(filename, open_mode):
117         """Try to open the given filename, and slightly tweak it if this fails.
118
119         Attempts to open the given filename. If this fails, it tries to change
120         the filename slightly, step by step, until it's either able to open it
121         or it fails and raises a final exception, like the standard open()
122         function.
123
124         It returns the tuple (stream, definitive_file_name).
125         """
126         try:
127                 if filename == u'-':
128                         if sys.platform == 'win32':
129                                 import msvcrt
130                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
131                         return (sys.stdout, filename)
132                 stream = open(filename, open_mode)
133                 return (stream, filename)
134         except (IOError, OSError), err:
135                 # In case of error, try to remove win32 forbidden chars
136                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
137
138                 # An exception here should be caught in the caller
139                 stream = open(filename, open_mode)
140                 return (stream, filename)
141
def timeconvert(timestr):
	"""Convert RFC 2822 defined time string into system timestamp.

	Returns None when the string cannot be parsed.
	"""
	# Re-indented with tabs: this function was the only one in the file
	# using 4-space indentation.
	timestamp = None
	timetuple = email.utils.parsedate_tz(timestr)
	if timetuple is not None:
		timestamp = email.utils.mktime_tz(timetuple)
	return timestamp
149
class DownloadError(Exception):
	"""Download Error exception.

	This exception may be thrown by FileDownloader objects if they are not
	configured to continue on errors. They will contain the appropriate
	error message.
	"""
	pass
158
class SameFileError(Exception):
	"""Same File exception.

	This exception will be thrown by FileDownloader objects if they detect
	multiple files would have to be downloaded to the same file on disk
	(i.e. a fixed output template was given for more than one URL).
	"""
	pass
166
class PostProcessingError(Exception):
	"""Post Processing exception.

	This exception may be raised by PostProcessor's .run() method to
	indicate an error in the postprocessing task.
	"""
	pass
174
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	This exception will be thrown when a video is requested
	in a format that is not available for that video.
	"""
	pass
182
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	This exception may be raised by FileDownloader objects when a file they
	download is too small for what the server announced first, indicating
	the connection was probably interrupted.
	"""
	# Both in bytes
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		# Initialize the base class too, so err.args, str(err) and
		# pickling behave like a normal exception instead of being empty.
		Exception.__init__(self, downloaded, expected)
		self.downloaded = downloaded
		self.expected = expected
197
198 class YoutubeDLHandler(urllib2.HTTPHandler):
199         """Handler for HTTP requests and responses.
200
201         This class, when installed with an OpenerDirector, automatically adds
202         the standard headers to every HTTP request and handles gzipped and
203         deflated responses from web servers. If compression is to be avoided in
204         a particular request, the original request in the program code only has
205         to include the HTTP header "Youtubedl-No-Compression", which will be
206         removed before making the real request.
207         
208         Part of this code was copied from:
209
210           http://techknack.net/python-urllib2-handlers/
211           
212         Andrew Rowls, the author of that code, agreed to release it to the
213         public domain.
214         """
215
216         @staticmethod
217         def deflate(data):
218                 try:
219                         return zlib.decompress(data, -zlib.MAX_WBITS)
220                 except zlib.error:
221                         return zlib.decompress(data)
222         
223         @staticmethod
224         def addinfourl_wrapper(stream, headers, url, code):
225                 if hasattr(urllib2.addinfourl, 'getcode'):
226                         return urllib2.addinfourl(stream, headers, url, code)
227                 ret = urllib2.addinfourl(stream, headers, url)
228                 ret.code = code
229                 return ret
230         
231         def http_request(self, req):
232                 for h in std_headers:
233                         if h in req.headers:
234                                 del req.headers[h]
235                         req.add_header(h, std_headers[h])
236                 if 'Youtubedl-no-compression' in req.headers:
237                         if 'Accept-encoding' in req.headers:
238                                 del req.headers['Accept-encoding']
239                         del req.headers['Youtubedl-no-compression']
240                 return req
241
242         def http_response(self, req, resp):
243                 old_resp = resp
244                 # gzip
245                 if resp.headers.get('Content-encoding', '') == 'gzip':
246                         gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
247                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
248                         resp.msg = old_resp.msg
249                 # deflate
250                 if resp.headers.get('Content-encoding', '') == 'deflate':
251                         gz = StringIO.StringIO(self.deflate(resp.read()))
252                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
253                         resp.msg = old_resp.msg
254                 return resp
255
256 class FileDownloader(object):
257         """File Downloader class.
258
259         File downloader objects are the ones responsible of downloading the
260         actual video file and writing it to disk if the user has requested
261         it, among some other tasks. In most cases there should be one per
262         program. As, given a video URL, the downloader doesn't know how to
263         extract all the needed information, task that InfoExtractors do, it
264         has to pass the URL to one of them.
265
266         For this, file downloader objects have a method that allows
267         InfoExtractors to be registered in a given order. When it is passed
268         a URL, the file downloader handles it to the first InfoExtractor it
269         finds that reports being able to handle it. The InfoExtractor extracts
270         all the information about the video or videos the URL refers to, and
271         asks the FileDownloader to process the video information, possibly
272         downloading the video.
273
274         File downloaders accept a lot of parameters. In order not to saturate
275         the object constructor with arguments, it receives a dictionary of
276         options instead. These options are available through the params
277         attribute for the InfoExtractors to use. The FileDownloader also
278         registers itself as the downloader in charge for the InfoExtractors
279         that are added to it, so this is a "mutual registration".
280
281         Available options:
282
283         username:         Username for authentication purposes.
284         password:         Password for authentication purposes.
285         usenetrc:         Use netrc for authentication instead.
286         quiet:            Do not print messages to stdout.
287         forceurl:         Force printing final URL.
288         forcetitle:       Force printing title.
289         forcethumbnail:   Force printing thumbnail URL.
290         forcedescription: Force printing description.
291         forcefilename:    Force printing final filename.
292         simulate:         Do not download the video files.
293         format:           Video format code.
294         format_limit:     Highest quality format to try.
295         outtmpl:          Template for output names.
296         ignoreerrors:     Do not stop on download errors.
297         ratelimit:        Download speed limit, in bytes/sec.
298         nooverwrites:     Prevent overwriting files.
299         retries:          Number of times to retry for HTTP error 5xx
300         continuedl:       Try to continue downloads if possible.
301         noprogress:       Do not print the progress bar.
302         playliststart:    Playlist item to start at.
303         playlistend:      Playlist item to end at.
304         logtostderr:      Log messages to stderr instead of stdout.
305         consoletitle:     Display progress in console window's titlebar.
306         nopart:           Do not use temporary .part files.
307         updatetime:       Use the Last-modified header to set output file timestamps.
308         writedescription: Write the video description to a .description file
309         """
310
	# Class-level defaults; all of these are (re)assigned in __init__.
	params = None			# Dictionary of options (see class docstring)
	_ies = []			# Registered InfoExtractor objects, in order
	_pps = []			# Post-processor chain
	_download_retcode = None	# Process return code: 0 ok, 1 after handled errors
	_num_downloads = None		# Ordinal used for the %(autonumber)s template field
	_screen_file = None		# Stream for screen output (stdout or stderr)
317
	def __init__(self, params):
		"""Create a FileDownloader object with the given options.

		params: dictionary of options; see the class docstring for the
		recognized keys.
		"""
		self._ies = []
		self._pps = []
		self._download_retcode = 0
		self._num_downloads = 0
		# Index the pair with the boolean: stderr when 'logtostderr' is set.
		self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
		self.params = params
326
327         @staticmethod
328         def pmkdir(filename):
329                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
330                 components = filename.split(os.sep)
331                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
332                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
333                 for dir in aggregate:
334                         if not os.path.exists(dir):
335                                 os.mkdir(dir)
336
337         @staticmethod
338         def format_bytes(bytes):
339                 if bytes is None:
340                         return 'N/A'
341                 if type(bytes) is str:
342                         bytes = float(bytes)
343                 if bytes == 0.0:
344                         exponent = 0
345                 else:
346                         exponent = long(math.log(bytes, 1024.0))
347                 suffix = 'bkMGTPEZY'[exponent]
348                 converted = float(bytes) / float(1024**exponent)
349                 return '%.2f%s' % (converted, suffix)
350
351         @staticmethod
352         def calc_percent(byte_counter, data_len):
353                 if data_len is None:
354                         return '---.-%'
355                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
356
357         @staticmethod
358         def calc_eta(start, now, total, current):
359                 if total is None:
360                         return '--:--'
361                 dif = now - start
362                 if current == 0 or dif < 0.001: # One millisecond
363                         return '--:--'
364                 rate = float(current) / dif
365                 eta = long((float(total) - float(current)) / rate)
366                 (eta_mins, eta_secs) = divmod(eta, 60)
367                 if eta_mins > 99:
368                         return '--:--'
369                 return '%02d:%02d' % (eta_mins, eta_secs)
370
371         @staticmethod
372         def calc_speed(start, now, bytes):
373                 dif = now - start
374                 if bytes == 0 or dif < 0.001: # One millisecond
375                         return '%10s' % '---b/s'
376                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
377
378         @staticmethod
379         def best_block_size(elapsed_time, bytes):
380                 new_min = max(bytes / 2.0, 1.0)
381                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
382                 if elapsed_time < 0.001:
383                         return long(new_max)
384                 rate = bytes / elapsed_time
385                 if rate > new_max:
386                         return long(new_max)
387                 if rate < new_min:
388                         return long(new_min)
389                 return long(rate)
390
391         @staticmethod
392         def parse_bytes(bytestr):
393                 """Parse a string indicating a byte quantity into a long integer."""
394                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
395                 if matchobj is None:
396                         return None
397                 number = float(matchobj.group(1))
398                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
399                 return long(round(number * multiplier))
400
	def add_info_extractor(self, ie):
		"""Add an InfoExtractor object to the end of the list."""
		self._ies.append(ie)
		# Mutual registration: the extractor learns about its downloader here.
		ie.set_downloader(self)
405
	def add_post_processor(self, pp):
		"""Add a PostProcessor object to the end of the chain."""
		self._pps.append(pp)
		# Mutual registration: the post-processor learns about its downloader here.
		pp.set_downloader(self)
410
	def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode.

		skip_eol: suppress the trailing newline (used for progress lines).
		ignore_encoding_errors: swallow UnicodeEncodeError instead of raising.
		"""
		try:
			if not self.params.get('quiet', False):
				terminator = [u'\n', u''][skip_eol]
				# The trailing comma suppresses print's own newline; the
				# chosen terminator is appended to the message instead.
				print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
			# NOTE(review): the flush is outside the quiet check, so the
			# stream is flushed even when nothing was printed.
			self._screen_file.flush()
		except (UnicodeEncodeError), err:
			if not ignore_encoding_errors:
				raise
421
	def to_stderr(self, message):
		"""Print message to stderr, encoded with the preferred encoding."""
		print >>sys.stderr, message.encode(preferredencoding())
425
	def to_cons_title(self, message):
		"""Set console/terminal window title to message (no-op unless the
		'consoletitle' option is enabled)."""
		if not self.params.get('consoletitle', False):
			return
		if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
			# c_wchar_p() might not be necessary if `message` is
			# already of type unicode()
			ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
		elif 'TERM' in os.environ:
			# xterm-compatible escape sequence (OSC 0) to set the title.
			sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
436
	def fixed_template(self):
		"""Checks if the output template is fixed.

		A template is "fixed" when it contains no %(field)s placeholders,
		meaning every download would be written to the same filename.
		"""
		return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
440
441         def trouble(self, message=None):
442                 """Determine action to take when a download problem appears.
443
444                 Depending on if the downloader has been configured to ignore
445                 download errors or not, this method may throw an exception or
446                 not when errors are found, after printing the message.
447                 """
448                 if message is not None:
449                         self.to_stderr(message)
450                 if not self.params.get('ignoreerrors', False):
451                         raise DownloadError(message)
452                 self._download_retcode = 1
453
454         def slow_down(self, start_time, byte_counter):
455                 """Sleep if the download speed is over the rate limit."""
456                 rate_limit = self.params.get('ratelimit', None)
457                 if rate_limit is None or byte_counter == 0:
458                         return
459                 now = time.time()
460                 elapsed = now - start_time
461                 if elapsed <= 0.0:
462                         return
463                 speed = float(byte_counter) / elapsed
464                 if speed > rate_limit:
465                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
466
467         def temp_name(self, filename):
468                 """Returns a temporary filename for the given filename."""
469                 if self.params.get('nopart', False) or filename == u'-' or \
470                                 (os.path.exists(filename) and not os.path.isfile(filename)):
471                         return filename
472                 return filename + u'.part'
473
474         def undo_temp_name(self, filename):
475                 if filename.endswith(u'.part'):
476                         return filename[:-len(u'.part')]
477                 return filename
478
479         def try_rename(self, old_filename, new_filename):
480                 try:
481                         if old_filename == new_filename:
482                                 return
483                         os.rename(old_filename, new_filename)
484                 except (IOError, OSError), err:
485                         self.trouble(u'ERROR: unable to rename file')
486         
487         def try_utime(self, filename, last_modified_hdr):
488                 """Try to set the last-modified time of the given file."""
489                 if last_modified_hdr is None:
490                         return
491                 if not os.path.isfile(filename):
492                         return
493                 timestr = last_modified_hdr
494                 if timestr is None:
495                         return
496                 filetime = timeconvert(timestr)
497                 if filetime is None:
498                         return
499                 try:
500                         os.utime(filename,(time.time(), filetime))
501                 except:
502                         pass
503
	def report_writedescription(self, descfn):
		""" Report that the description file has been written """
		self.to_screen(u'[info] Video description written to: %s' % descfn, ignore_encoding_errors=True)
507
	def report_destination(self, filename):
		"""Report destination filename."""
		self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
511
	def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
		"""Report download progress (no-op when 'noprogress' is set)."""
		if self.params.get('noprogress', False):
			return
		# '\r' rewinds to the start of the line so the progress line is
		# redrawn in place.
		self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
				(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
		self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
				(percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
520
	def report_resuming_byte(self, resume_len):
		"""Report attempt to resume at given byte."""
		self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
524
	def report_retry(self, count, retries):
		"""Report retry in case of HTTP error 5xx"""
		self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
528
	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		try:
			self.to_screen(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			# Fall back to a message without the filename when the console
			# encoding cannot represent it.
			self.to_screen(u'[download] The file has already been downloaded')
535
	def report_unable_to_resume(self):
		"""Report it was impossible to resume download."""
		self.to_screen(u'[download] Unable to resume')
539
	def report_finish(self):
		"""Report download finished."""
		if self.params.get('noprogress', False):
			self.to_screen(u'[download] Download completed')
		else:
			# A progress line is still on screen; just terminate it.
			self.to_screen(u'')
546
	def increment_downloads(self):
		"""Increment the ordinal that assigns a number to each file."""
		# Feeds the %(autonumber)s field used by prepare_filename().
		self._num_downloads += 1
550
551         def prepare_filename(self, info_dict):
552                 """Generate the output filename."""
553                 try:
554                         template_dict = dict(info_dict)
555                         template_dict['epoch'] = unicode(long(time.time()))
556                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
557                         filename = self.params['outtmpl'] % template_dict
558                         return filename
559                 except (ValueError, KeyError), err:
560                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
561                         return None
562
	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor.

		Handles the forced printings / simulate mode, creates the output
		directories, optionally writes the .description file, downloads
		the video data and runs the post-processing chain.
		"""
		filename = self.prepare_filename(info_dict)
		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			# Forced printings
			if self.params.get('forcetitle', False):
				print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forceurl', False):
				print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
				print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcedescription', False) and 'description' in info_dict:
				print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcefilename', False) and filename is not None:
				print filename.encode(preferredencoding(), 'xmlcharrefreplace')

			return

		# prepare_filename() reports its own trouble and returns None on error
		if filename is None:
			return
		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists and will be skipped')
			return

		try:
			self.pmkdir(filename)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to create directories: %s' % str(err))
			return

		# Optionally dump the video description next to the video file
		if self.params.get('writedescription', False):
			try:
				descfn = filename + '.description'
				with contextlib.closing(open(descfn, 'wb')) as descfile:
					descfile.write(info_dict['description'].encode('utf-8'))
				self.report_writedescription(descfn)
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write description file: %s' % str(descfn))
				return

		try:
			success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
		except (OSError, IOError), err:
			# Local filesystem errors mean the requested format could not
			# be saved; surface that as UnavailableVideoError.
			raise UnavailableVideoError
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self.trouble(u'ERROR: unable to download video data: %s' % str(err))
			return
		except (ContentTooShortError, ), err:
			self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
			return

		if success:
			try:
				self.post_process(filename, info_dict)
			except (PostProcessingError), err:
				self.trouble(u'ERROR: postprocessing: %s' % str(err))
				return
621
622         def download(self, url_list):
623                 """Download a given list of URLs."""
624                 if len(url_list) > 1 and self.fixed_template():
625                         raise SameFileError(self.params['outtmpl'])
626
627                 for url in url_list:
628                         suitable_found = False
629                         for ie in self._ies:
630                                 # Go to next InfoExtractor if not suitable
631                                 if not ie.suitable(url):
632                                         continue
633
634                                 # Suitable InfoExtractor found
635                                 suitable_found = True
636
637                                 # Extract information from URL and process it
638                                 ie.extract(url)
639
640                                 # Suitable InfoExtractor had been found; go to next URL
641                                 break
642
643                         if not suitable_found:
644                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
645
646                 return self._download_retcode
647
648         def post_process(self, filename, ie_info):
649                 """Run the postprocessing chain on the given file."""
650                 info = dict(ie_info)
651                 info['filepath'] = filename
652                 for pp in self._pps:
653                         info = pp.run(info)
654                         if info is None:
655                                 break
656
	def _download_with_rtmpdump(self, filename, url, player_url):
		"""Download an RTMP stream by shelling out to the rtmpdump binary.

		Returns True on success, False otherwise (after reporting trouble).
		"""
		self.report_destination(filename)
		tmpfilename = self.temp_name(filename)

		# Check for rtmpdump first
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrupted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(tmpfilename)
			self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			# Resume (-e); pass '-k 1' again if the last run exited with 1.
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(tmpfilename)
			# Stop retrying when a run with exit code 1 made no progress.
			if prevsize == cursize and retval == 1:
				break
		if retval == 0:
			self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
			self.try_rename(tmpfilename, filename)
			return True
		else:
			self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
			return False
688
	def _do_download(self, filename, url, player_url):
		"""Download url to filename over HTTP (delegating rtmp:// URLs to rtmpdump).

		Writes to a temporary name (see temp_name), supports resuming via
		an HTTP Range request plus a bounded retry loop, and renames the
		temporary file to the final name on completion. Returns True on
		success, False after a reported error; raises ContentTooShortError
		when fewer bytes than advertised were received.
		"""
		# Check file already present
		if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
			self.report_file_already_downloaded(filename)
			return True

		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url, player_url)

		tmpfilename = self.temp_name(filename)
		stream = None  # output file is opened lazily, on the first data block
		open_mode = 'wb'

		# Do not include the Accept-Encoding header
		headers = {'Youtubedl-no-compression': 'True'}
		# basic_request stays Range-free so the 416 fallback below can re-probe.
		basic_request = urllib2.Request(url, None, headers)
		request = urllib2.Request(url, None, headers)

		# Establish possible resume length
		if os.path.isfile(tmpfilename):
			resume_len = os.path.getsize(tmpfilename)
		else:
			resume_len = 0

		# Request parameters in case of being able to resume
		if self.params.get('continuedl', False) and resume_len != 0:
			self.report_resuming_byte(resume_len)
			request.add_header('Range','bytes=%d-' % resume_len)
			open_mode = 'ab'

		count = 0
		retries = self.params.get('retries', 0)
		while count <= retries:
			# Establish connection
			try:
				data = urllib2.urlopen(request)
				break
			except (urllib2.HTTPError, ), err:
				if (err.code < 500 or err.code >= 600) and err.code != 416:
					# Unexpected HTTP error
					raise
				elif err.code == 416:
					# Unable to resume (requested range not satisfiable)
					try:
						# Open the connection again without the range header
						data = urllib2.urlopen(basic_request)
						content_length = data.info()['Content-Length']
					except (urllib2.HTTPError, ), err:
						if err.code < 500 or err.code >= 600:
							raise
					else:
						# Examine the reported length
						if (content_length is not None and
						    (resume_len - 100 < long(content_length) < resume_len + 100)):
							# The file had already been fully downloaded.
							# Explanation to the above condition: in issue #175 it was revealed that
							# YouTube sometimes adds or removes a few bytes from the end of the file,
							# changing the file size slightly and causing problems for some users. So
							# I decided to implement a suggested change and consider the file
							# completely downloaded if the file size differs less than 100 bytes from
							# the one in the hard drive.
							self.report_file_already_downloaded(filename)
							self.try_rename(tmpfilename, filename)
							return True
						else:
							# The length does not match, we start the download over
							self.report_unable_to_resume()
							open_mode = 'wb'
							break
			# Retry
			count += 1
			if count <= retries:
				self.report_retry(count, retries)

		if count > retries:
			self.trouble(u'ERROR: giving up after %s retries' % retries)
			return False

		data_len = data.info().get('Content-length', None)
		if data_len is not None:
			# Content-length covers only the remaining range; add what we have.
			data_len = long(data_len) + resume_len
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0 + resume_len
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			if len(data_block) == 0:
				break
			byte_counter += len(data_block)

			# Open file just in time
			if stream is None:
				try:
					(stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
					filename = self.undo_temp_name(tmpfilename)
					self.report_destination(filename)
				except (OSError, IOError), err:
					self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
					return False
			try:
				stream.write(data_block)
			except (IOError, OSError), err:
				self.trouble(u'\nERROR: unable to write data: %s' % str(err))
				return False
			# Adapt the block size to the observed transfer speed.
			block_size = self.best_block_size(after - before, len(data_block))

			# Progress message
			percent_str = self.calc_percent(byte_counter, data_len)
			# NOTE(review): if the server sent no Content-Length, data_len is
			# None here and this subtraction would raise TypeError — confirm
			# the server always supplies it, or guard upstream.
			eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
			speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
			self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter - resume_len)

		stream.close()
		self.report_finish()
		if data_len is not None and byte_counter != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		self.try_rename(tmpfilename, filename)

		# Update file modification time
		if self.params.get('updatetime', True):
			self.try_utime(filename, data.info().get('last-modified', None))

		return True
820
class InfoExtractor(object):
	"""Base class for all information extractors.

	An information extractor turns a URL into one or more dictionaries
	describing the video(s) behind it — real media URL, title and so on —
	and hands them to the FileDownloader, which decides what to do with
	them (download the video, print fields, ...). Each dictionary must
	provide the following fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	Optional fields, used only by the respective forced printing
	functions (their primary purpose is letting youtube-dl act as the
	backend of a video search, such as the one in youtube2mp3):

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Concrete extractors override _real_initialize(), _real_extract() and
	the suitable() static method, and are normally instantiated and
	registered with the main downloader.
	"""

	# One-time initialization flag and the attached downloader.
	_ready = False
	_downloader = None

	def __init__(self, downloader=None):
		"""Build the extractor; a downloader may be attached now or later."""
		self._ready = False
		self.set_downloader(downloader)

	@staticmethod
	def suitable(url):
		"""Return True if this extractor can handle the given URL."""
		return False

	def initialize(self):
		"""Perform one-time setup (authentication, etc.) on first use."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Initialize if needed, then extract and return the URL information."""
		self.initialize()
		result = self._real_extract(url)
		return result

	def set_downloader(self, downloader):
		"""Attach the FileDownloader this extractor reports to."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Actual initialization logic; subclasses override this."""
		pass

	def _real_extract(self, url):
		"""Actual extraction logic; subclasses override this."""
		pass
891
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com.

	Performs optional login and age confirmation in _real_initialize()
	and scrapes the watch page plus the get_video_info endpoint in
	_real_extract().
	"""

	# Group 1 captures the URL prefix (scheme/host/path, if present) and
	# group 2 the video id; the trailing (?(1).+)? conditional only allows
	# extra characters when a full URL prefix was matched.
	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
	# Visiting this URL pins the site to English so the regexes below match.
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	# Machine name looked up in the user's .netrc for credentials.
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
	# Format code -> file extension; codes absent here default to 'flv'
	# in _real_extract.
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'45': 'webm',
	}

	@staticmethod
	def suitable(url):
		"""Return True if url matches _VALID_URL."""
		return (re.match(YoutubeIE._VALID_URL, url) is not None)

	def report_lang(self):
		"""Report attempt to set language."""
		self._downloader.to_screen(u'[youtube] Setting language')

	def report_login(self):
		"""Report attempt to log in."""
		self._downloader.to_screen(u'[youtube] Logging in')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[youtube] Confirming age')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

	def report_video_info_webpage_download(self, video_id):
		"""Report attempt to download video info webpage."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

	def report_unavailable_format(self, video_id, format):
		"""Report that the requested format is not available."""
		self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

	def report_rtmp_download(self):
		"""Indicate the download will use the RTMP protocol."""
		self._downloader.to_screen(u'[youtube] RTMP download detected')

	def _real_initialize(self):
		"""Set the site language and optionally log in and confirm age.

		Credentials come from the 'username'/'password' downloader
		parameters or, with 'usenetrc', from the 'youtube' machine entry
		in ~/.netrc. All failures are reported through the downloader and
		never raise out of this method.
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':		'/',
				'action_login':	'Log In',
				'username':	username,
				'password':	password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the login form is still present, the credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':		'/',
				'action_confirm':	'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Scrape the watch page and get_video_info, then hand every
		selected format to the downloader via process_info()."""
		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group(2)

		# Get video webpage
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id)
		try:
			video_webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# Attempt to extract SWF player URL
		mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
		if mobj is not None:
			# Undo the JSON-style backslash escaping in the matched URL.
			player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
		else:
			player_url = None

		# Get video info
		self.report_video_info_webpage_download(video_id)
		# Try several 'el' variants until one yields a 'token' parameter;
		# presumably some variants bypass embedding/age restrictions — the
		# first successful response wins.
		for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
			video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
					   % (video_id, el_type))
			request = urllib2.Request(video_info_url)
			try:
				video_info_webpage = urllib2.urlopen(request).read()
				video_info = parse_qs(video_info_webpage)
				if 'token' in video_info:
					break
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
				return
		if 'token' not in video_info:
			if 'reason' in video_info:
				self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
			else:
				self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
			return

		# Start extracting information
		self.report_information_extraction(video_id)

		# uploader
		if 'author' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = urllib.unquote_plus(video_info['author'][0])

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = urllib.unquote_plus(video_info['title'][0])
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		# simplified title: collapse every run of non-alphanumerics to '_'
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
		simple_title = simple_title.strip(ur'_')

		# thumbnail image
		if 'thumbnail_url' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:	# don't panic if we can't find it
			video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

		# upload date
		upload_date = u'NA'
		mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
		if mobj is not None:
			upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
			format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
			for expression in format_expressions:
				try:
					upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
				except:
					# NOTE(review): bare except hides parse failures; it also
					# keeps looping after a successful conversion (the later
					# format attempts then simply fail and are ignored).
					pass

		# description
		try:
			# Probe whether the guarded top-of-file lxml import succeeded.
			lxml.etree
		except NameError:
			# lxml unavailable: fall back to a crude regex-based extractor.
			video_description = u'No description available.'
			if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
				warnings.warn(u'You are using an old Python version, install Python 2.6+ or lxml. Falling back to old video description extractor.')
				mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
				if mobj is not None:
					video_description = mobj.group(1).decode('utf-8')
		else:
			html_parser = lxml.etree.HTMLParser(encoding='utf-8')
			vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
			video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))

		# token (extracted but not referenced again in this method)
		video_token = urllib.unquote_plus(video_info['token'][0])

		# Decide which formats to download
		req_format = self._downloader.params.get('format', None)

		if 'fmt_url_map' in video_info and len(video_info['fmt_url_map']) >= 1 and ',' in video_info['fmt_url_map'][0]:
			# fmt_url_map is a comma-separated list of 'format|url' pairs.
			url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
			format_limit = self._downloader.params.get('format_limit', None)
			if format_limit is not None and format_limit in self._available_formats:
				format_list = self._available_formats[self._available_formats.index(format_limit):]
			else:
				format_list = self._available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
				return
			if req_format is None:
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif req_format == '-1':
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
			else:
				# Specific format
				if req_format not in url_map:
					self._downloader.trouble(u'ERROR: requested format not available')
					return
				video_url_list = [(req_format, url_map[req_format])] # Specific format

		elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
			self.report_rtmp_download()
			video_url_list = [(None, video_info['conn'][0])]

		else:
			self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
			return

		for format_param, video_real_url in video_url_list:
			# At this point we have a new video
			self._downloader.increment_downloads()

			# Extension
			video_extension = self._video_extensions.get(format_param, 'flv')

			# Find the video URL in fmt_url_map or conn paramters
			try:
				# Process video information
				self._downloader.process_info({
					'id':		video_id.decode('utf-8'),
					'url':		video_real_url.decode('utf-8'),
					'uploader':	video_uploader.decode('utf-8'),
					'upload_date':	upload_date,
					'title':	video_title,
					'stitle':	simple_title,
					'ext':		video_extension.decode('utf-8'),
					'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
					'thumbnail':	video_thumbnail.decode('utf-8'),
					'description':	video_description,
					'player_url':	player_url,
				})
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
1179
1180
1181 class MetacafeIE(InfoExtractor):
1182         """Information Extractor for metacafe.com."""
1183
1184         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1185         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1186         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1187         _youtube_ie = None
1188
1189         def __init__(self, youtube_ie, downloader=None):
1190                 InfoExtractor.__init__(self, downloader)
1191                 self._youtube_ie = youtube_ie
1192
1193         @staticmethod
1194         def suitable(url):
1195                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1196
1197         def report_disclaimer(self):
1198                 """Report disclaimer retrieval."""
1199                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1200
1201         def report_age_confirmation(self):
1202                 """Report attempt to confirm age."""
1203                 self._downloader.to_screen(u'[metacafe] Confirming age')
1204
1205         def report_download_webpage(self, video_id):
1206                 """Report webpage download."""
1207                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1208
1209         def report_extraction(self, video_id):
1210                 """Report information extraction."""
1211                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1212
	def _real_initialize(self):
		"""Fetch the family-filter disclaimer page and POST the age confirmation.

		Both steps are best-effort: on network errors the problem is
		reported through the downloader and initialization simply stops.
		"""
		# Retrieve disclaimer
		request = urllib2.Request(self._DISCLAIMER)
		try:
			self.report_disclaimer()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
			return

		# Confirm age by submitting the filter form ("over 18")
		disclaimer_form = {
			'filters': '0',
			'submit': "Continue - I'm over 18",
			}
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
		try:
			self.report_age_confirmation()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return
1235
1236         def _real_extract(self, url):
1237                 # Extract id and simplified title from URL
1238                 mobj = re.match(self._VALID_URL, url)
1239                 if mobj is None:
1240                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1241                         return
1242
1243                 video_id = mobj.group(1)
1244
1245                 # Check if video comes from YouTube
1246                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1247                 if mobj2 is not None:
1248                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1249                         return
1250
1251                 # At this point we have a new video
1252                 self._downloader.increment_downloads()
1253
1254                 simple_title = mobj.group(2).decode('utf-8')
1255
1256                 # Retrieve video webpage to extract further information
1257                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1258                 try:
1259                         self.report_download_webpage(video_id)
1260                         webpage = urllib2.urlopen(request).read()
1261                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1262                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1263                         return
1264
1265                 # Extract URL, uploader and title from webpage
1266                 self.report_extraction(video_id)
1267                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1268                 if mobj is not None:
1269                         mediaURL = urllib.unquote(mobj.group(1))
1270                         video_extension = mediaURL[-3:]
1271
1272                         # Extract gdaKey if available
1273                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1274                         if mobj is None:
1275                                 video_url = mediaURL
1276                         else:
1277                                 gdaKey = mobj.group(1)
1278                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1279                 else:
1280                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1281                         if mobj is None:
1282                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1283                                 return
1284                         vardict = parse_qs(mobj.group(1))
1285                         if 'mediaData' not in vardict:
1286                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1287                                 return
1288                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1289                         if mobj is None:
1290                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1291                                 return
1292                         mediaURL = mobj.group(1).replace('\\/', '/')
1293                         video_extension = mediaURL[-3:]
1294                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1295
1296                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1297                 if mobj is None:
1298                         self._downloader.trouble(u'ERROR: unable to extract title')
1299                         return
1300                 video_title = mobj.group(1).decode('utf-8')
1301                 video_title = sanitize_title(video_title)
1302
1303                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1304                 if mobj is None:
1305                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1306                         return
1307                 video_uploader = mobj.group(1)
1308
1309                 try:
1310                         # Process video information
1311                         self._downloader.process_info({
1312                                 'id':           video_id.decode('utf-8'),
1313                                 'url':          video_url.decode('utf-8'),
1314                                 'uploader':     video_uploader.decode('utf-8'),
1315                                 'upload_date':  u'NA',
1316                                 'title':        video_title,
1317                                 'stitle':       simple_title,
1318                                 'ext':          video_extension.decode('utf-8'),
1319                                 'format':       u'NA',
1320                                 'player_url':   None,
1321                         })
1322                 except UnavailableVideoError:
1323                         self._downloader.trouble(u'\nERROR: unable to download video')
1324
1325
1326 class DailymotionIE(InfoExtractor):
1327         """Information Extractor for Dailymotion"""
1328
1329         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1330
1331         def __init__(self, downloader=None):
1332                 InfoExtractor.__init__(self, downloader)
1333
1334         @staticmethod
1335         def suitable(url):
1336                 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1337
1338         def report_download_webpage(self, video_id):
1339                 """Report webpage download."""
1340                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1341
1342         def report_extraction(self, video_id):
1343                 """Report information extraction."""
1344                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1345
1346         def _real_initialize(self):
1347                 return
1348
1349         def _real_extract(self, url):
1350                 # Extract id and simplified title from URL
1351                 mobj = re.match(self._VALID_URL, url)
1352                 if mobj is None:
1353                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1354                         return
1355
1356                 # At this point we have a new video
1357                 self._downloader.increment_downloads()
1358                 video_id = mobj.group(1)
1359
1360                 simple_title = mobj.group(2).decode('utf-8')
1361                 video_extension = 'flv'
1362
1363                 # Retrieve video webpage to extract further information
1364                 request = urllib2.Request(url)
1365                 try:
1366                         self.report_download_webpage(video_id)
1367                         webpage = urllib2.urlopen(request).read()
1368                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1369                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1370                         return
1371
1372                 # Extract URL, uploader and title from webpage
1373                 self.report_extraction(video_id)
1374                 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1375                 if mobj is None:
1376                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1377                         return
1378                 mediaURL = urllib.unquote(mobj.group(1))
1379
1380                 # if needed add http://www.dailymotion.com/ if relative URL
1381
1382                 video_url = mediaURL
1383
1384                 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1385                 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1386                 if mobj is None:
1387                         self._downloader.trouble(u'ERROR: unable to extract title')
1388                         return
1389                 video_title = mobj.group(1).decode('utf-8')
1390                 video_title = sanitize_title(video_title)
1391
1392                 mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
1393                 if mobj is None:
1394                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1395                         return
1396                 video_uploader = mobj.group(1)
1397
1398                 try:
1399                         # Process video information
1400                         self._downloader.process_info({
1401                                 'id':           video_id.decode('utf-8'),
1402                                 'url':          video_url.decode('utf-8'),
1403                                 'uploader':     video_uploader.decode('utf-8'),
1404                                 'upload_date':  u'NA',
1405                                 'title':        video_title,
1406                                 'stitle':       simple_title,
1407                                 'ext':          video_extension.decode('utf-8'),
1408                                 'format':       u'NA',
1409                                 'player_url':   None,
1410                         })
1411                 except UnavailableVideoError:
1412                         self._downloader.trouble(u'\nERROR: unable to download video')
1413
1414 class GoogleIE(InfoExtractor):
1415         """Information extractor for video.google.com."""
1416
1417         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1418
1419         def __init__(self, downloader=None):
1420                 InfoExtractor.__init__(self, downloader)
1421
1422         @staticmethod
1423         def suitable(url):
1424                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1425
1426         def report_download_webpage(self, video_id):
1427                 """Report webpage download."""
1428                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1429
1430         def report_extraction(self, video_id):
1431                 """Report information extraction."""
1432                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1433
1434         def _real_initialize(self):
1435                 return
1436
1437         def _real_extract(self, url):
1438                 # Extract id from URL
1439                 mobj = re.match(self._VALID_URL, url)
1440                 if mobj is None:
1441                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1442                         return
1443
1444                 # At this point we have a new video
1445                 self._downloader.increment_downloads()
1446                 video_id = mobj.group(1)
1447
1448                 video_extension = 'mp4'
1449
1450                 # Retrieve video webpage to extract further information
1451                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1452                 try:
1453                         self.report_download_webpage(video_id)
1454                         webpage = urllib2.urlopen(request).read()
1455                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1456                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1457                         return
1458
1459                 # Extract URL, uploader, and title from webpage
1460                 self.report_extraction(video_id)
1461                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1462                 if mobj is None:
1463                         video_extension = 'flv'
1464                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1465                 if mobj is None:
1466                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1467                         return
1468                 mediaURL = urllib.unquote(mobj.group(1))
1469                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1470                 mediaURL = mediaURL.replace('\\x26', '\x26')
1471
1472                 video_url = mediaURL
1473
1474                 mobj = re.search(r'<title>(.*)</title>', webpage)
1475                 if mobj is None:
1476                         self._downloader.trouble(u'ERROR: unable to extract title')
1477                         return
1478                 video_title = mobj.group(1).decode('utf-8')
1479                 video_title = sanitize_title(video_title)
1480                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1481
1482                 # Extract video description
1483                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1484                 if mobj is None:
1485                         self._downloader.trouble(u'ERROR: unable to extract video description')
1486                         return
1487                 video_description = mobj.group(1).decode('utf-8')
1488                 if not video_description:
1489                         video_description = 'No description available.'
1490
1491                 # Extract video thumbnail
1492                 if self._downloader.params.get('forcethumbnail', False):
1493                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1494                         try:
1495                                 webpage = urllib2.urlopen(request).read()
1496                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1497                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1498                                 return
1499                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1500                         if mobj is None:
1501                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1502                                 return
1503                         video_thumbnail = mobj.group(1)
1504                 else:   # we need something to pass to process_info
1505                         video_thumbnail = ''
1506
1507
1508                 try:
1509                         # Process video information
1510                         self._downloader.process_info({
1511                                 'id':           video_id.decode('utf-8'),
1512                                 'url':          video_url.decode('utf-8'),
1513                                 'uploader':     u'NA',
1514                                 'upload_date':  u'NA',
1515                                 'title':        video_title,
1516                                 'stitle':       simple_title,
1517                                 'ext':          video_extension.decode('utf-8'),
1518                                 'format':       u'NA',
1519                                 'player_url':   None,
1520                         })
1521                 except UnavailableVideoError:
1522                         self._downloader.trouble(u'\nERROR: unable to download video')
1523
1524
1525 class PhotobucketIE(InfoExtractor):
1526         """Information extractor for photobucket.com."""
1527
1528         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1529
1530         def __init__(self, downloader=None):
1531                 InfoExtractor.__init__(self, downloader)
1532
1533         @staticmethod
1534         def suitable(url):
1535                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1536
1537         def report_download_webpage(self, video_id):
1538                 """Report webpage download."""
1539                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1540
1541         def report_extraction(self, video_id):
1542                 """Report information extraction."""
1543                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1544
1545         def _real_initialize(self):
1546                 return
1547
1548         def _real_extract(self, url):
1549                 # Extract id from URL
1550                 mobj = re.match(self._VALID_URL, url)
1551                 if mobj is None:
1552                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1553                         return
1554
1555                 # At this point we have a new video
1556                 self._downloader.increment_downloads()
1557                 video_id = mobj.group(1)
1558
1559                 video_extension = 'flv'
1560
1561                 # Retrieve video webpage to extract further information
1562                 request = urllib2.Request(url)
1563                 try:
1564                         self.report_download_webpage(video_id)
1565                         webpage = urllib2.urlopen(request).read()
1566                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1567                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1568                         return
1569
1570                 # Extract URL, uploader, and title from webpage
1571                 self.report_extraction(video_id)
1572                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1573                 if mobj is None:
1574                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1575                         return
1576                 mediaURL = urllib.unquote(mobj.group(1))
1577
1578                 video_url = mediaURL
1579
1580                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1581                 if mobj is None:
1582                         self._downloader.trouble(u'ERROR: unable to extract title')
1583                         return
1584                 video_title = mobj.group(1).decode('utf-8')
1585                 video_title = sanitize_title(video_title)
1586                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1587
1588                 video_uploader = mobj.group(2).decode('utf-8')
1589
1590                 try:
1591                         # Process video information
1592                         self._downloader.process_info({
1593                                 'id':           video_id.decode('utf-8'),
1594                                 'url':          video_url.decode('utf-8'),
1595                                 'uploader':     video_uploader,
1596                                 'upload_date':  u'NA',
1597                                 'title':        video_title,
1598                                 'stitle':       simple_title,
1599                                 'ext':          video_extension.decode('utf-8'),
1600                                 'format':       u'NA',
1601                                 'player_url':   None,
1602                         })
1603                 except UnavailableVideoError:
1604                         self._downloader.trouble(u'\nERROR: unable to download video')
1605
1606
1607 class YahooIE(InfoExtractor):
1608         """Information extractor for video.yahoo.com."""
1609
1610         # _VALID_URL matches all Yahoo! Video URLs
1611         # _VPAGE_URL matches only the extractable '/watch/' URLs
1612         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1613         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1614
1615         def __init__(self, downloader=None):
1616                 InfoExtractor.__init__(self, downloader)
1617
1618         @staticmethod
1619         def suitable(url):
1620                 return (re.match(YahooIE._VALID_URL, url) is not None)
1621
1622         def report_download_webpage(self, video_id):
1623                 """Report webpage download."""
1624                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1625
1626         def report_extraction(self, video_id):
1627                 """Report information extraction."""
1628                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1629
1630         def _real_initialize(self):
1631                 return
1632
1633         def _real_extract(self, url, new_video=True):
1634                 # Extract ID from URL
1635                 mobj = re.match(self._VALID_URL, url)
1636                 if mobj is None:
1637                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1638                         return
1639
1640                 # At this point we have a new video
1641                 self._downloader.increment_downloads()
1642                 video_id = mobj.group(2)
1643                 video_extension = 'flv'
1644
1645                 # Rewrite valid but non-extractable URLs as
1646                 # extractable English language /watch/ URLs
1647                 if re.match(self._VPAGE_URL, url) is None:
1648                         request = urllib2.Request(url)
1649                         try:
1650                                 webpage = urllib2.urlopen(request).read()
1651                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1652                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1653                                 return
1654
1655                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1656                         if mobj is None:
1657                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1658                                 return
1659                         yahoo_id = mobj.group(1)
1660
1661                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1662                         if mobj is None:
1663                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1664                                 return
1665                         yahoo_vid = mobj.group(1)
1666
1667                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1668                         return self._real_extract(url, new_video=False)
1669
1670                 # Retrieve video webpage to extract further information
1671                 request = urllib2.Request(url)
1672                 try:
1673                         self.report_download_webpage(video_id)
1674                         webpage = urllib2.urlopen(request).read()
1675                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1676                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1677                         return
1678
1679                 # Extract uploader and title from webpage
1680                 self.report_extraction(video_id)
1681                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1682                 if mobj is None:
1683                         self._downloader.trouble(u'ERROR: unable to extract video title')
1684                         return
1685                 video_title = mobj.group(1).decode('utf-8')
1686                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1687
1688                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1689                 if mobj is None:
1690                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1691                         return
1692                 video_uploader = mobj.group(1).decode('utf-8')
1693
1694                 # Extract video thumbnail
1695                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1696                 if mobj is None:
1697                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1698                         return
1699                 video_thumbnail = mobj.group(1).decode('utf-8')
1700
1701                 # Extract video description
1702                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1703                 if mobj is None:
1704                         self._downloader.trouble(u'ERROR: unable to extract video description')
1705                         return
1706                 video_description = mobj.group(1).decode('utf-8')
1707                 if not video_description: video_description = 'No description available.'
1708
1709                 # Extract video height and width
1710                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1711                 if mobj is None:
1712                         self._downloader.trouble(u'ERROR: unable to extract video height')
1713                         return
1714                 yv_video_height = mobj.group(1)
1715
1716                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1717                 if mobj is None:
1718                         self._downloader.trouble(u'ERROR: unable to extract video width')
1719                         return
1720                 yv_video_width = mobj.group(1)
1721
1722                 # Retrieve video playlist to extract media URL
1723                 # I'm not completely sure what all these options are, but we
1724                 # seem to need most of them, otherwise the server sends a 401.
1725                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1726                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1727                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1728                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1729                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1730                 try:
1731                         self.report_download_webpage(video_id)
1732                         webpage = urllib2.urlopen(request).read()
1733                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1734                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1735                         return
1736
1737                 # Extract media URL from playlist XML
1738                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1739                 if mobj is None:
1740                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1741                         return
1742                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1743                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1744
1745                 try:
1746                         # Process video information
1747                         self._downloader.process_info({
1748                                 'id':           video_id.decode('utf-8'),
1749                                 'url':          video_url,
1750                                 'uploader':     video_uploader,
1751                                 'upload_date':  u'NA',
1752                                 'title':        video_title,
1753                                 'stitle':       simple_title,
1754                                 'ext':          video_extension.decode('utf-8'),
1755                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1756                                 'description':  video_description,
1757                                 'thumbnail':    video_thumbnail,
1758                                 'description':  video_description,
1759                                 'player_url':   None,
1760                         })
1761                 except UnavailableVideoError:
1762                         self._downloader.trouble(u'\nERROR: unable to download video')
1763
1764
1765 class GenericIE(InfoExtractor):
1766         """Generic last-resort information extractor."""
1767
	def __init__(self, downloader=None):
		"""Create the extractor and register the optional downloader."""
		InfoExtractor.__init__(self, downloader)
1770
	@staticmethod
	def suitable(url):
		"""Last-resort extractor: claims every URL."""
		return True
1774
1775         def report_download_webpage(self, video_id):
1776                 """Report webpage download."""
1777                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1778                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1779
1780         def report_extraction(self, video_id):
1781                 """Report information extraction."""
1782                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1783
	def _real_initialize(self):
		"""No initialization is required for the generic extractor."""
		return
1786
1787         def _real_extract(self, url):
1788                 # At this point we have a new video
1789                 self._downloader.increment_downloads()
1790
1791                 video_id = url.split('/')[-1]
1792                 request = urllib2.Request(url)
1793                 try:
1794                         self.report_download_webpage(video_id)
1795                         webpage = urllib2.urlopen(request).read()
1796                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1797                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1798                         return
1799                 except ValueError, err:
1800                         # since this is the last-resort InfoExtractor, if
1801                         # this error is thrown, it'll be thrown here
1802                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1803                         return
1804
1805                 self.report_extraction(video_id)
1806                 # Start with something easy: JW Player in SWFObject
1807                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1808                 if mobj is None:
1809                         # Broaden the search a little bit
1810                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1811                 if mobj is None:
1812                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1813                         return
1814
1815                 # It's possible that one of the regexes
1816                 # matched, but returned an empty group:
1817                 if mobj.group(1) is None:
1818                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1819                         return
1820
1821                 video_url = urllib.unquote(mobj.group(1))
1822                 video_id  = os.path.basename(video_url)
1823
1824                 # here's a fun little line of code for you:
1825                 video_extension = os.path.splitext(video_id)[1][1:]
1826                 video_id        = os.path.splitext(video_id)[0]
1827
1828                 # it's tempting to parse this further, but you would
1829                 # have to take into account all the variations like
1830                 #   Video Title - Site Name
1831                 #   Site Name | Video Title
1832                 #   Video Title - Tagline | Site Name
1833                 # and so on and so forth; it's just not practical
1834                 mobj = re.search(r'<title>(.*)</title>', webpage)
1835                 if mobj is None:
1836                         self._downloader.trouble(u'ERROR: unable to extract title')
1837                         return
1838                 video_title = mobj.group(1).decode('utf-8')
1839                 video_title = sanitize_title(video_title)
1840                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1841
1842                 # video uploader is domain name
1843                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1844                 if mobj is None:
1845                         self._downloader.trouble(u'ERROR: unable to extract title')
1846                         return
1847                 video_uploader = mobj.group(1).decode('utf-8')
1848
1849                 try:
1850                         # Process video information
1851                         self._downloader.process_info({
1852                                 'id':           video_id.decode('utf-8'),
1853                                 'url':          video_url.decode('utf-8'),
1854                                 'uploader':     video_uploader,
1855                                 'upload_date':  u'NA',
1856                                 'title':        video_title,
1857                                 'stitle':       simple_title,
1858                                 'ext':          video_extension.decode('utf-8'),
1859                                 'format':       u'NA',
1860                                 'player_url':   None,
1861                         })
1862                 except UnavailableVideoError, err:
1863                         self._downloader.trouble(u'\nERROR: unable to download video')
1864
1865
1866 class YoutubeSearchIE(InfoExtractor):
1867         """Information Extractor for YouTube search queries."""
1868         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1869         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1870         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1871         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1872         _youtube_ie = None
1873         _max_youtube_results = 1000
1874
1875         def __init__(self, youtube_ie, downloader=None):
1876                 InfoExtractor.__init__(self, downloader)
1877                 self._youtube_ie = youtube_ie
1878
1879         @staticmethod
1880         def suitable(url):
1881                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1882
1883         def report_download_page(self, query, pagenum):
1884                 """Report attempt to download playlist page with given number."""
1885                 query = query.decode(preferredencoding())
1886                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1887
1888         def _real_initialize(self):
1889                 self._youtube_ie.initialize()
1890
1891         def _real_extract(self, query):
1892                 mobj = re.match(self._VALID_QUERY, query)
1893                 if mobj is None:
1894                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1895                         return
1896
1897                 prefix, query = query.split(':')
1898                 prefix = prefix[8:]
1899                 query  = query.encode('utf-8')
1900                 if prefix == '':
1901                         self._download_n_results(query, 1)
1902                         return
1903                 elif prefix == 'all':
1904                         self._download_n_results(query, self._max_youtube_results)
1905                         return
1906                 else:
1907                         try:
1908                                 n = long(prefix)
1909                                 if n <= 0:
1910                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1911                                         return
1912                                 elif n > self._max_youtube_results:
1913                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
1914                                         n = self._max_youtube_results
1915                                 self._download_n_results(query, n)
1916                                 return
1917                         except ValueError: # parsing prefix as integer fails
1918                                 self._download_n_results(query, 1)
1919                                 return
1920
1921         def _download_n_results(self, query, n):
1922                 """Downloads a specified number of results for a query"""
1923
1924                 video_ids = []
1925                 already_seen = set()
1926                 pagenum = 1
1927
1928                 while True:
1929                         self.report_download_page(query, pagenum)
1930                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1931                         request = urllib2.Request(result_url)
1932                         try:
1933                                 page = urllib2.urlopen(request).read()
1934                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1935                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1936                                 return
1937
1938                         # Extract video identifiers
1939                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1940                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1941                                 if video_id not in already_seen:
1942                                         video_ids.append(video_id)
1943                                         already_seen.add(video_id)
1944                                         if len(video_ids) == n:
1945                                                 # Specified n videos reached
1946                                                 for id in video_ids:
1947                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1948                                                 return
1949
1950                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1951                                 for id in video_ids:
1952                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1953                                 return
1954
1955                         pagenum = pagenum + 1
1956
1957 class GoogleSearchIE(InfoExtractor):
1958         """Information Extractor for Google Video search queries."""
1959         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1960         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1961         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1962         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1963         _google_ie = None
1964         _max_google_results = 1000
1965
1966         def __init__(self, google_ie, downloader=None):
1967                 InfoExtractor.__init__(self, downloader)
1968                 self._google_ie = google_ie
1969
1970         @staticmethod
1971         def suitable(url):
1972                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1973
1974         def report_download_page(self, query, pagenum):
1975                 """Report attempt to download playlist page with given number."""
1976                 query = query.decode(preferredencoding())
1977                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1978
1979         def _real_initialize(self):
1980                 self._google_ie.initialize()
1981
1982         def _real_extract(self, query):
1983                 mobj = re.match(self._VALID_QUERY, query)
1984                 if mobj is None:
1985                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1986                         return
1987
1988                 prefix, query = query.split(':')
1989                 prefix = prefix[8:]
1990                 query  = query.encode('utf-8')
1991                 if prefix == '':
1992                         self._download_n_results(query, 1)
1993                         return
1994                 elif prefix == 'all':
1995                         self._download_n_results(query, self._max_google_results)
1996                         return
1997                 else:
1998                         try:
1999                                 n = long(prefix)
2000                                 if n <= 0:
2001                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2002                                         return
2003                                 elif n > self._max_google_results:
2004                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
2005                                         n = self._max_google_results
2006                                 self._download_n_results(query, n)
2007                                 return
2008                         except ValueError: # parsing prefix as integer fails
2009                                 self._download_n_results(query, 1)
2010                                 return
2011
2012         def _download_n_results(self, query, n):
2013                 """Downloads a specified number of results for a query"""
2014
2015                 video_ids = []
2016                 already_seen = set()
2017                 pagenum = 1
2018
2019                 while True:
2020                         self.report_download_page(query, pagenum)
2021                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2022                         request = urllib2.Request(result_url)
2023                         try:
2024                                 page = urllib2.urlopen(request).read()
2025                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2026                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2027                                 return
2028
2029                         # Extract video identifiers
2030                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2031                                 video_id = mobj.group(1)
2032                                 if video_id not in already_seen:
2033                                         video_ids.append(video_id)
2034                                         already_seen.add(video_id)
2035                                         if len(video_ids) == n:
2036                                                 # Specified n videos reached
2037                                                 for id in video_ids:
2038                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2039                                                 return
2040
2041                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2042                                 for id in video_ids:
2043                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2044                                 return
2045
2046                         pagenum = pagenum + 1
2047
2048 class YahooSearchIE(InfoExtractor):
2049         """Information Extractor for Yahoo! Video search queries."""
2050         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
2051         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2052         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2053         _MORE_PAGES_INDICATOR = r'\s*Next'
2054         _yahoo_ie = None
2055         _max_yahoo_results = 1000
2056
2057         def __init__(self, yahoo_ie, downloader=None):
2058                 InfoExtractor.__init__(self, downloader)
2059                 self._yahoo_ie = yahoo_ie
2060
2061         @staticmethod
2062         def suitable(url):
2063                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2064
2065         def report_download_page(self, query, pagenum):
2066                 """Report attempt to download playlist page with given number."""
2067                 query = query.decode(preferredencoding())
2068                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2069
2070         def _real_initialize(self):
2071                 self._yahoo_ie.initialize()
2072
2073         def _real_extract(self, query):
2074                 mobj = re.match(self._VALID_QUERY, query)
2075                 if mobj is None:
2076                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2077                         return
2078
2079                 prefix, query = query.split(':')
2080                 prefix = prefix[8:]
2081                 query  = query.encode('utf-8')
2082                 if prefix == '':
2083                         self._download_n_results(query, 1)
2084                         return
2085                 elif prefix == 'all':
2086                         self._download_n_results(query, self._max_yahoo_results)
2087                         return
2088                 else:
2089                         try:
2090                                 n = long(prefix)
2091                                 if n <= 0:
2092                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2093                                         return
2094                                 elif n > self._max_yahoo_results:
2095                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
2096                                         n = self._max_yahoo_results
2097                                 self._download_n_results(query, n)
2098                                 return
2099                         except ValueError: # parsing prefix as integer fails
2100                                 self._download_n_results(query, 1)
2101                                 return
2102
2103         def _download_n_results(self, query, n):
2104                 """Downloads a specified number of results for a query"""
2105
2106                 video_ids = []
2107                 already_seen = set()
2108                 pagenum = 1
2109
2110                 while True:
2111                         self.report_download_page(query, pagenum)
2112                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2113                         request = urllib2.Request(result_url)
2114                         try:
2115                                 page = urllib2.urlopen(request).read()
2116                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2117                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2118                                 return
2119
2120                         # Extract video identifiers
2121                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2122                                 video_id = mobj.group(1)
2123                                 if video_id not in already_seen:
2124                                         video_ids.append(video_id)
2125                                         already_seen.add(video_id)
2126                                         if len(video_ids) == n:
2127                                                 # Specified n videos reached
2128                                                 for id in video_ids:
2129                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2130                                                 return
2131
2132                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2133                                 for id in video_ids:
2134                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2135                                 return
2136
2137                         pagenum = pagenum + 1
2138
2139 class YoutubePlaylistIE(InfoExtractor):
2140         """Information Extractor for YouTube playlists."""
2141
2142         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist)\?.*?(p|a)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2143         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2144         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2145         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2146         _youtube_ie = None
2147
2148         def __init__(self, youtube_ie, downloader=None):
2149                 InfoExtractor.__init__(self, downloader)
2150                 self._youtube_ie = youtube_ie
2151
2152         @staticmethod
2153         def suitable(url):
2154                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2155
2156         def report_download_page(self, playlist_id, pagenum):
2157                 """Report attempt to download playlist page with given number."""
2158                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2159
2160         def _real_initialize(self):
2161                 self._youtube_ie.initialize()
2162
2163         def _real_extract(self, url):
2164                 # Extract playlist id
2165                 mobj = re.match(self._VALID_URL, url)
2166                 if mobj is None:
2167                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2168                         return
2169
2170                 # Single video case
2171                 if mobj.group(3) is not None:
2172                         self._youtube_ie.extract(mobj.group(3))
2173                         return
2174
2175                 # Download playlist pages
2176                 # prefix is 'p' as default for playlists but there are other types that need extra care
2177                 playlist_prefix = mobj.group(1)
2178                 if playlist_prefix == 'a':
2179                         playlist_access = 'artist'
2180                 else:
2181                         playlist_prefix = 'p'
2182                         playlist_access = 'view_play_list'
2183                 playlist_id = mobj.group(2)
2184                 video_ids = []
2185                 pagenum = 1
2186
2187                 while True:
2188                         self.report_download_page(playlist_id, pagenum)
2189                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2190                         try:
2191                                 page = urllib2.urlopen(request).read()
2192                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2193                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2194                                 return
2195
2196                         # Extract video identifiers
2197                         ids_in_page = []
2198                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2199                                 if mobj.group(1) not in ids_in_page:
2200                                         ids_in_page.append(mobj.group(1))
2201                         video_ids.extend(ids_in_page)
2202
2203                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2204                                 break
2205                         pagenum = pagenum + 1
2206
2207                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2208                 playlistend = self._downloader.params.get('playlistend', -1)
2209                 video_ids = video_ids[playliststart:playlistend]
2210
2211                 for id in video_ids:
2212                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2213                 return
2214
2215 class YoutubeUserIE(InfoExtractor):
2216         """Information Extractor for YouTube users."""
2217
2218         _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2219         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2220         _GDATA_PAGE_SIZE = 50
2221         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2222         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2223         _youtube_ie = None
2224
2225         def __init__(self, youtube_ie, downloader=None):
2226                 InfoExtractor.__init__(self, downloader)
2227                 self._youtube_ie = youtube_ie
2228
2229         @staticmethod
2230         def suitable(url):
2231                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2232
2233         def report_download_page(self, username, start_index):
2234                 """Report attempt to download user page."""
2235                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2236                                            (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2237
2238         def _real_initialize(self):
2239                 self._youtube_ie.initialize()
2240
2241         def _real_extract(self, url):
2242                 # Extract username
2243                 mobj = re.match(self._VALID_URL, url)
2244                 if mobj is None:
2245                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2246                         return
2247
2248                 username = mobj.group(1)
2249
2250                 # Download video ids using YouTube Data API. Result size per
2251                 # query is limited (currently to 50 videos) so we need to query
2252                 # page by page until there are no video ids - it means we got
2253                 # all of them.
2254
2255                 video_ids = []
2256                 pagenum = 0
2257
2258                 while True:
2259                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2260                         self.report_download_page(username, start_index)
2261
2262                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2263
2264                         try:
2265                                 page = urllib2.urlopen(request).read()
2266                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2267                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2268                                 return
2269
2270                         # Extract video identifiers
2271                         ids_in_page = []
2272
2273                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2274                                 if mobj.group(1) not in ids_in_page:
2275                                         ids_in_page.append(mobj.group(1))
2276
2277                         video_ids.extend(ids_in_page)
2278
2279                         # A little optimization - if current page is not
2280                         # "full", ie. does not contain PAGE_SIZE video ids then
2281                         # we can assume that this page is the last one - there
2282                         # are no more ids on further pages - no need to query
2283                         # again.
2284
2285                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2286                                 break
2287
2288                         pagenum += 1
2289
2290                 all_ids_count = len(video_ids)
2291                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2292                 playlistend = self._downloader.params.get('playlistend', -1)
2293
2294                 if playlistend == -1:
2295                         video_ids = video_ids[playliststart:]
2296                 else:
2297                         video_ids = video_ids[playliststart:playlistend]
2298                         
2299                 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2300                                            (username, all_ids_count, len(video_ids)))
2301
2302                 for video_id in video_ids:
2303                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2304
2305
2306 class DepositFilesIE(InfoExtractor):
2307         """Information extractor for depositfiles.com"""
2308
2309         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2310
2311         def __init__(self, downloader=None):
2312                 InfoExtractor.__init__(self, downloader)
2313
2314         @staticmethod
2315         def suitable(url):
2316                 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2317
2318         def report_download_webpage(self, file_id):
2319                 """Report webpage download."""
2320                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2321
2322         def report_extraction(self, file_id):
2323                 """Report information extraction."""
2324                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2325
2326         def _real_initialize(self):
2327                 return
2328
2329         def _real_extract(self, url):
2330                 # At this point we have a new file
2331                 self._downloader.increment_downloads()
2332
2333                 file_id = url.split('/')[-1]
2334                 # Rebuild url in english locale
2335                 url = 'http://depositfiles.com/en/files/' + file_id
2336
2337                 # Retrieve file webpage with 'Free download' button pressed
2338                 free_download_indication = { 'gateway_result' : '1' }
2339                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2340                 try:
2341                         self.report_download_webpage(file_id)
2342                         webpage = urllib2.urlopen(request).read()
2343                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2344                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2345                         return
2346
2347                 # Search for the real file URL
2348                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2349                 if (mobj is None) or (mobj.group(1) is None):
2350                         # Try to figure out reason of the error.
2351                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2352                         if (mobj is not None) and (mobj.group(1) is not None):
2353                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2354                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2355                         else:
2356                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2357                         return
2358
2359                 file_url = mobj.group(1)
2360                 file_extension = os.path.splitext(file_url)[1][1:]
2361
2362                 # Search for file title
2363                 mobj = re.search(r'<b title="(.*?)">', webpage)
2364                 if mobj is None:
2365                         self._downloader.trouble(u'ERROR: unable to extract title')
2366                         return
2367                 file_title = mobj.group(1).decode('utf-8')
2368
2369                 try:
2370                         # Process file information
2371                         self._downloader.process_info({
2372                                 'id':           file_id.decode('utf-8'),
2373                                 'url':          file_url.decode('utf-8'),
2374                                 'uploader':     u'NA',
2375                                 'upload_date':  u'NA',
2376                                 'title':        file_title,
2377                                 'stitle':       file_title,
2378                                 'ext':          file_extension.decode('utf-8'),
2379                                 'format':       u'NA',
2380                                 'player_url':   None,
2381                         })
2382                 except UnavailableVideoError, err:
2383                         self._downloader.trouble(u'ERROR: unable to download file')
2384
class FacebookIE(InfoExtractor):
	"""Information Extractor for Facebook"""

	# Matches facebook.com/video/video.php?...v=<numeric id>; the id is
	# captured in the named group 'ID'.
	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
	# Mobile login endpoint; 'next' redirects back to the mobile home page.
	_LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
	# Machine name used to look up credentials in ~/.netrc.
	_NETRC_MACHINE = 'facebook'
	# Format labels probed in _parse_page, best quality first.
	_available_formats = ['highqual', 'lowqual']
	# Both known formats are served as mp4.
	_video_extensions = {
		'highqual': 'mp4',
		'lowqual': 'mp4',
	}

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	@staticmethod
	def suitable(url):
		return (re.match(FacebookIE._VALID_URL, url) is not None)

	def _reporter(self, message):
		"""Add header and report message."""
		self._downloader.to_screen(u'[facebook] %s' % message)

	def report_login(self):
		"""Report attempt to log in."""
		self._reporter(u'Logging in')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._reporter(u'%s: Downloading video webpage' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._reporter(u'%s: Extracting video information' % video_id)

	def _parse_page(self, video_webpage):
		"""Extract video information from page"""
		# General data: each entry maps an info-dict key to the regex that
		# scrapes its value out of the page HTML/Javascript. Keys with no
		# match are simply absent from the result.
		data = {'title': r'class="video_title datawrap">(.*?)</',
			'description': r'<div class="datawrap">(.*?)</div>',
			'owner': r'\("video_owner_name", "(.*?)"\)',
			'upload_date': r'data-date="(.*?)"',
			'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
			}
		video_info = {}
		for piece in data.keys():
			mobj = re.search(data[piece], video_webpage)
			if mobj is not None:
				# Values are backslash-escaped Unicode inside the page's
				# Javascript, hence the unicode_escape decode before unquoting.
				video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

		# Video urls: one entry per format label found in the page.
		video_urls = {}
		for fmt in self._available_formats:
			mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
			if mobj is not None:
				# URL is in a Javascript segment inside an escaped Unicode format within
				# the generally utf-8 page
				video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
		video_info['video_urls'] = video_urls

		return video_info
2446
2447         def _real_initialize(self):
2448                 if self._downloader is None:
2449                         return
2450
2451                 useremail = None
2452                 password = None
2453                 downloader_params = self._downloader.params
2454
2455                 # Attempt to use provided username and password or .netrc data
2456                 if downloader_params.get('username', None) is not None:
2457                         useremail = downloader_params['username']
2458                         password = downloader_params['password']
2459                 elif downloader_params.get('usenetrc', False):
2460                         try:
2461                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2462                                 if info is not None:
2463                                         useremail = info[0]
2464                                         password = info[2]
2465                                 else:
2466                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2467                         except (IOError, netrc.NetrcParseError), err:
2468                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2469                                 return
2470
2471                 if useremail is None:
2472                         return
2473
2474                 # Log in
2475                 login_form = {
2476                         'email': useremail,
2477                         'pass': password,
2478                         'login': 'Log+In'
2479                         }
2480                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2481                 try:
2482                         self.report_login()
2483                         login_results = urllib2.urlopen(request).read()
2484                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2485                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2486                                 return
2487                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2488                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2489                         return
2490
2491         def _real_extract(self, url):
2492                 mobj = re.match(self._VALID_URL, url)
2493                 if mobj is None:
2494                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2495                         return
2496                 video_id = mobj.group('ID')
2497
2498                 # Get video webpage
2499                 self.report_video_webpage_download(video_id)
2500                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2501                 try:
2502                         page = urllib2.urlopen(request)
2503                         video_webpage = page.read()
2504                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2505                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2506                         return
2507
2508                 # Start extracting information
2509                 self.report_information_extraction(video_id)
2510
2511                 # Extract information
2512                 video_info = self._parse_page(video_webpage)
2513
2514                 # uploader
2515                 if 'owner' not in video_info:
2516                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2517                         return
2518                 video_uploader = video_info['owner']
2519
2520                 # title
2521                 if 'title' not in video_info:
2522                         self._downloader.trouble(u'ERROR: unable to extract video title')
2523                         return
2524                 video_title = video_info['title']
2525                 video_title = video_title.decode('utf-8')
2526                 video_title = sanitize_title(video_title)
2527
2528                 # simplified title
2529                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2530                 simple_title = simple_title.strip(ur'_')
2531
2532                 # thumbnail image
2533                 if 'thumbnail' not in video_info:
2534                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2535                         video_thumbnail = ''
2536                 else:
2537                         video_thumbnail = video_info['thumbnail']
2538
2539                 # upload date
2540                 upload_date = u'NA'
2541                 if 'upload_date' in video_info:
2542                         upload_time = video_info['upload_date']
2543                         timetuple = email.utils.parsedate_tz(upload_time)
2544                         if timetuple is not None:
2545                                 try:
2546                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2547                                 except:
2548                                         pass
2549
2550                 # description
2551                 video_description = video_info.get('description', 'No description available.')
2552
2553                 url_map = video_info['video_urls']
2554                 if len(url_map.keys()) > 0:
2555                         # Decide which formats to download
2556                         req_format = self._downloader.params.get('format', None)
2557                         format_limit = self._downloader.params.get('format_limit', None)
2558
2559                         if format_limit is not None and format_limit in self._available_formats:
2560                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2561                         else:
2562                                 format_list = self._available_formats
2563                         existing_formats = [x for x in format_list if x in url_map]
2564                         if len(existing_formats) == 0:
2565                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2566                                 return
2567                         if req_format is None:
2568                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2569                         elif req_format == '-1':
2570                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2571                         else:
2572                                 # Specific format
2573                                 if req_format not in url_map:
2574                                         self._downloader.trouble(u'ERROR: requested format not available')
2575                                         return
2576                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2577
2578                 for format_param, video_real_url in video_url_list:
2579
2580                         # At this point we have a new video
2581                         self._downloader.increment_downloads()
2582
2583                         # Extension
2584                         video_extension = self._video_extensions.get(format_param, 'mp4')
2585
2586                         # Find the video URL in fmt_url_map or conn paramters
2587                         try:
2588                                 # Process video information
2589                                 self._downloader.process_info({
2590                                         'id':           video_id.decode('utf-8'),
2591                                         'url':          video_real_url.decode('utf-8'),
2592                                         'uploader':     video_uploader.decode('utf-8'),
2593                                         'upload_date':  upload_date,
2594                                         'title':        video_title,
2595                                         'stitle':       simple_title,
2596                                         'ext':          video_extension.decode('utf-8'),
2597                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2598                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2599                                         'description':  video_description.decode('utf-8'),
2600                                         'player_url':   None,
2601                                 })
2602                         except UnavailableVideoError, err:
2603                                 self._downloader.trouble(u'\nERROR: unable to download video')
2604
2605 class BlipTVIE(InfoExtractor):
2606         """Information extractor for blip.tv"""
2607
2608         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip.tv(/.+)$'
2609         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2610
2611         @staticmethod
2612         def suitable(url):
2613                 return (re.match(BlipTVIE._VALID_URL, url) is not None)
2614
2615         def report_extraction(self, file_id):
2616                 """Report information extraction."""
2617                 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2618
2619         def _simplify_title(self, title):
2620                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2621                 res = res.strip(ur'_')
2622                 return res
2623
2624         def _real_extract(self, url):
2625                 mobj = re.match(self._VALID_URL, url)
2626                 if mobj is None:
2627                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2628                         return
2629
2630                 json_url = url + ('&' if '?' in url else '?') + 'skin=json&version=2&no_wrap=1'
2631                 request = urllib2.Request(json_url)
2632                 self.report_extraction(mobj.group(1))
2633                 try:
2634                         json_code = urllib2.urlopen(request).read()
2635                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2636                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2637                         return
2638                 try:
2639                         json_data = json.loads(json_code)
2640                         data = json_data['Post'] if 'Post' in json_data else json_data
2641
2642                         upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2643                         video_url = data['media']['url']
2644                         umobj = re.match(self._URL_EXT, video_url)
2645                         if umobj is None:
2646                                 raise ValueError('Can not determine filename extension')
2647                         ext = umobj.group(1)
2648
2649                         info = {
2650                                 'id': data['item_id'],
2651                                 'url': video_url,
2652                                 'uploader': data['display_name'],
2653                                 'upload_date': upload_date,
2654                                 'title': data['title'],
2655                                 'stitle': self._simplify_title(data['title']),
2656                                 'ext': ext,
2657                                 'format': data['media']['mimeType'],
2658                                 'thumbnail': data['thumbnailUrl'],
2659                                 'description': data['description'],
2660                                 'player_url': data['embedUrl']
2661                         }
2662                 except (ValueError,KeyError), err:
2663                         self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2664                         return
2665
2666                 try:
2667                         self._downloader.process_info(info)
2668                 except UnavailableVideoError, err:
2669                         self._downloader.trouble(u'\nERROR: unable to download video')
2670
2671
class PostProcessor(object):
	"""Base class for post-processing steps.

	Instances are registered with a downloader through its
	add_post_processor() method. After every successful download the
	downloader walks its list of PostProcessors, feeding the first one
	the download's information dictionary and each subsequent one the
	value returned by its predecessor. Processing stops as soon as a
	processor returns None, or when the list is exhausted.

	Like InfoExtractor objects, PostProcessors take part in a "mutual
	registration" scheme with their downloader.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach the given downloader to this post processor."""
		self._downloader = downloader

	def run(self, information):
		"""Execute this post-processing step.

		"information" is an InfoExtractor-style dictionary extended
		with a "filepath" key naming the downloaded file.

		Returning None halts the post-processing chain; returning a
		dictionary (possibly this one, with fields changed) passes it
		on to the next processor in the chain. Implementations may
		also raise PostProcessingError, which the calling downloader
		takes into account.
		"""
		return information # by default, do nothing
2717
class FFmpegExtractAudioPP(PostProcessor):
	"""Post processor that extracts the audio track of a downloaded
	video into an audio-only file, using ffmpeg and ffprobe."""

	def __init__(self, downloader=None, preferredcodec=None):
		# preferredcodec: 'best', 'aac' or 'mp3'. 'best' keeps the
		# source codec losslessly when it is already aac or mp3.
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec

	@staticmethod
	def get_audio_codec(path):
		"""Return the audio codec name of the file at path, or None if
		it cannot be determined (ffprobe missing or failing)."""
		try:
			cmd = ['ffprobe', '-show_streams', '--', path]
			# BUGFIX: the original file() handle for devnull was never
			# closed; a context manager releases it deterministically.
			with open(os.path.devnull, 'w') as nullfile:
				handle = subprocess.Popen(cmd, stderr=nullfile, stdout=subprocess.PIPE)
				output = handle.communicate()[0]
				if handle.wait() != 0:
					return None
		except (IOError, OSError):
			return None
		audio_codec = None
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				# In ffprobe output a stream's codec_name precedes its codec_type
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Transcode path to out_path with the given audio codec and
		extra ffmpeg options; return True on success."""
		try:
			cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
			# BUGFIX: close the devnull handle (the original leaked it)
			with open(os.path.devnull, 'w') as nullfile:
				ret = subprocess.call(cmd, stdout=nullfile, stderr=subprocess.STDOUT)
			return (ret == 0)
		except (IOError, OSError):
			return False

	def run(self, information):
		"""Convert information['filepath'] to the preferred audio
		format, remove the original file and update 'filepath'.

		Returns None (halting the post-processing chain) on any
		failure; warnings are written to the downloader's stderr.
		"""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
			if filecodec == 'aac' or filecodec == 'mp3':
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					more_opts = ['-f', 'adts']
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = ['-ab', '128k']
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = ['-ab', '128k']
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']

		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		status = self.run_ffmpeg(path, new_path, acodec, more_opts)

		if not status:
			self._downloader.to_stderr(u'WARNING: error running ffmpeg')
			return None

		try:
			os.remove(path)
		except (IOError, OSError):
			self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
			return None

		information['filepath'] = new_path
		return information
2799
2800 ### MAIN PROGRAM ###
2801 if __name__ == '__main__':
2802         try:
2803                 # Modules needed only when running the main program
2804                 import getpass
2805                 import optparse
2806
2807                 # Function to update the program file with the latest version from the repository.
2808                 def update_self(downloader, filename):
2809                         # Note: downloader only used for options
2810                         if not os.access(filename, os.W_OK):
2811                                 sys.exit('ERROR: no write permissions on %s' % filename)
2812
2813                         downloader.to_screen('Updating to latest stable version...')
2814                         try:
2815                                 latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2816                                 latest_version = urllib.urlopen(latest_url).read().strip()
2817                                 prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2818                                 newcontent = urllib.urlopen(prog_url).read()
2819                         except (IOError, OSError), err:
2820                                 sys.exit('ERROR: unable to download latest version')
2821                         try:
2822                                 stream = open(filename, 'w')
2823                                 stream.write(newcontent)
2824                                 stream.close()
2825                         except (IOError, OSError), err:
2826                                 sys.exit('ERROR: unable to overwrite current version')
2827                         downloader.to_screen('Updated to version %s' % latest_version)
2828
2829                 # Parse command line
2830                 parser = optparse.OptionParser(
2831                         usage='Usage: %prog [options] url...',
2832                         version='2011.03.29',
2833                         conflict_handler='resolve',
2834                 )
2835
2836                 parser.add_option('-h', '--help',
2837                                 action='help', help='print this help text and exit')
2838                 parser.add_option('-v', '--version',
2839                                 action='version', help='print program version and exit')
2840                 parser.add_option('-U', '--update',
2841                                 action='store_true', dest='update_self', help='update this program to latest stable version')
2842                 parser.add_option('-i', '--ignore-errors',
2843                                 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2844                 parser.add_option('-r', '--rate-limit',
2845                                 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2846                 parser.add_option('-R', '--retries',
2847                                 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2848                 parser.add_option('--playlist-start',
2849                                 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2850                 parser.add_option('--playlist-end',
2851                                 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
2852                 parser.add_option('--dump-user-agent',
2853                                 action='store_true', dest='dump_user_agent',
2854                                 help='display the current browser identification', default=False)
2855
2856                 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2857                 authentication.add_option('-u', '--username',
2858                                 dest='username', metavar='USERNAME', help='account username')
2859                 authentication.add_option('-p', '--password',
2860                                 dest='password', metavar='PASSWORD', help='account password')
2861                 authentication.add_option('-n', '--netrc',
2862                                 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2863                 parser.add_option_group(authentication)
2864
2865                 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2866                 video_format.add_option('-f', '--format',
2867                                 action='store', dest='format', metavar='FORMAT', help='video format code')
2868                 video_format.add_option('--all-formats',
2869                                 action='store_const', dest='format', help='download all available video formats', const='-1')
2870                 video_format.add_option('--max-quality',
2871                                 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2872                 parser.add_option_group(video_format)
2873
2874                 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2875                 verbosity.add_option('-q', '--quiet',
2876                                 action='store_true', dest='quiet', help='activates quiet mode', default=False)
2877                 verbosity.add_option('-s', '--simulate',
2878                                 action='store_true', dest='simulate', help='do not download video', default=False)
2879                 verbosity.add_option('-g', '--get-url',
2880                                 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2881                 verbosity.add_option('-e', '--get-title',
2882                                 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
2883                 verbosity.add_option('--get-thumbnail',
2884                                 action='store_true', dest='getthumbnail',
2885                                 help='simulate, quiet but print thumbnail URL', default=False)
2886                 verbosity.add_option('--get-description',
2887                                 action='store_true', dest='getdescription',
2888                                 help='simulate, quiet but print video description', default=False)
2889                 verbosity.add_option('--get-filename',
2890                                 action='store_true', dest='getfilename',
2891                                 help='simulate, quiet but print output filename', default=False)
2892                 verbosity.add_option('--no-progress',
2893                                 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
2894                 verbosity.add_option('--console-title',
2895                                 action='store_true', dest='consoletitle',
2896                                 help='display progress in console titlebar', default=False)
2897                 parser.add_option_group(verbosity)
2898
2899                 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
2900                 filesystem.add_option('-t', '--title',
2901                                 action='store_true', dest='usetitle', help='use title in file name', default=False)
2902                 filesystem.add_option('-l', '--literal',
2903                                 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
2904                 filesystem.add_option('-A', '--auto-number',
2905                                 action='store_true', dest='autonumber',
2906                                 help='number downloaded files starting from 00000', default=False)
2907                 filesystem.add_option('-o', '--output',
2908                                 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
2909                 filesystem.add_option('-a', '--batch-file',
2910                                 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
2911                 filesystem.add_option('-w', '--no-overwrites',
2912                                 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
2913                 filesystem.add_option('-c', '--continue',
2914                                 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
2915                 filesystem.add_option('--cookies',
2916                                 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
2917                 filesystem.add_option('--no-part',
2918                                 action='store_true', dest='nopart', help='do not use .part files', default=False)
2919                 filesystem.add_option('--no-mtime',
2920                                 action='store_false', dest='updatetime',
2921                                 help='do not use the Last-modified header to set the file modification time', default=True)
2922                 filesystem.add_option('--write-description',
2923                                 action='store_true', dest='writedescription',
2924                                 help='write video description to a .description file', default=False)
2925                 parser.add_option_group(filesystem)
2926
2927                 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
2928                 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
2929                                 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
2930                 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
2931                                 help='"best", "aac" or "mp3"; best by default')
2932                 parser.add_option_group(postproc)
2933
2934                 (opts, args) = parser.parse_args()
2935
2936                 # Open appropriate CookieJar
2937                 if opts.cookiefile is None:
2938                         jar = cookielib.CookieJar()
2939                 else:
2940                         try:
2941                                 jar = cookielib.MozillaCookieJar(opts.cookiefile)
2942                                 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
2943                                         jar.load()
2944                         except (IOError, OSError), err:
2945                                 sys.exit(u'ERROR: unable to open cookie file')
2946
2947                 # Dump user agent
2948                 if opts.dump_user_agent:
2949                         print std_headers['User-Agent']
2950                         sys.exit(0)
2951
2952                 # General configuration
2953                 cookie_processor = urllib2.HTTPCookieProcessor(jar)
2954                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
2955                 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
2956
2957                 # Batch file verification
2958                 batchurls = []
2959                 if opts.batchfile is not None:
2960                         try:
2961                                 if opts.batchfile == '-':
2962                                         batchfd = sys.stdin
2963                                 else:
2964                                         batchfd = open(opts.batchfile, 'r')
2965                                 batchurls = batchfd.readlines()
2966                                 batchurls = [x.strip() for x in batchurls]
2967                                 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
2968                         except IOError:
2969                                 sys.exit(u'ERROR: batch file could not be read')
2970                 all_urls = batchurls + args
2971
		# Conflicting, missing and erroneous options
		if opts.usenetrc and (opts.username is not None or opts.password is not None):
			parser.error(u'using .netrc conflicts with giving username/password')
		if opts.password is not None and opts.username is None:
			parser.error(u'account username missing')
		if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
			parser.error(u'using output template conflicts with using title, literal title or auto number')
		if opts.usetitle and opts.useliteral:
			parser.error(u'using title conflicts with using literal title')
		if opts.username is not None and opts.password is None:
			# Username given without password: prompt interactively.
			# NOTE(review): relies on 'getpass' being imported earlier in the
			# file (not visible in this chunk) -- verify the import exists.
			opts.password = getpass.getpass(u'Type account password and press return:')
		if opts.ratelimit is not None:
			# Convert the human-readable limit to a byte count; parse_bytes
			# returns None for an unparsable value.
			numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
			if numeric_limit is None:
				parser.error(u'invalid rate limit specified')
			opts.ratelimit = numeric_limit
		if opts.retries is not None:
			try:
				opts.retries = long(opts.retries)
			except (TypeError, ValueError), err:
				parser.error(u'invalid retry count specified')
		try:
			# Playlist start must be a positive (1-based) integer.
			opts.playliststart = long(opts.playliststart)
			if opts.playliststart <= 0:
				raise ValueError
		except (TypeError, ValueError), err:
			parser.error(u'invalid playlist start number specified')
		try:
			# Playlist end is either -1 (unbounded) or a positive integer not
			# smaller than the start index.
			opts.playlistend = long(opts.playlistend)
			if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
				raise ValueError
		except (TypeError, ValueError), err:
			parser.error(u'invalid playlist end number specified')
		if opts.extractaudio:
			if opts.audioformat not in ['best', 'aac', 'mp3']:
				parser.error(u'invalid audio format specified')
3008
		# Information extractors
		# One instance per supported site.  The playlist/user/search wrappers
		# receive the underlying site extractor they delegate to.
		youtube_ie = YoutubeIE()
		metacafe_ie = MetacafeIE(youtube_ie)
		dailymotion_ie = DailymotionIE()
		youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
		youtube_user_ie = YoutubeUserIE(youtube_ie)
		youtube_search_ie = YoutubeSearchIE(youtube_ie)
		google_ie = GoogleIE()
		google_search_ie = GoogleSearchIE(google_ie)
		photobucket_ie = PhotobucketIE()
		yahoo_ie = YahooIE()
		yahoo_search_ie = YahooSearchIE(yahoo_ie)
		deposit_files_ie = DepositFilesIE()
		facebook_ie = FacebookIE()
		bliptv_ie = BlipTVIE()
		generic_ie = GenericIE()
3025
		# File downloader
		fd = FileDownloader({
			'usenetrc': opts.usenetrc,
			'username': opts.username,
			'password': opts.password,
			# Any --get-* option implies quiet, simulate-only operation.
			'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
			'forceurl': opts.geturl,
			'forcetitle': opts.gettitle,
			'forcethumbnail': opts.getthumbnail,
			'forcedescription': opts.getdescription,
			'forcefilename': opts.getfilename,
			'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
			'format': opts.format,
			'format_limit': opts.format_limit,
			# Output template selection: an explicit -o template wins, else a
			# default is chosen from --title/--literal/--auto-number; when
			# format == '-1' the template also embeds the format code.  The
			# or-chain yields the first truthy candidate, falling back to the
			# plain '%(id)s.%(ext)s'.
			'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
				or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
				or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
				or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
				or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
				or u'%(id)s.%(ext)s'),
			'ignoreerrors': opts.ignoreerrors,
			'ratelimit': opts.ratelimit,
			'nooverwrites': opts.nooverwrites,
			'retries': opts.retries,
			'continuedl': opts.continue_dl,
			'noprogress': opts.noprogress,
			'playliststart': opts.playliststart,
			'playlistend': opts.playlistend,
			# -o '-' presumably sends the video to stdout, so logging must go
			# to stderr in that case.
			'logtostderr': opts.outtmpl == '-',
			'consoletitle': opts.consoletitle,
			'nopart': opts.nopart,
			'updatetime': opts.updatetime,
			'writedescription': opts.writedescription,
			})
3064                 fd.add_info_extractor(youtube_search_ie)
3065                 fd.add_info_extractor(youtube_pl_ie)
3066                 fd.add_info_extractor(youtube_user_ie)
3067                 fd.add_info_extractor(metacafe_ie)
3068                 fd.add_info_extractor(dailymotion_ie)
3069                 fd.add_info_extractor(youtube_ie)
3070                 fd.add_info_extractor(google_ie)
3071                 fd.add_info_extractor(google_search_ie)
3072                 fd.add_info_extractor(photobucket_ie)
3073                 fd.add_info_extractor(yahoo_ie)
3074                 fd.add_info_extractor(yahoo_search_ie)
3075                 fd.add_info_extractor(deposit_files_ie)
3076                 fd.add_info_extractor(facebook_ie)
3077                 fd.add_info_extractor(bliptv_ie)
3078
3079                 # This must come last since it's the
3080                 # fallback if none of the others work
3081                 fd.add_info_extractor(generic_ie)
3082
		# PostProcessors
		if opts.extractaudio:
			# --extract-audio: convert each downloaded file with ffmpeg/ffprobe.
			fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))

		# Update version
		if opts.update_self:
			# Self-update this script in place (sys.argv[0] is its path).
			update_self(fd, sys.argv[0])

		# Maybe do nothing
		if len(all_urls) < 1:
			# No URLs is only acceptable when the run was just --update.
			if not opts.update_self:
				parser.error(u'you must provide at least one URL')
			else:
				sys.exit()
		retcode = fd.download(all_urls)

		# Dump cookie jar if requested
		if opts.cookiefile is not None:
			try:
				jar.save()
			except (IOError, OSError), err:
				# NOTE(review): err is bound but never used here.
				sys.exit(u'ERROR: unable to save cookie jar')

		# Propagate the downloader's return code as the process exit status.
		sys.exit(retcode)
3107
	# Top-level handlers for the main program's try block (opened above this
	# chunk): map fatal conditions to an exit status / message.
	except DownloadError:
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')