Add proper support for "gzip" and "deflate" encodings
[youtube-dl.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # Author: Vasyl' Vavrychuk
7 # License: Public domain code
8 import cookielib
9 import ctypes
10 import datetime
11 import gzip
12 import htmlentitydefs
13 import httplib
14 import locale
15 import math
16 import netrc
17 import os
18 import os.path
19 import re
20 import socket
21 import string
22 import StringIO
23 import subprocess
24 import sys
25 import time
26 import urllib
27 import urllib2
28 import zlib
29
30 # parse_qs was moved from the cgi module to the urlparse module recently.
31 try:
32         from urlparse import parse_qs
33 except ImportError:
34         from cgi import parse_qs
35
# Default HTTP headers sent with every request. Accept-Encoding advertises
# gzip/deflate; YoutubeDLHandler transparently decompresses such responses.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.12) Gecko/20101028 Firefox/3.6.12',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-us,en;q=0.5',
}
43
# Characters kept verbatim by "simple" title sanitization: ASCII letters and
# digits, as a unicode string (Python 2 str.decode).
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
45
def preferredencoding():
        """Get preferred encoding.

        Returns the best encoding scheme for the system, based on
        locale.getpreferredencoding() and some further tweaks: if the
        reported codec does not exist or cannot encode text, fall back
        to UTF-8.
        """
        # The original implementation wrapped this in a one-shot generator
        # and called .next() on it, which added nothing; a plain try/return
        # is equivalent. The bare "except:" is narrowed to Exception so
        # KeyboardInterrupt/SystemExit are not swallowed.
        try:
                pref = locale.getpreferredencoding()
                # Sanity-check that the reported codec really works.
                u'TEST'.encode(pref)
        except Exception:
                pref = 'UTF-8'
        return pref
61
62 def htmlentity_transform(matchobj):
63         """Transforms an HTML entity to a Unicode character.
64
65         This function receives a match object and is intended to be used with
66         the re.sub() function.
67         """
68         entity = matchobj.group(1)
69
70         # Known non-numeric HTML entity
71         if entity in htmlentitydefs.name2codepoint:
72                 return unichr(htmlentitydefs.name2codepoint[entity])
73
74         # Unicode character
75         mobj = re.match(ur'(?u)#(x?\d+)', entity)
76         if mobj is not None:
77                 numstr = mobj.group(1)
78                 if numstr.startswith(u'x'):
79                         base = 16
80                         numstr = u'0%s' % numstr
81                 else:
82                         base = 10
83                 return unichr(long(numstr, base))
84
85         # Unknown entity in name, return its literal representation
86         return (u'&%s;' % entity)
87
88 def sanitize_title(utitle):
89         """Sanitizes a video title so it could be used as part of a filename."""
90         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
91         return utitle.replace(unicode(os.sep), u'%')
92
93 def sanitize_open(filename, open_mode):
94         """Try to open the given filename, and slightly tweak it if this fails.
95
96         Attempts to open the given filename. If this fails, it tries to change
97         the filename slightly, step by step, until it's either able to open it
98         or it fails and raises a final exception, like the standard open()
99         function.
100
101         It returns the tuple (stream, definitive_file_name).
102         """
103         try:
104                 if filename == u'-':
105                         if sys.platform == 'win32':
106                                 import msvcrt
107                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
108                         return (sys.stdout, filename)
109                 stream = open(filename, open_mode)
110                 return (stream, filename)
111         except (IOError, OSError), err:
112                 # In case of error, try to remove win32 forbidden chars
113                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
114
115                 # An exception here should be caught in the caller
116                 stream = open(filename, open_mode)
117                 return (stream, filename)
118
class DownloadError(Exception):
        """Raised when downloading fails.

        FileDownloader objects throw this exception when they are not
        configured to continue on errors; it carries the relevant error
        message.
        """
        pass
127
class SameFileError(Exception):
        """Raised when several downloads would collide on one file.

        FileDownloader objects throw this exception when they detect that
        multiple files would have to be written to the same path on disk.
        """
        pass
135
class PostProcessingError(Exception):
        """Raised when a postprocessing step fails.

        A PostProcessor's .run() method may raise this exception to signal
        an error in the postprocessing task.
        """
        pass
143
class UnavailableVideoError(Exception):
        """Raised when a requested format is unavailable.

        Thrown when a video is requested in a format that is not available
        for that video.
        """
        pass
151
class ContentTooShortError(Exception):
        """Raised when a download delivers fewer bytes than announced.

        FileDownloader objects raise this when the data received is smaller
        than what the server's headers promised, indicating the connection
        was probably interrupted.
        """
        # Both counters are in bytes
        downloaded = None
        expected = None

        def __init__(self, downloaded, expected):
                self.downloaded = downloaded
                self.expected = expected
166
class YoutubeDLHandler(urllib2.HTTPHandler):
        """Handler for HTTP requests and responses.

        This class, when installed with an OpenerDirector, automatically adds
        the standard headers to every HTTP request and handles gzipped and
        deflated responses from web servers. If compression is to be avoided in
        a particular request, the original request in the program code only has
        to include the HTTP header "Youtubedl-No-Compression", which will be
        removed before making the real request.

        Part of this code was copied from:

          http://techknack.net/python-urllib2-handlers/

        Andrew Rowls, the author of that code, agreed to release it to the
        public domain.
        """

        @staticmethod
        def deflate(data):
                # Try raw deflate first (no zlib container) — some servers send
                # "deflate" bodies without the zlib header — then fall back to
                # a standard zlib-wrapped stream.
                try:
                        return zlib.decompress(data, -zlib.MAX_WBITS)
                except zlib.error:
                        return zlib.decompress(data)

        def http_request(self, req):
                # Re-add every standard header so our value always wins.
                # NOTE: urllib2 stores header names .capitalize()d (e.g.
                # 'User-agent'), which is why the marker checks below use the
                # 'Youtubedl-no-compression'/'Accept-encoding' spellings.
                for h in std_headers:
                        if h in req.headers:
                                del req.headers[h]
                        req.add_header(h, std_headers[h])
                # The magic marker header suppresses Accept-Encoding for this
                # request and is stripped before the request goes on the wire.
                if 'Youtubedl-no-compression' in req.headers:
                        if 'Accept-encoding' in req.headers:
                                del req.headers['Accept-encoding']
                        del req.headers['Youtubedl-no-compression']
                return req

        def http_response(self, req, resp):
                # Transparently decompress gzip/deflate bodies, keeping the
                # file-like response interface via urllib2.addinfourl.
                old_resp = resp
                # gzip: the whole body is read into memory and wrapped.
                if resp.headers.get('Content-encoding', '') == 'gzip':
                        gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
                        resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
                        resp.msg = old_resp.msg
                # deflate
                if resp.headers.get('Content-encoding', '') == 'deflate':
                        gz = StringIO.StringIO(self.deflate(resp.read()))
                        resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
                        resp.msg = old_resp.msg
                return resp
216
217 class FileDownloader(object):
218         """File Downloader class.
219
220         File downloader objects are the ones responsible of downloading the
221         actual video file and writing it to disk if the user has requested
222         it, among some other tasks. In most cases there should be one per
223         program. As, given a video URL, the downloader doesn't know how to
224         extract all the needed information, task that InfoExtractors do, it
225         has to pass the URL to one of them.
226
227         For this, file downloader objects have a method that allows
228         InfoExtractors to be registered in a given order. When it is passed
229         a URL, the file downloader hands it to the first InfoExtractor it
230         finds that reports being able to handle it. The InfoExtractor extracts
231         all the information about the video or videos the URL refers to, and
232         asks the FileDownloader to process the video information, possibly
233         downloading the video.
234
235         File downloaders accept a lot of parameters. In order not to saturate
236         the object constructor with arguments, it receives a dictionary of
237         options instead. These options are available through the params
238         attribute for the InfoExtractors to use. The FileDownloader also
239         registers itself as the downloader in charge for the InfoExtractors
240         that are added to it, so this is a "mutual registration".
241
242         Available options:
243
244         username:         Username for authentication purposes.
245         password:         Password for authentication purposes.
246         usenetrc:         Use netrc for authentication instead.
247         quiet:            Do not print messages to stdout.
248         forceurl:         Force printing final URL.
249         forcetitle:       Force printing title.
250         forcethumbnail:   Force printing thumbnail URL.
251         forcedescription: Force printing description.
252         simulate:         Do not download the video files.
253         format:           Video format code.
254         format_limit:     Highest quality format to try.
255         outtmpl:          Template for output names.
256         ignoreerrors:     Do not stop on download errors.
257         ratelimit:        Download speed limit, in bytes/sec.
258         nooverwrites:     Prevent overwriting files.
259         retries:          Number of times to retry for HTTP error 5xx
260         continuedl:       Try to continue downloads if possible.
261         noprogress:       Do not print the progress bar.
262         playliststart:    Playlist item to start at.
263         playlistend:      Playlist item to end at.
264         logtostderr:      Log messages to stderr instead of stdout.
265         consoletitle:     Display progress in console window's titlebar.
266         nopart:           Do not use temporary .part files.
267         """
268
        params = None            # Dictionary of downloader options (see class docstring)
        _ies = []                # Registered InfoExtractors, tried in order
        _pps = []                # Registered PostProcessors, run as a chain
        _download_retcode = None # Return code for download(); 0 = success, 1 = error
        _num_downloads = None    # Ordinal of the current download (%(autonumber)s)
        _screen_file = None      # File object that to_screen() writes to
275
276         def __init__(self, params):
277                 """Create a FileDownloader object with the given options."""
278                 self._ies = []
279                 self._pps = []
280                 self._download_retcode = 0
281                 self._num_downloads = 0
282                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
283                 self.params = params
284
285         @staticmethod
286         def pmkdir(filename):
287                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
288                 components = filename.split(os.sep)
289                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
290                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
291                 for dir in aggregate:
292                         if not os.path.exists(dir):
293                                 os.mkdir(dir)
294
295         @staticmethod
296         def format_bytes(bytes):
297                 if bytes is None:
298                         return 'N/A'
299                 if type(bytes) is str:
300                         bytes = float(bytes)
301                 if bytes == 0.0:
302                         exponent = 0
303                 else:
304                         exponent = long(math.log(bytes, 1024.0))
305                 suffix = 'bkMGTPEZY'[exponent]
306                 converted = float(bytes) / float(1024**exponent)
307                 return '%.2f%s' % (converted, suffix)
308
309         @staticmethod
310         def calc_percent(byte_counter, data_len):
311                 if data_len is None:
312                         return '---.-%'
313                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
314
315         @staticmethod
316         def calc_eta(start, now, total, current):
317                 if total is None:
318                         return '--:--'
319                 dif = now - start
320                 if current == 0 or dif < 0.001: # One millisecond
321                         return '--:--'
322                 rate = float(current) / dif
323                 eta = long((float(total) - float(current)) / rate)
324                 (eta_mins, eta_secs) = divmod(eta, 60)
325                 if eta_mins > 99:
326                         return '--:--'
327                 return '%02d:%02d' % (eta_mins, eta_secs)
328
329         @staticmethod
330         def calc_speed(start, now, bytes):
331                 dif = now - start
332                 if bytes == 0 or dif < 0.001: # One millisecond
333                         return '%10s' % '---b/s'
334                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
335
336         @staticmethod
337         def best_block_size(elapsed_time, bytes):
338                 new_min = max(bytes / 2.0, 1.0)
339                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
340                 if elapsed_time < 0.001:
341                         return long(new_max)
342                 rate = bytes / elapsed_time
343                 if rate > new_max:
344                         return long(new_max)
345                 if rate < new_min:
346                         return long(new_min)
347                 return long(rate)
348
349         @staticmethod
350         def parse_bytes(bytestr):
351                 """Parse a string indicating a byte quantity into a long integer."""
352                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
353                 if matchobj is None:
354                         return None
355                 number = float(matchobj.group(1))
356                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
357                 return long(round(number * multiplier))
358
        def add_info_extractor(self, ie):
                """Add an InfoExtractor object to the end of the list.

                Registration is mutual: the extractor gets a back-reference
                to this downloader via set_downloader().
                """
                self._ies.append(ie)
                ie.set_downloader(self)
363
        def add_post_processor(self, pp):
                """Add a PostProcessor object to the end of the chain.

                Registration is mutual: the postprocessor gets a back-reference
                to this downloader via set_downloader().
                """
                self._pps.append(pp)
                pp.set_downloader(self)
368
        def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
                """Print message to stdout (or stderr, per 'logtostderr') if not in quiet mode.

                skip_eol suppresses the trailing newline (used for the
                progress line); ignore_encoding_errors swallows
                UnicodeEncodeError instead of propagating it.
                """
                try:
                        if not self.params.get('quiet', False):
                                # The trailing comma suppresses print's own newline;
                                # the terminator makes it explicit unless skip_eol.
                                terminator = [u'\n', u''][skip_eol]
                                print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
                        self._screen_file.flush()
                except (UnicodeEncodeError), err:
                        if not ignore_encoding_errors:
                                raise
379
        def to_stderr(self, message):
                """Print message to stderr, encoded in the system's preferred encoding."""
                print >>sys.stderr, message.encode(preferredencoding())
383
384         def to_cons_title(self, message):
385                 """Set console/terminal window title to message."""
386                 if not self.params.get('consoletitle', False):
387                         return
388                 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
389                         # c_wchar_p() might not be necessary if `message` is
390                         # already of type unicode()
391                         ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
392                 elif 'TERM' in os.environ:
393                         sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
394
395         def fixed_template(self):
396                 """Checks if the output template is fixed."""
397                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
398
399         def trouble(self, message=None):
400                 """Determine action to take when a download problem appears.
401
402                 Depending on if the downloader has been configured to ignore
403                 download errors or not, this method may throw an exception or
404                 not when errors are found, after printing the message.
405                 """
406                 if message is not None:
407                         self.to_stderr(message)
408                 if not self.params.get('ignoreerrors', False):
409                         raise DownloadError(message)
410                 self._download_retcode = 1
411
412         def slow_down(self, start_time, byte_counter):
413                 """Sleep if the download speed is over the rate limit."""
414                 rate_limit = self.params.get('ratelimit', None)
415                 if rate_limit is None or byte_counter == 0:
416                         return
417                 now = time.time()
418                 elapsed = now - start_time
419                 if elapsed <= 0.0:
420                         return
421                 speed = float(byte_counter) / elapsed
422                 if speed > rate_limit:
423                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
424
425         def temp_name(self, filename):
426                 """Returns a temporary filename for the given filename."""
427                 if self.params.get('nopart', False) or filename == u'-' or \
428                                 (os.path.exists(filename) and not os.path.isfile(filename)):
429                         return filename
430                 return filename + u'.part'
431
432         def try_rename(self, old_filename, new_filename):
433                 try:
434                         if old_filename == new_filename:
435                                 return
436                         os.rename(old_filename, new_filename)
437                 except (IOError, OSError), err:
438                         self.trouble(u'ERROR: unable to rename file')
439
        def report_destination(self, filename):
                """Report destination filename, ignoring terminal encoding errors."""
                self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
443
444         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
445                 """Report download progress."""
446                 if self.params.get('noprogress', False):
447                         return
448                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
449                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
450                 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
451                                 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
452
        def report_resuming_byte(self, resume_len):
                """Report attempt to resume at given byte."""
                self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
456
        def report_retry(self, count, retries):
                """Report retry in case of HTTP error 5xx."""
                self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
460
461         def report_file_already_downloaded(self, file_name):
462                 """Report file has already been fully downloaded."""
463                 try:
464                         self.to_screen(u'[download] %s has already been downloaded' % file_name)
465                 except (UnicodeEncodeError), err:
466                         self.to_screen(u'[download] The file has already been downloaded')
467
        def report_unable_to_resume(self):
                """Report it was impossible to resume download."""
                self.to_screen(u'[download] Unable to resume')
471
472         def report_finish(self):
473                 """Report download finished."""
474                 if self.params.get('noprogress', False):
475                         self.to_screen(u'[download] Download completed')
476                 else:
477                         self.to_screen(u'')
478
        def increment_downloads(self):
                """Increment the ordinal that assigns a number to each file (%(autonumber)s)."""
                self._num_downloads += 1
482
        def process_info(self, info_dict):
                """Process a single dictionary returned by an InfoExtractor.

                Applies the output template, creates directories, performs the
                actual download and runs the postprocessing chain. In simulate
                mode only the force* printing options are honoured. Errors are
                routed through trouble(), so they respect 'ignoreerrors'.
                """
                # Do nothing else if in simulate mode
                if self.params.get('simulate', False):
                        # Forced printings
                        if self.params.get('forcetitle', False):
                                print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
                        if self.params.get('forceurl', False):
                                print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
                        if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
                                print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
                        if self.params.get('forcedescription', False) and 'description' in info_dict:
                                print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')

                        return

                try:
                        template_dict = dict(info_dict)
                        # %(epoch)s and %(autonumber)s are synthesized here.
                        template_dict['epoch'] = unicode(long(time.time()))
                        template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
                        filename = self.params['outtmpl'] % template_dict
                except (ValueError, KeyError), err:
                        self.trouble(u'ERROR: invalid system charset or erroneous output template')
                        return
                if self.params.get('nooverwrites', False) and os.path.exists(filename):
                        self.to_stderr(u'WARNING: file exists and will be skipped')
                        return

                try:
                        self.pmkdir(filename)
                except (OSError, IOError), err:
                        self.trouble(u'ERROR: unable to create directories: %s' % str(err))
                        return

                try:
                        success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
                except (OSError, IOError), err:
                        # Local I/O errors are reported as the video being
                        # unavailable in the requested format.
                        raise UnavailableVideoError
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self.trouble(u'ERROR: unable to download video data: %s' % str(err))
                        return
                except (ContentTooShortError, ), err:
                        self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
                        return

                if success:
                        try:
                                self.post_process(filename, info_dict)
                        except (PostProcessingError), err:
                                self.trouble(u'ERROR: postprocessing: %s' % str(err))
                                return
534
535         def download(self, url_list):
536                 """Download a given list of URLs."""
537                 if len(url_list) > 1 and self.fixed_template():
538                         raise SameFileError(self.params['outtmpl'])
539
540                 for url in url_list:
541                         suitable_found = False
542                         for ie in self._ies:
543                                 # Go to next InfoExtractor if not suitable
544                                 if not ie.suitable(url):
545                                         continue
546
547                                 # Suitable InfoExtractor found
548                                 suitable_found = True
549
550                                 # Extract information from URL and process it
551                                 ie.extract(url)
552
553                                 # Suitable InfoExtractor had been found; go to next URL
554                                 break
555
556                         if not suitable_found:
557                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
558
559                 return self._download_retcode
560
561         def post_process(self, filename, ie_info):
562                 """Run the postprocessing chain on the given file."""
563                 info = dict(ie_info)
564                 info['filepath'] = filename
565                 for pp in self._pps:
566                         info = pp.run(info)
567                         if info is None:
568                                 break
569
        def _download_with_rtmpdump(self, filename, url, player_url):
                """Download an rtmp:// URL by driving the external rtmpdump program.

                Returns True on success, False on failure (after reporting
                through trouble()).
                """
                self.report_destination(filename)
                tmpfilename = self.temp_name(filename)

                # Check for rtmpdump first
                try:
                        subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
                except (OSError, IOError):
                        self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
                        return False

                # Download using rtmpdump. rtmpdump returns exit code 2 when
                # the connection was interrupted and resuming appears to be
                # possible. This is part of rtmpdump's normal usage, AFAIK.
                basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
                retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
                while retval == 2 or retval == 1:
                        prevsize = os.path.getsize(tmpfilename)
                        self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
                        time.sleep(5.0) # This seems to be needed
                        # Resume (-e); after a hard failure (retval 1) also pass
                        # '-k 1' to retry the last keyframe.
                        retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
                        cursize = os.path.getsize(tmpfilename)
                        if prevsize == cursize and retval == 1:
                                # No progress was made and rtmpdump failed: give up.
                                break
                if retval == 0:
                        self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
                        self.try_rename(tmpfilename, filename)
                        return True
                else:
                        self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
                        return False
601
602         def _do_download(self, filename, url, player_url):
603                 # Check file already present
604                 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
605                         self.report_file_already_downloaded(filename)
606                         return True
607
608                 # Attempt to download using rtmpdump
609                 if url.startswith('rtmp'):
610                         return self._download_with_rtmpdump(filename, url, player_url)
611
612                 tmpfilename = self.temp_name(filename)
613                 stream = None
614                 open_mode = 'wb'
615
616                 # Do not include the Accept-Encoding header
617                 headers = {'Youtubedl-no-compression': 'True'}
618                 basic_request = urllib2.Request(url, None, headers)
619                 request = urllib2.Request(url, None, headers)
620
621                 # Establish possible resume length
622                 if os.path.isfile(tmpfilename):
623                         resume_len = os.path.getsize(tmpfilename)
624                 else:
625                         resume_len = 0
626
627                 # Request parameters in case of being able to resume
628                 if self.params.get('continuedl', False) and resume_len != 0:
629                         self.report_resuming_byte(resume_len)
630                         request.add_header('Range','bytes=%d-' % resume_len)
631                         open_mode = 'ab'
632
633                 count = 0
634                 retries = self.params.get('retries', 0)
635                 while count <= retries:
636                         # Establish connection
637                         try:
638                                 data = urllib2.urlopen(request)
639                                 break
640                         except (urllib2.HTTPError, ), err:
641                                 if (err.code < 500 or err.code >= 600) and err.code != 416:
642                                         # Unexpected HTTP error
643                                         raise
644                                 elif err.code == 416:
645                                         # Unable to resume (requested range not satisfiable)
646                                         try:
647                                                 # Open the connection again without the range header
648                                                 data = urllib2.urlopen(basic_request)
649                                                 content_length = data.info()['Content-Length']
650                                         except (urllib2.HTTPError, ), err:
651                                                 if err.code < 500 or err.code >= 600:
652                                                         raise
653                                         else:
654                                                 # Examine the reported length
655                                                 if (content_length is not None and
656                                                     (resume_len - 100 < long(content_length) < resume_len + 100)):
657                                                         # The file had already been fully downloaded.
658                                                         # Explanation to the above condition: in issue #175 it was revealed that
659                                                         # YouTube sometimes adds or removes a few bytes from the end of the file,
660                                                         # changing the file size slightly and causing problems for some users. So
661                                                         # I decided to implement a suggested change and consider the file
662                                                         # completely downloaded if the file size differs less than 100 bytes from
663                                                         # the one in the hard drive.
664                                                         self.report_file_already_downloaded(filename)
665                                                         self.try_rename(tmpfilename, filename)
666                                                         return True
667                                                 else:
668                                                         # The length does not match, we start the download over
669                                                         self.report_unable_to_resume()
670                                                         open_mode = 'wb'
671                                                         break
672                         # Retry
673                         count += 1
674                         if count <= retries:
675                                 self.report_retry(count, retries)
676
677                 if count > retries:
678                         self.trouble(u'ERROR: giving up after %s retries' % retries)
679                         return False
680
681                 data_len = data.info().get('Content-length', None)
682                 if data_len is not None:
683                         data_len = long(data_len) + resume_len
684                 data_len_str = self.format_bytes(data_len)
685                 byte_counter = 0 + resume_len
686                 block_size = 1024
687                 start = time.time()
688                 while True:
689                         # Download and write
690                         before = time.time()
691                         data_block = data.read(block_size)
692                         after = time.time()
693                         if len(data_block) == 0:
694                                 break
695                         byte_counter += len(data_block)
696
697                         # Open file just in time
698                         if stream is None:
699                                 try:
700                                         (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
701                                         self.report_destination(filename)
702                                 except (OSError, IOError), err:
703                                         self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
704                                         return False
705                         try:
706                                 stream.write(data_block)
707                         except (IOError, OSError), err:
708                                 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
709                                 return False
710                         block_size = self.best_block_size(after - before, len(data_block))
711
712                         # Progress message
713                         percent_str = self.calc_percent(byte_counter, data_len)
714                         eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
715                         speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
716                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
717
718                         # Apply rate limit
719                         self.slow_down(start, byte_counter - resume_len)
720
721                 stream.close()
722                 self.report_finish()
723                 if data_len is not None and byte_counter != data_len:
724                         raise ContentTooShortError(byte_counter, long(data_len))
725                 self.try_rename(tmpfilename, filename)
726                 return True
727
class InfoExtractor(object):
	"""Base class for all information extractors.

	An information extractor takes a URL and produces, for each video the
	URL refers to, a dictionary of metadata which is handed to the
	FileDownloader (which may then download the video to the file system,
	print fields, etc.).  Each dictionary must include the following
	fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	Optional fields, used only when their respective forced-printing
	functions are called (e.g. to serve as the backend for a video search
	front end such as youtube2mp3):

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Subclasses should redefine _real_initialize(), _real_extract() and the
	suitable() static method, and are normally instantiated and registered
	with the main downloader.
	"""

	_ready = False          # True once _real_initialize() has run
	_downloader = None      # FileDownloader this IE reports to (may be None)

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	@staticmethod
	def suitable(url):
		"""Receives a URL and returns True if suitable for this IE (base: never)."""
		return False

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def initialize(self):
		"""Initializes an instance (authentication, etc) on first use only."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Initializes if needed, then extracts and returns URL information."""
		self.initialize()
		return self._real_extract(url)

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
798
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Group 2 of a match is the video id; group 1 is the optional
	# scheme/host/watch-path prefix (the (?(1).+)? conditional requires
	# trailing text only when that prefix matched).
	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality (best first)
	_available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
	# Maps a YouTube format code to a filename extension; codes missing
	# here fall back to 'flv' in _real_extract().
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'45': 'webm',
	}

	@staticmethod
	def suitable(url):
		"""Receives a URL and returns True if suitable for this IE."""
		return (re.match(YoutubeIE._VALID_URL, url) is not None)

	def report_lang(self):
		"""Report attempt to set language."""
		self._downloader.to_screen(u'[youtube] Setting language')

	def report_login(self):
		"""Report attempt to log in."""
		self._downloader.to_screen(u'[youtube] Logging in')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[youtube] Confirming age')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

	def report_video_info_webpage_download(self, video_id):
		"""Report attempt to download video info webpage."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

	def report_unavailable_format(self, video_id, format):
		"""Report that the requested format is not available."""
		self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

	def report_rtmp_download(self):
		"""Indicate the download will use the RTMP protocol."""
		self._downloader.to_screen(u'[youtube] RTMP download detected')

	def _real_initialize(self):
		"""Set the site language and, when credentials are available, log
		in and confirm age.  Language/login failures are reported as
		warnings; the age-confirmation failure reports through trouble().
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language (forces English pages so later regexes match)
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':		'/',
				'action_login':	'Log In',
				'username':	username,
				'password':	password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# A login form in the response means we are still logged out
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':		'/',
				'action_confirm':	'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract video metadata and hand one info dict per selected
		format to the downloader via process_info()."""
		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group(2)

		# Get video webpage
		self.report_video_webpage_download(video_id)
		# NOTE(review): '&amp;' appears literally in this URL (probably an
		# HTML-escaping slip for '&') — confirm before changing, since the
		# current form is what has been working.
		request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id)
		try:
			video_webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# Attempt to extract SWF player URL
		mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
		if mobj is not None:
			# Unescape the JS-escaped URL (\/ -> /)
			player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
		else:
			player_url = None

		# Get video info: try several 'el' page types and stop at the
		# first response that carries a 'token' (presumably different
		# page types succeed for different videos — embedded, detail,
		# vevo, default).
		self.report_video_info_webpage_download(video_id)
		for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
			video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
					   % (video_id, el_type))
			request = urllib2.Request(video_info_url)
			try:
				video_info_webpage = urllib2.urlopen(request).read()
				video_info = parse_qs(video_info_webpage)
				if 'token' in video_info:
					break
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
				return
		if 'token' not in video_info:
			if 'reason' in video_info:
				self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
			else:
				self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
			return

		# Start extracting information
		self.report_information_extraction(video_id)

		# uploader
		if 'author' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = urllib.unquote_plus(video_info['author'][0])

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = urllib.unquote_plus(video_info['title'][0])
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		# simplified title: collapse runs of non-alphanumeric characters
		# into single underscores and trim them from the ends
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
		simple_title = simple_title.strip(ur'_')

		# thumbnail image
		if 'thumbnail_url' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:	# don't panic if we can't find it
			video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

		# upload date: scraped from the watch page and normalized to YYYYMMDD
		upload_date = u'NA'
		mobj = re.search(r'id="eow-date".*?>(.*?)</span>', video_webpage, re.DOTALL)
		if mobj is not None:
			upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
			format_expressions = ['%d %B %Y', '%B %d %Y']
			for expression in format_expressions:
				try:
					upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
				except:
					# NOTE(review): no break after a successful parse; this
					# relies on an already-normalized YYYYMMDD string failing
					# the remaining formats.  Works, but fragile — verify.
					pass

		# description (only scraped when it will actually be printed)
		video_description = 'No description available.'
		if self._downloader.params.get('forcedescription', False):
			mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
			if mobj is not None:
				video_description = mobj.group(1)

		# token
		# NOTE(review): video_token is computed but never used below in
		# this method — kept for parity, verify before removing.
		video_token = urllib.unquote_plus(video_info['token'][0])

		# Decide which formats to download
		req_format = self._downloader.params.get('format', None)

		if 'fmt_url_map' in video_info:
			# fmt_url_map is a comma-separated list of 'format|url' pairs
			url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
			format_limit = self._downloader.params.get('format_limit', None)
			if format_limit is not None and format_limit in self._available_formats:
				# Only consider formats at or below the requested quality cap
				format_list = self._available_formats[self._available_formats.index(format_limit):]
			else:
				format_list = self._available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
				return
			if req_format is None:
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif req_format == '-1':
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
			else:
				# Specific format
				if req_format not in url_map:
					self._downloader.trouble(u'ERROR: requested format not available')
					return
				video_url_list = [(req_format, url_map[req_format])] # Specific format

		elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
			self.report_rtmp_download()
			video_url_list = [(None, video_info['conn'][0])]

		else:
			self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
			return

		for format_param, video_real_url in video_url_list:
			# At this point we have a new video
			self._downloader.increment_downloads()

			# Extension
			video_extension = self._video_extensions.get(format_param, 'flv')

			# Find the video URL in fmt_url_map or conn parameters
			try:
				# Process video information
				self._downloader.process_info({
					'id':		video_id.decode('utf-8'),
					'url':		video_real_url.decode('utf-8'),
					'uploader':	video_uploader.decode('utf-8'),
					'upload_date':	upload_date,
					'title':	video_title,
					'stitle':	simple_title,
					'ext':		video_extension.decode('utf-8'),
					'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
					'thumbnail':	video_thumbnail.decode('utf-8'),
					'description':	video_description.decode('utf-8'),
					'player_url':	player_url,
				})
			except UnavailableVideoError, err:
				# Failure of one format must not abort the remaining ones
				self._downloader.trouble(u'\nERROR: unable to download video')
1078
1079
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com."""

	# Group 1 is the video id, group 2 the URL-embedded simplified title
	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
	# YouTube extractor used to delegate 'yt-' prefixed video ids
	_youtube_ie = None

	def __init__(self, youtube_ie, downloader=None):
		"""Constructor. Receives a YoutubeIE for delegation and an optional downloader."""
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	@staticmethod
	def suitable(url):
		"""Receives a URL and returns True if suitable for this IE."""
		return (re.match(MetacafeIE._VALID_URL, url) is not None)

	def report_disclaimer(self):
		"""Report disclaimer retrieval."""
		self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[metacafe] Confirming age')

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

	def _real_initialize(self):
		"""Fetch the family-filter disclaimer page, then POST the form
		that confirms age (presumably to disable the filter — the
		responses themselves are discarded)."""
		# Retrieve disclaimer
		request = urllib2.Request(self._DISCLAIMER)
		try:
			self.report_disclaimer()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
			return

		# Confirm age
		disclaimer_form = {
			'filters': '0',
			'submit': "Continue - I'm over 18",
			}
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
		try:
			self.report_age_confirmation()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract the media URL, title and uploader from a Metacafe
		watch page and hand them to the downloader; 'yt-' ids are
		delegated to the YouTube extractor."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		# Check if video comes from YouTube
		mobj2 = re.match(r'^yt-(.*)$', video_id)
		if mobj2 is not None:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
			return

		# At this point we have a new video
		self._downloader.increment_downloads()

		simple_title = mobj.group(2).decode('utf-8')

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
		if mobj is not None:
			# Older page layout: direct mediaURL plus optional gdaKey
			mediaURL = urllib.unquote(mobj.group(1))
			# NOTE(review): assumes the URL ends in a 3-letter extension — verify
			video_extension = mediaURL[-3:]

			# Extract gdaKey if available
			mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
			if mobj is None:
				video_url = mediaURL
			else:
				gdaKey = mobj.group(1)
				video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
		else:
			# Newer page layout: media data embedded in the flashvars value
			mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			vardict = parse_qs(mobj.group(1))
			if 'mediaData' not in vardict:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			# Unescape the JSON-style '\/' and append the access key
			mediaURL = mobj.group(1).replace('\\/', '/')
			video_extension = mediaURL[-3:]
			video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

		mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1223
1224
1225 class DailymotionIE(InfoExtractor):
1226         """Information Extractor for Dailymotion"""
1227
1228         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1229
1230         def __init__(self, downloader=None):
1231                 InfoExtractor.__init__(self, downloader)
1232
1233         @staticmethod
1234         def suitable(url):
1235                 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1236
1237         def report_download_webpage(self, video_id):
1238                 """Report webpage download."""
1239                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1240
1241         def report_extraction(self, video_id):
1242                 """Report information extraction."""
1243                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1244
1245         def _real_initialize(self):
1246                 return
1247
1248         def _real_extract(self, url):
1249                 # Extract id and simplified title from URL
1250                 mobj = re.match(self._VALID_URL, url)
1251                 if mobj is None:
1252                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1253                         return
1254
1255                 # At this point we have a new video
1256                 self._downloader.increment_downloads()
1257                 video_id = mobj.group(1)
1258
1259                 simple_title = mobj.group(2).decode('utf-8')
1260                 video_extension = 'flv'
1261
1262                 # Retrieve video webpage to extract further information
1263                 request = urllib2.Request(url)
1264                 try:
1265                         self.report_download_webpage(video_id)
1266                         webpage = urllib2.urlopen(request).read()
1267                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1268                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1269                         return
1270
1271                 # Extract URL, uploader and title from webpage
1272                 self.report_extraction(video_id)
1273                 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1274                 if mobj is None:
1275                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1276                         return
1277                 mediaURL = urllib.unquote(mobj.group(1))
1278
1279                 # if needed add http://www.dailymotion.com/ if relative URL
1280
1281                 video_url = mediaURL
1282
1283                 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1284                 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1285                 if mobj is None:
1286                         self._downloader.trouble(u'ERROR: unable to extract title')
1287                         return
1288                 video_title = mobj.group(1).decode('utf-8')
1289                 video_title = sanitize_title(video_title)
1290
1291                 mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a>', webpage)
1292                 if mobj is None:
1293                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1294                         return
1295                 video_uploader = mobj.group(1)
1296
1297                 try:
1298                         # Process video information
1299                         self._downloader.process_info({
1300                                 'id':           video_id.decode('utf-8'),
1301                                 'url':          video_url.decode('utf-8'),
1302                                 'uploader':     video_uploader.decode('utf-8'),
1303                                 'upload_date':  u'NA',
1304                                 'title':        video_title,
1305                                 'stitle':       simple_title,
1306                                 'ext':          video_extension.decode('utf-8'),
1307                                 'format':       u'NA',
1308                                 'player_url':   None,
1309                         })
1310                 except UnavailableVideoError:
1311                         self._downloader.trouble(u'\nERROR: unable to download video')
1312
class GoogleIE(InfoExtractor):
        """Information extractor for video.google.com."""

        _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'

        def __init__(self, downloader=None):
                InfoExtractor.__init__(self, downloader)

        @staticmethod
        def suitable(url):
                # True if this extractor can handle the given URL.
                return (re.match(GoogleIE._VALID_URL, url) is not None)

        def report_download_webpage(self, video_id):
                """Report webpage download."""
                self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

        def report_extraction(self, video_id):
                """Report information extraction."""
                self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

        def _real_initialize(self):
                # No login or cookie setup is needed.
                return

        def _real_extract(self, url):
                """Fetch the Google Video page for *url*, extract the media URL
                and title, and hand them to the downloader via process_info()."""
                # Extract id from URL
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
                        return

                # At this point we have a new video
                self._downloader.increment_downloads()
                video_id = mobj.group(1)

                video_extension = 'mp4'

                # Retrieve video webpage to extract further information
                request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
                try:
                        self.report_download_webpage(video_id)
                        webpage = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
                        return

                # Extract URL, uploader, and title from webpage
                self.report_extraction(video_id)
                mobj = re.search(r"download_url:'([^']+)'", webpage)
                if mobj is None:
                        # No mp4 download link on the page; fall back to the
                        # flash (flv) stream embedded in the player JS.
                        video_extension = 'flv'
                        mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract media URL')
                        return
                mediaURL = urllib.unquote(mobj.group(1))
                # The URL is embedded with JS hex escapes: undo \x3d ('=')
                # and \x26 ('&') so it becomes a plain HTTP URL.
                mediaURL = mediaURL.replace('\\x3d', '\x3d')
                mediaURL = mediaURL.replace('\\x26', '\x26')

                video_url = mediaURL

                mobj = re.search(r'<title>(.*)</title>', webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract title')
                        return
                video_title = mobj.group(1).decode('utf-8')
                video_title = sanitize_title(video_title)
                # Collapse any run of non-alphanumeric characters to '_'
                simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

                # Extract video description
                mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract video description')
                        return
                video_description = mobj.group(1).decode('utf-8')
                if not video_description:
                        video_description = 'No description available.'

                # Extract video thumbnail
                if self._downloader.params.get('forcethumbnail', False):
                        # The thumbnail lives on a separate search page, so it is
                        # only fetched when explicitly requested.
                        request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
                        try:
                                webpage = urllib2.urlopen(request).read()
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
                                return
                        mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
                        if mobj is None:
                                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
                                return
                        video_thumbnail = mobj.group(1)
                else:   # we need something to pass to process_info
                        video_thumbnail = ''

                # NOTE(review): video_thumbnail and video_description are computed
                # above but not included in the dict below — confirm whether
                # process_info is expected to receive them.
                try:
                        # Process video information
                        self._downloader.process_info({
                                'id':           video_id.decode('utf-8'),
                                'url':          video_url.decode('utf-8'),
                                'uploader':     u'NA',
                                'upload_date':  u'NA',
                                'title':        video_title,
                                'stitle':       simple_title,
                                'ext':          video_extension.decode('utf-8'),
                                'format':       u'NA',
                                'player_url':   None,
                        })
                except UnavailableVideoError:
                        self._downloader.trouble(u'\nERROR: unable to download video')
1422
1423
class PhotobucketIE(InfoExtractor):
        """Information extractor for photobucket.com."""

        # Only flv media URLs passed via the "current" query parameter match.
        _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'

        def __init__(self, downloader=None):
                InfoExtractor.__init__(self, downloader)

        @staticmethod
        def suitable(url):
                # True if this extractor can handle the given URL.
                return (re.match(PhotobucketIE._VALID_URL, url) is not None)

        def report_download_webpage(self, video_id):
                """Report webpage download."""
                self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

        def report_extraction(self, video_id):
                """Report information extraction."""
                self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

        def _real_initialize(self):
                # No login or cookie setup is needed.
                return

        def _real_extract(self, url):
                """Fetch the Photobucket page for *url*, extract media URL,
                title and uploader, and pass them to process_info()."""
                # Extract id from URL
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
                        return

                # At this point we have a new video
                self._downloader.increment_downloads()
                video_id = mobj.group(1)

                video_extension = 'flv'

                # Retrieve video webpage to extract further information
                request = urllib2.Request(url)
                try:
                        self.report_download_webpage(video_id)
                        webpage = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
                        return

                # Extract URL, uploader, and title from webpage
                self.report_extraction(video_id)
                mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract media URL')
                        return
                mediaURL = urllib.unquote(mobj.group(1))

                video_url = mediaURL

                # Title tag also carries the uploader: "<title> video by <uploader> - Photobucket"
                mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract title')
                        return
                video_title = mobj.group(1).decode('utf-8')
                video_title = sanitize_title(video_title)
                # Collapse any run of non-alphanumeric characters to '_'
                simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

                video_uploader = mobj.group(2).decode('utf-8')

                try:
                        # Process video information
                        self._downloader.process_info({
                                'id':           video_id.decode('utf-8'),
                                'url':          video_url.decode('utf-8'),
                                'uploader':     video_uploader,
                                'upload_date':  u'NA',
                                'title':        video_title,
                                'stitle':       simple_title,
                                'ext':          video_extension.decode('utf-8'),
                                'format':       u'NA',
                                'player_url':   None,
                        })
                except UnavailableVideoError:
                        self._downloader.trouble(u'\nERROR: unable to download video')
1504
1505
1506 class YahooIE(InfoExtractor):
1507         """Information extractor for video.yahoo.com."""
1508
1509         # _VALID_URL matches all Yahoo! Video URLs
1510         # _VPAGE_URL matches only the extractable '/watch/' URLs
1511         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1512         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1513
1514         def __init__(self, downloader=None):
1515                 InfoExtractor.__init__(self, downloader)
1516
1517         @staticmethod
1518         def suitable(url):
1519                 return (re.match(YahooIE._VALID_URL, url) is not None)
1520
1521         def report_download_webpage(self, video_id):
1522                 """Report webpage download."""
1523                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1524
1525         def report_extraction(self, video_id):
1526                 """Report information extraction."""
1527                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1528
1529         def _real_initialize(self):
1530                 return
1531
1532         def _real_extract(self, url, new_video=True):
1533                 # Extract ID from URL
1534                 mobj = re.match(self._VALID_URL, url)
1535                 if mobj is None:
1536                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1537                         return
1538
1539                 # At this point we have a new video
1540                 self._downloader.increment_downloads()
1541                 video_id = mobj.group(2)
1542                 video_extension = 'flv'
1543
1544                 # Rewrite valid but non-extractable URLs as
1545                 # extractable English language /watch/ URLs
1546                 if re.match(self._VPAGE_URL, url) is None:
1547                         request = urllib2.Request(url)
1548                         try:
1549                                 webpage = urllib2.urlopen(request).read()
1550                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1551                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1552                                 return
1553
1554                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1555                         if mobj is None:
1556                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1557                                 return
1558                         yahoo_id = mobj.group(1)
1559
1560                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1561                         if mobj is None:
1562                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1563                                 return
1564                         yahoo_vid = mobj.group(1)
1565
1566                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1567                         return self._real_extract(url, new_video=False)
1568
1569                 # Retrieve video webpage to extract further information
1570                 request = urllib2.Request(url)
1571                 try:
1572                         self.report_download_webpage(video_id)
1573                         webpage = urllib2.urlopen(request).read()
1574                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1575                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1576                         return
1577
1578                 # Extract uploader and title from webpage
1579                 self.report_extraction(video_id)
1580                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1581                 if mobj is None:
1582                         self._downloader.trouble(u'ERROR: unable to extract video title')
1583                         return
1584                 video_title = mobj.group(1).decode('utf-8')
1585                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1586
1587                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1588                 if mobj is None:
1589                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1590                         return
1591                 video_uploader = mobj.group(1).decode('utf-8')
1592
1593                 # Extract video thumbnail
1594                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1595                 if mobj is None:
1596                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1597                         return
1598                 video_thumbnail = mobj.group(1).decode('utf-8')
1599
1600                 # Extract video description
1601                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1602                 if mobj is None:
1603                         self._downloader.trouble(u'ERROR: unable to extract video description')
1604                         return
1605                 video_description = mobj.group(1).decode('utf-8')
1606                 if not video_description: video_description = 'No description available.'
1607
1608                 # Extract video height and width
1609                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1610                 if mobj is None:
1611                         self._downloader.trouble(u'ERROR: unable to extract video height')
1612                         return
1613                 yv_video_height = mobj.group(1)
1614
1615                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1616                 if mobj is None:
1617                         self._downloader.trouble(u'ERROR: unable to extract video width')
1618                         return
1619                 yv_video_width = mobj.group(1)
1620
1621                 # Retrieve video playlist to extract media URL
1622                 # I'm not completely sure what all these options are, but we
1623                 # seem to need most of them, otherwise the server sends a 401.
1624                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1625                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1626                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1627                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1628                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1629                 try:
1630                         self.report_download_webpage(video_id)
1631                         webpage = urllib2.urlopen(request).read()
1632                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1633                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1634                         return
1635
1636                 # Extract media URL from playlist XML
1637                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1638                 if mobj is None:
1639                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1640                         return
1641                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1642                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1643
1644                 try:
1645                         # Process video information
1646                         self._downloader.process_info({
1647                                 'id':           video_id.decode('utf-8'),
1648                                 'url':          video_url,
1649                                 'uploader':     video_uploader,
1650                                 'upload_date':  u'NA',
1651                                 'title':        video_title,
1652                                 'stitle':       simple_title,
1653                                 'ext':          video_extension.decode('utf-8'),
1654                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1655                                 'description':  video_description,
1656                                 'thumbnail':    video_thumbnail,
1657                                 'description':  video_description,
1658                                 'player_url':   None,
1659                         })
1660                 except UnavailableVideoError:
1661                         self._downloader.trouble(u'\nERROR: unable to download video')
1662
1663
1664 class GenericIE(InfoExtractor):
1665         """Generic last-resort information extractor."""
1666
1667         def __init__(self, downloader=None):
1668                 InfoExtractor.__init__(self, downloader)
1669
1670         @staticmethod
1671         def suitable(url):
1672                 return True
1673
1674         def report_download_webpage(self, video_id):
1675                 """Report webpage download."""
1676                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1677                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1678
1679         def report_extraction(self, video_id):
1680                 """Report information extraction."""
1681                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1682
1683         def _real_initialize(self):
1684                 return
1685
1686         def _real_extract(self, url):
1687                 # At this point we have a new video
1688                 self._downloader.increment_downloads()
1689
1690                 video_id = url.split('/')[-1]
1691                 request = urllib2.Request(url)
1692                 try:
1693                         self.report_download_webpage(video_id)
1694                         webpage = urllib2.urlopen(request).read()
1695                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1696                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1697                         return
1698                 except ValueError, err:
1699                         # since this is the last-resort InfoExtractor, if
1700                         # this error is thrown, it'll be thrown here
1701                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1702                         return
1703
1704                 self.report_extraction(video_id)
1705                 # Start with something easy: JW Player in SWFObject
1706                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1707                 if mobj is None:
1708                         # Broaden the search a little bit
1709                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1710                 if mobj is None:
1711                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1712                         return
1713
1714                 # It's possible that one of the regexes
1715                 # matched, but returned an empty group:
1716                 if mobj.group(1) is None:
1717                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1718                         return
1719
1720                 video_url = urllib.unquote(mobj.group(1))
1721                 video_id  = os.path.basename(video_url)
1722
1723                 # here's a fun little line of code for you:
1724                 video_extension = os.path.splitext(video_id)[1][1:]
1725                 video_id        = os.path.splitext(video_id)[0]
1726
1727                 # it's tempting to parse this further, but you would
1728                 # have to take into account all the variations like
1729                 #   Video Title - Site Name
1730                 #   Site Name | Video Title
1731                 #   Video Title - Tagline | Site Name
1732                 # and so on and so forth; it's just not practical
1733                 mobj = re.search(r'<title>(.*)</title>', webpage)
1734                 if mobj is None:
1735                         self._downloader.trouble(u'ERROR: unable to extract title')
1736                         return
1737                 video_title = mobj.group(1).decode('utf-8')
1738                 video_title = sanitize_title(video_title)
1739                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1740
1741                 # video uploader is domain name
1742                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1743                 if mobj is None:
1744                         self._downloader.trouble(u'ERROR: unable to extract title')
1745                         return
1746                 video_uploader = mobj.group(1).decode('utf-8')
1747
1748                 try:
1749                         # Process video information
1750                         self._downloader.process_info({
1751                                 'id':           video_id.decode('utf-8'),
1752                                 'url':          video_url.decode('utf-8'),
1753                                 'uploader':     video_uploader,
1754                                 'upload_date':  u'NA',
1755                                 'title':        video_title,
1756                                 'stitle':       simple_title,
1757                                 'ext':          video_extension.decode('utf-8'),
1758                                 'format':       u'NA',
1759                                 'player_url':   None,
1760                         })
1761                 except UnavailableVideoError, err:
1762                         self._downloader.trouble(u'\nERROR: unable to download video')
1763
1764
1765 class YoutubeSearchIE(InfoExtractor):
1766         """Information Extractor for YouTube search queries."""
1767         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1768         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1769         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1770         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1771         _youtube_ie = None
1772         _max_youtube_results = 1000
1773
        def __init__(self, youtube_ie, downloader=None):
                # Keep a reference to the YouTube extractor; each search result
                # is later delegated to it as a plain watch URL.
                InfoExtractor.__init__(self, downloader)
                self._youtube_ie = youtube_ie
1777
1778         @staticmethod
1779         def suitable(url):
1780                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1781
        def report_download_page(self, query, pagenum):
                """Report attempt to download playlist page with given number."""
                # query arrives as a byte string; decode it for display
                query = query.decode(preferredencoding())
                self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1786
        def _real_initialize(self):
                # Initialization (e.g. login) is delegated to the YouTube IE.
                self._youtube_ie.initialize()
1789
        def _real_extract(self, query):
                """Parse a "ytsearch[N|all]:terms" pseudo-URL and download the
                requested number of search results."""
                mobj = re.match(self._VALID_QUERY, query)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
                        return

                prefix, query = query.split(':')
                # strip the leading "ytsearch"; what remains is '', 'all' or a number
                prefix = prefix[8:]
                query  = query.encode('utf-8')
                if prefix == '':
                        # bare "ytsearch:" downloads only the best match
                        self._download_n_results(query, 1)
                        return
                elif prefix == 'all':
                        self._download_n_results(query, self._max_youtube_results)
                        return
                else:
                        try:
                                n = long(prefix)
                                if n <= 0:
                                        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                                        return
                                elif n > self._max_youtube_results:
                                        # clamp to the service's practical maximum
                                        self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
                                        n = self._max_youtube_results
                                self._download_n_results(query, n)
                                return
                        except ValueError: # parsing prefix as integer fails
                                self._download_n_results(query, 1)
                                return
1819
1820         def _download_n_results(self, query, n):
1821                 """Downloads a specified number of results for a query"""
1822
1823                 video_ids = []
1824                 already_seen = set()
1825                 pagenum = 1
1826
1827                 while True:
1828                         self.report_download_page(query, pagenum)
1829                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1830                         request = urllib2.Request(result_url)
1831                         try:
1832                                 page = urllib2.urlopen(request).read()
1833                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1834                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1835                                 return
1836
1837                         # Extract video identifiers
1838                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1839                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1840                                 if video_id not in already_seen:
1841                                         video_ids.append(video_id)
1842                                         already_seen.add(video_id)
1843                                         if len(video_ids) == n:
1844                                                 # Specified n videos reached
1845                                                 for id in video_ids:
1846                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1847                                                 return
1848
1849                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1850                                 for id in video_ids:
1851                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1852                                 return
1853
1854                         pagenum = pagenum + 1
1855
1856 class GoogleSearchIE(InfoExtractor):
1857         """Information Extractor for Google Video search queries."""
1858         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1859         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1860         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1861         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1862         _google_ie = None
1863         _max_google_results = 1000
1864
1865         def __init__(self, google_ie, downloader=None):
1866                 InfoExtractor.__init__(self, downloader)
1867                 self._google_ie = google_ie
1868
1869         @staticmethod
1870         def suitable(url):
1871                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1872
1873         def report_download_page(self, query, pagenum):
1874                 """Report attempt to download playlist page with given number."""
1875                 query = query.decode(preferredencoding())
1876                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1877
1878         def _real_initialize(self):
1879                 self._google_ie.initialize()
1880
1881         def _real_extract(self, query):
1882                 mobj = re.match(self._VALID_QUERY, query)
1883                 if mobj is None:
1884                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1885                         return
1886
1887                 prefix, query = query.split(':')
1888                 prefix = prefix[8:]
1889                 query  = query.encode('utf-8')
1890                 if prefix == '':
1891                         self._download_n_results(query, 1)
1892                         return
1893                 elif prefix == 'all':
1894                         self._download_n_results(query, self._max_google_results)
1895                         return
1896                 else:
1897                         try:
1898                                 n = long(prefix)
1899                                 if n <= 0:
1900                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1901                                         return
1902                                 elif n > self._max_google_results:
1903                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
1904                                         n = self._max_google_results
1905                                 self._download_n_results(query, n)
1906                                 return
1907                         except ValueError: # parsing prefix as integer fails
1908                                 self._download_n_results(query, 1)
1909                                 return
1910
1911         def _download_n_results(self, query, n):
1912                 """Downloads a specified number of results for a query"""
1913
1914                 video_ids = []
1915                 already_seen = set()
1916                 pagenum = 1
1917
1918                 while True:
1919                         self.report_download_page(query, pagenum)
1920                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1921                         request = urllib2.Request(result_url)
1922                         try:
1923                                 page = urllib2.urlopen(request).read()
1924                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1925                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1926                                 return
1927
1928                         # Extract video identifiers
1929                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1930                                 video_id = mobj.group(1)
1931                                 if video_id not in already_seen:
1932                                         video_ids.append(video_id)
1933                                         already_seen.add(video_id)
1934                                         if len(video_ids) == n:
1935                                                 # Specified n videos reached
1936                                                 for id in video_ids:
1937                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1938                                                 return
1939
1940                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1941                                 for id in video_ids:
1942                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1943                                 return
1944
1945                         pagenum = pagenum + 1
1946
1947 class YahooSearchIE(InfoExtractor):
1948         """Information Extractor for Yahoo! Video search queries."""
1949         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1950         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1951         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1952         _MORE_PAGES_INDICATOR = r'\s*Next'
1953         _yahoo_ie = None
1954         _max_yahoo_results = 1000
1955
1956         def __init__(self, yahoo_ie, downloader=None):
1957                 InfoExtractor.__init__(self, downloader)
1958                 self._yahoo_ie = yahoo_ie
1959
1960         @staticmethod
1961         def suitable(url):
1962                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1963
1964         def report_download_page(self, query, pagenum):
1965                 """Report attempt to download playlist page with given number."""
1966                 query = query.decode(preferredencoding())
1967                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1968
1969         def _real_initialize(self):
1970                 self._yahoo_ie.initialize()
1971
1972         def _real_extract(self, query):
1973                 mobj = re.match(self._VALID_QUERY, query)
1974                 if mobj is None:
1975                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1976                         return
1977
1978                 prefix, query = query.split(':')
1979                 prefix = prefix[8:]
1980                 query  = query.encode('utf-8')
1981                 if prefix == '':
1982                         self._download_n_results(query, 1)
1983                         return
1984                 elif prefix == 'all':
1985                         self._download_n_results(query, self._max_yahoo_results)
1986                         return
1987                 else:
1988                         try:
1989                                 n = long(prefix)
1990                                 if n <= 0:
1991                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1992                                         return
1993                                 elif n > self._max_yahoo_results:
1994                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
1995                                         n = self._max_yahoo_results
1996                                 self._download_n_results(query, n)
1997                                 return
1998                         except ValueError: # parsing prefix as integer fails
1999                                 self._download_n_results(query, 1)
2000                                 return
2001
2002         def _download_n_results(self, query, n):
2003                 """Downloads a specified number of results for a query"""
2004
2005                 video_ids = []
2006                 already_seen = set()
2007                 pagenum = 1
2008
2009                 while True:
2010                         self.report_download_page(query, pagenum)
2011                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2012                         request = urllib2.Request(result_url)
2013                         try:
2014                                 page = urllib2.urlopen(request).read()
2015                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2016                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2017                                 return
2018
2019                         # Extract video identifiers
2020                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2021                                 video_id = mobj.group(1)
2022                                 if video_id not in already_seen:
2023                                         video_ids.append(video_id)
2024                                         already_seen.add(video_id)
2025                                         if len(video_ids) == n:
2026                                                 # Specified n videos reached
2027                                                 for id in video_ids:
2028                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2029                                                 return
2030
2031                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2032                                 for id in video_ids:
2033                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2034                                 return
2035
2036                         pagenum = pagenum + 1
2037
2038 class YoutubePlaylistIE(InfoExtractor):
2039         """Information Extractor for YouTube playlists."""
2040
2041         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
2042         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
2043         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2044         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2045         _youtube_ie = None
2046
2047         def __init__(self, youtube_ie, downloader=None):
2048                 InfoExtractor.__init__(self, downloader)
2049                 self._youtube_ie = youtube_ie
2050
2051         @staticmethod
2052         def suitable(url):
2053                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2054
2055         def report_download_page(self, playlist_id, pagenum):
2056                 """Report attempt to download playlist page with given number."""
2057                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2058
2059         def _real_initialize(self):
2060                 self._youtube_ie.initialize()
2061
2062         def _real_extract(self, url):
2063                 # Extract playlist id
2064                 mobj = re.match(self._VALID_URL, url)
2065                 if mobj is None:
2066                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2067                         return
2068
2069                 # Download playlist pages
2070                 playlist_id = mobj.group(1)
2071                 video_ids = []
2072                 pagenum = 1
2073
2074                 while True:
2075                         self.report_download_page(playlist_id, pagenum)
2076                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum))
2077                         try:
2078                                 page = urllib2.urlopen(request).read()
2079                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2080                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2081                                 return
2082
2083                         # Extract video identifiers
2084                         ids_in_page = []
2085                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2086                                 if mobj.group(1) not in ids_in_page:
2087                                         ids_in_page.append(mobj.group(1))
2088                         video_ids.extend(ids_in_page)
2089
2090                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2091                                 break
2092                         pagenum = pagenum + 1
2093
2094                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2095                 playlistend = self._downloader.params.get('playlistend', -1)
2096                 video_ids = video_ids[playliststart:playlistend]
2097
2098                 for id in video_ids:
2099                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2100                 return
2101
2102 class YoutubeUserIE(InfoExtractor):
2103         """Information Extractor for YouTube users."""
2104
2105         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
2106         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2107         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
2108         _youtube_ie = None
2109
2110         def __init__(self, youtube_ie, downloader=None):
2111                 InfoExtractor.__init__(self, downloader)
2112                 self._youtube_ie = youtube_ie
2113
2114         @staticmethod
2115         def suitable(url):
2116                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2117
2118         def report_download_page(self, username):
2119                 """Report attempt to download user page."""
2120                 self._downloader.to_screen(u'[youtube] user %s: Downloading page ' % (username))
2121
2122         def _real_initialize(self):
2123                 self._youtube_ie.initialize()
2124
2125         def _real_extract(self, url):
2126                 # Extract username
2127                 mobj = re.match(self._VALID_URL, url)
2128                 if mobj is None:
2129                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2130                         return
2131
2132                 # Download user page
2133                 username = mobj.group(1)
2134                 video_ids = []
2135                 pagenum = 1
2136
2137                 self.report_download_page(username)
2138                 request = urllib2.Request(self._TEMPLATE_URL % (username))
2139                 try:
2140                         page = urllib2.urlopen(request).read()
2141                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2142                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2143                         return
2144
2145                 # Extract video identifiers
2146                 ids_in_page = []
2147
2148                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2149                         if mobj.group(1) not in ids_in_page:
2150                                 ids_in_page.append(mobj.group(1))
2151                 video_ids.extend(ids_in_page)
2152
2153                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2154                 playlistend = self._downloader.params.get('playlistend', -1)
2155                 video_ids = video_ids[playliststart:playlistend]
2156
2157                 for id in video_ids:
2158                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2159                 return
2160
2161 class DepositFilesIE(InfoExtractor):
2162         """Information extractor for depositfiles.com"""
2163
2164         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2165
2166         def __init__(self, downloader=None):
2167                 InfoExtractor.__init__(self, downloader)
2168
2169         @staticmethod
2170         def suitable(url):
2171                 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2172
2173         def report_download_webpage(self, file_id):
2174                 """Report webpage download."""
2175                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2176
2177         def report_extraction(self, file_id):
2178                 """Report information extraction."""
2179                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2180
2181         def _real_initialize(self):
2182                 return
2183
2184         def _real_extract(self, url):
2185                 # At this point we have a new file
2186                 self._downloader.increment_downloads()
2187
2188                 file_id = url.split('/')[-1]
2189                 # Rebuild url in english locale
2190                 url = 'http://depositfiles.com/en/files/' + file_id
2191
2192                 # Retrieve file webpage with 'Free download' button pressed
2193                 free_download_indication = { 'gateway_result' : '1' }
2194                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2195                 try:
2196                         self.report_download_webpage(file_id)
2197                         webpage = urllib2.urlopen(request).read()
2198                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2199                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2200                         return
2201
2202                 # Search for the real file URL
2203                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2204                 if (mobj is None) or (mobj.group(1) is None):
2205                         # Try to figure out reason of the error.
2206                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2207                         if (mobj is not None) and (mobj.group(1) is not None):
2208                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2209                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2210                         else:
2211                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2212                         return
2213
2214                 file_url = mobj.group(1)
2215                 file_extension = os.path.splitext(file_url)[1][1:]
2216
2217                 # Search for file title
2218                 mobj = re.search(r'<b title="(.*?)">', webpage)
2219                 if mobj is None:
2220                         self._downloader.trouble(u'ERROR: unable to extract title')
2221                         return
2222                 file_title = mobj.group(1).decode('utf-8')
2223
2224                 try:
2225                         # Process file information
2226                         self._downloader.process_info({
2227                                 'id':           file_id.decode('utf-8'),
2228                                 'url':          file_url.decode('utf-8'),
2229                                 'uploader':     u'NA',
2230                                 'upload_date':  u'NA',
2231                                 'title':        file_title,
2232                                 'stitle':       file_title,
2233                                 'ext':          file_extension.decode('utf-8'),
2234                                 'format':       u'NA',
2235                                 'player_url':   None,
2236                         })
2237                 except UnavailableVideoError, err:
2238                         self._downloader.trouble(u'ERROR: unable to download file')
2239
class PostProcessor(object):
	"""Base class for post-download processing steps.

	Instances are registered on a downloader via its add_post_processor()
	method. After each successful download the downloader walks its chain
	of PostProcessors, feeding the first one the download's information
	dictionary and each subsequent one the value returned by its
	predecessor. A None return value stops the chain.

	Like InfoExtractor objects, PostProcessors keep a back-reference to
	the downloader they belong to ("mutual registration").
	"""

	# Back-reference to the owning downloader (None until registered).
	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Record *downloader* as the owner of this post processor."""
		self._downloader = downloader

	def run(self, information):
		"""Execute this post-processing step.

		*information* is an InfoExtractor-style dictionary extended with
		a "filepath" entry pointing at the downloaded file. The return
		value is passed on to the next PostProcessor in the chain; it may
		be the same dictionary, possibly with some fields changed.
		Returning None stops the chain, and a PostProcessingError may be
		raised to signal failure to the calling downloader.
		"""
		# The base implementation is a no-op pass-through.
		return information
2285
2286 ### MAIN PROGRAM ###
2287 if __name__ == '__main__':
2288         try:
2289                 # Modules needed only when running the main program
2290                 import getpass
2291                 import optparse
2292
2293                 # Function to update the program file with the latest version from bitbucket.org
2294                 def update_self(downloader, filename):
2295                         # Note: downloader only used for options
2296                         if not os.access (filename, os.W_OK):
2297                                 sys.exit('ERROR: no write permissions on %s' % filename)
2298
2299                         downloader.to_screen('Updating to latest stable version...')
2300                         latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2301                         latest_version = urllib.urlopen(latest_url).read().strip()
2302                         prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2303                         newcontent = urllib.urlopen(prog_url).read()
2304                         stream = open(filename, 'w')
2305                         stream.write(newcontent)
2306                         stream.close()
2307                         downloader.to_screen('Updated to version %s' % latest_version)
2308
2309                 # Parse command line
2310                 parser = optparse.OptionParser(
2311                         usage='Usage: %prog [options] url...',
2312                         version='2010.12.09',
2313                         conflict_handler='resolve',
2314                 )
2315
2316                 parser.add_option('-h', '--help',
2317                                 action='help', help='print this help text and exit')
2318                 parser.add_option('-v', '--version',
2319                                 action='version', help='print program version and exit')
2320                 parser.add_option('-U', '--update',
2321                                 action='store_true', dest='update_self', help='update this program to latest stable version')
2322                 parser.add_option('-i', '--ignore-errors',
2323                                 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2324                 parser.add_option('-r', '--rate-limit',
2325                                 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2326                 parser.add_option('-R', '--retries',
2327                                 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2328                 parser.add_option('--playlist-start',
2329                                 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2330                 parser.add_option('--playlist-end',
2331                                 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
2332                 parser.add_option('--dump-user-agent',
2333                                 action='store_true', dest='dump_user_agent', help='display the current browser identification', default=False)
2334
2335                 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2336                 authentication.add_option('-u', '--username',
2337                                 dest='username', metavar='USERNAME', help='account username')
2338                 authentication.add_option('-p', '--password',
2339                                 dest='password', metavar='PASSWORD', help='account password')
2340                 authentication.add_option('-n', '--netrc',
2341                                 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2342                 parser.add_option_group(authentication)
2343
2344                 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2345                 video_format.add_option('-f', '--format',
2346                                 action='store', dest='format', metavar='FORMAT', help='video format code')
2347                 video_format.add_option('--all-formats',
2348                                 action='store_const', dest='format', help='download all available video formats', const='-1')
2349                 video_format.add_option('--max-quality',
2350                                 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2351                 parser.add_option_group(video_format)
2352
2353                 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2354                 verbosity.add_option('-q', '--quiet',
2355                                 action='store_true', dest='quiet', help='activates quiet mode', default=False)
2356                 verbosity.add_option('-s', '--simulate',
2357                                 action='store_true', dest='simulate', help='do not download video', default=False)
2358                 verbosity.add_option('-g', '--get-url',
2359                                 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2360                 verbosity.add_option('-e', '--get-title',
2361                                 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
2362                 verbosity.add_option('--get-thumbnail',
2363                                 action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
2364                 verbosity.add_option('--get-description',
2365                                 action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
2366                 verbosity.add_option('--no-progress',
2367                                 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
2368                 verbosity.add_option('--console-title',
2369                                 action='store_true', dest='consoletitle', help='display progress in console titlebar', default=False)
2370                 parser.add_option_group(verbosity)
2371
2372                 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
2373                 filesystem.add_option('-t', '--title',
2374                                 action='store_true', dest='usetitle', help='use title in file name', default=False)
2375                 filesystem.add_option('-l', '--literal',
2376                                 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
2377                 filesystem.add_option('-A', '--auto-number',
2378                                 action='store_true', dest='autonumber', help='number downloaded files starting from 00000', default=False)
2379                 filesystem.add_option('-o', '--output',
2380                                 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
2381                 filesystem.add_option('-a', '--batch-file',
2382                                 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
2383                 filesystem.add_option('-w', '--no-overwrites',
2384                                 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
2385                 filesystem.add_option('-c', '--continue',
2386                                 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
2387                 filesystem.add_option('--cookies',
2388                                 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
2389                 filesystem.add_option('--no-part',
2390                                 action='store_true', dest='nopart', help='do not use .part files', default=False)
2391                 parser.add_option_group(filesystem)
2392
2393                 (opts, args) = parser.parse_args()
2394
2395                 # Open appropriate CookieJar
2396                 if opts.cookiefile is None:
2397                         jar = cookielib.CookieJar()
2398                 else:
2399                         try:
2400                                 jar = cookielib.MozillaCookieJar(opts.cookiefile)
2401                                 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
2402                                         jar.load()
2403                         except (IOError, OSError), err:
2404                                 sys.exit(u'ERROR: unable to open cookie file')
2405
2406                 # Dump user agent
2407                 if opts.dump_user_agent:
2408                         print std_headers['User-Agent']
2409                         sys.exit(0)
2410
2411                 # General configuration
2412                 cookie_processor = urllib2.HTTPCookieProcessor(jar)
2413                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
2414                 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
2415
2416                 # Batch file verification
2417                 batchurls = []
2418                 if opts.batchfile is not None:
2419                         try:
2420                                 if opts.batchfile == '-':
2421                                         batchfd = sys.stdin
2422                                 else:
2423                                         batchfd = open(opts.batchfile, 'r')
2424                                 batchurls = batchfd.readlines()
2425                                 batchurls = [x.strip() for x in batchurls]
2426                                 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
2427                         except IOError:
2428                                 sys.exit(u'ERROR: batch file could not be read')
2429                 all_urls = batchurls + args
2430
2431                 # Conflicting, missing and erroneous options
2432                 if opts.usenetrc and (opts.username is not None or opts.password is not None):
2433                         parser.error(u'using .netrc conflicts with giving username/password')
2434                 if opts.password is not None and opts.username is None:
2435                         parser.error(u'account username missing')
2436                 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
2437                         parser.error(u'using output template conflicts with using title, literal title or auto number')
2438                 if opts.usetitle and opts.useliteral:
2439                         parser.error(u'using title conflicts with using literal title')
2440                 if opts.username is not None and opts.password is None:
2441                         opts.password = getpass.getpass(u'Type account password and press return:')
2442                 if opts.ratelimit is not None:
2443                         numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
2444                         if numeric_limit is None:
2445                                 parser.error(u'invalid rate limit specified')
2446                         opts.ratelimit = numeric_limit
2447                 if opts.retries is not None:
2448                         try:
2449                                 opts.retries = long(opts.retries)
2450                         except (TypeError, ValueError), err:
2451                                 parser.error(u'invalid retry count specified')
2452                 try:
2453                         opts.playliststart = long(opts.playliststart)
2454                         if opts.playliststart <= 0:
2455                                 raise ValueError
2456                 except (TypeError, ValueError), err:
2457                         parser.error(u'invalid playlist start number specified')
2458                 try:
2459                         opts.playlistend = long(opts.playlistend)
2460                         if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
2461                                 raise ValueError
2462                 except (TypeError, ValueError), err:
2463                         parser.error(u'invalid playlist end number specified')
2464
2465                 # Information extractors
2466                 youtube_ie = YoutubeIE()
2467                 metacafe_ie = MetacafeIE(youtube_ie)
2468                 dailymotion_ie = DailymotionIE()
2469                 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
2470                 youtube_user_ie = YoutubeUserIE(youtube_ie)
2471                 youtube_search_ie = YoutubeSearchIE(youtube_ie)
2472                 google_ie = GoogleIE()
2473                 google_search_ie = GoogleSearchIE(google_ie)
2474                 photobucket_ie = PhotobucketIE()
2475                 yahoo_ie = YahooIE()
2476                 yahoo_search_ie = YahooSearchIE(yahoo_ie)
2477                 deposit_files_ie = DepositFilesIE()
2478                 generic_ie = GenericIE()
2479
2480                 # File downloader
2481                 fd = FileDownloader({
2482                         'usenetrc': opts.usenetrc,
2483                         'username': opts.username,
2484                         'password': opts.password,
2485                         'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2486                         'forceurl': opts.geturl,
2487                         'forcetitle': opts.gettitle,
2488                         'forcethumbnail': opts.getthumbnail,
2489                         'forcedescription': opts.getdescription,
2490                         'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2491                         'format': opts.format,
2492                         'format_limit': opts.format_limit,
2493                         'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
2494                                 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
2495                                 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
2496                                 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
2497                                 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
2498                                 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
2499                                 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
2500                                 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
2501                                 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
2502                                 or u'%(id)s.%(ext)s'),
2503                         'ignoreerrors': opts.ignoreerrors,
2504                         'ratelimit': opts.ratelimit,
2505                         'nooverwrites': opts.nooverwrites,
2506                         'retries': opts.retries,
2507                         'continuedl': opts.continue_dl,
2508                         'noprogress': opts.noprogress,
2509                         'playliststart': opts.playliststart,
2510                         'playlistend': opts.playlistend,
2511                         'logtostderr': opts.outtmpl == '-',
2512                         'consoletitle': opts.consoletitle,
2513                         'nopart': opts.nopart,
2514                         })
2515                 fd.add_info_extractor(youtube_search_ie)
2516                 fd.add_info_extractor(youtube_pl_ie)
2517                 fd.add_info_extractor(youtube_user_ie)
2518                 fd.add_info_extractor(metacafe_ie)
2519                 fd.add_info_extractor(dailymotion_ie)
2520                 fd.add_info_extractor(youtube_ie)
2521                 fd.add_info_extractor(google_ie)
2522                 fd.add_info_extractor(google_search_ie)
2523                 fd.add_info_extractor(photobucket_ie)
2524                 fd.add_info_extractor(yahoo_ie)
2525                 fd.add_info_extractor(yahoo_search_ie)
2526                 fd.add_info_extractor(deposit_files_ie)
2527
2528                 # This must come last since it's the
2529                 # fallback if none of the others work
2530                 fd.add_info_extractor(generic_ie)
2531
2532                 # Update version
2533                 if opts.update_self:
2534                         update_self(fd, sys.argv[0])
2535
2536                 # Maybe do nothing
2537                 if len(all_urls) < 1:
2538                         if not opts.update_self:
2539                                 parser.error(u'you must provide at least one URL')
2540                         else:
2541                                 sys.exit()
2542                 retcode = fd.download(all_urls)
2543
2544                 # Dump cookie jar if requested
2545                 if opts.cookiefile is not None:
2546                         try:
2547                                 jar.save()
2548                         except (IOError, OSError), err:
2549                                 sys.exit(u'ERROR: unable to save cookie jar')
2550
2551                 sys.exit(retcode)
2552
2553         except DownloadError:
2554                 sys.exit(1)
2555         except SameFileError:
2556                 sys.exit(u'ERROR: fixed output name but more than one file to download')
2557         except KeyboardInterrupt:
2558                 sys.exit(u'\nERROR: Interrupted by user')