youtube-dl

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 # Author: Ricardo Garcia Gonzalez
   4 # Author: Danny Colligan
   5 # License: Public domain code
   6 import htmlentitydefs
   7 import httplib
   8 import locale
   9 import math
  10 import netrc
  11 import os
  12 import os.path
  13 import re
  14 import socket
  15 import string
  16 import sys
  17 import time
  18 import urllib
  19 import urllib2
  20
# HTTP headers passed to the urllib2.Request objects built in this file.
# The User-Agent string identifies the client as Firefox 3.0.8 on Windows.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.8) Gecko/2009032609 Firefox/3.0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
        'Accept-Language': 'en-us,en;q=0.5',
}

# ASCII letters and digits as a unicode string. YoutubeIE interpolates it
# into a regex character class to build the simplified title ("stitle").
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
  29
  30 class DownloadError(Exception):
  31         """Download Error exception.
  32
  33         This exception may be thrown by FileDownloader objects if they are not
  34         configured to continue on errors. They will contain the appropriate
  35         error message.
  36         """
  37         pass
  38
  39 class SameFileError(Exception):
  40         """Same File exception.
  41
  42         This exception will be thrown by FileDownloader objects if they detect
  43         multiple files would have to be downloaded to the same file on disk.
  44         """
  45         pass
  46
  47 class PostProcessingError(Exception):
  48         """Post Processing exception.
  49
  50         This exception may be raised by PostProcessor's .run() method to
  51         indicate an error in the postprocessing task.
  52         """
  53         pass
  54
  55 class UnavailableFormatError(Exception):
  56         """Unavailable Format exception.
  57
  58         This exception will be thrown when a video is requested
  59         in a format that is not available for that video.
  60         """
  61
  62 class FileDownloader(object):
  63         """File Downloader class.
  64
  65         File downloader objects are the ones responsible of downloading the
  66         actual video file and writing it to disk if the user has requested
  67         it, among some other tasks. In most cases there should be one per
  68         program. As, given a video URL, the downloader doesn't know how to
  69         extract all the needed information, task that InfoExtractors do, it
  70         has to pass the URL to one of them.
  71
  72         For this, file downloader objects have a method that allows
  73         InfoExtractors to be registered in a given order. When it is passed
  74         a URL, the file downloader handles it to the first InfoExtractor it
  75         finds that reports being able to handle it. The InfoExtractor extracts
  76         all the information about the video or videos the URL refers to, and
  77         asks the FileDownloader to process the video information, possibly
  78         downloading the video.
  79
  80         File downloaders accept a lot of parameters. In order not to saturate
  81         the object constructor with arguments, it receives a dictionary of
  82         options instead. These options are available through the params
  83         attribute for the InfoExtractors to use. The FileDownloader also
  84         registers itself as the downloader in charge for the InfoExtractors
  85         that are added to it, so this is a "mutual registration".
  86
  87         Available options:
  88
  89         username:       Username for authentication purposes.
  90         password:       Password for authentication purposes.
  91         usenetrc:       Use netrc for authentication instead.
  92         quiet:          Do not print messages to stdout.
  93         forceurl:       Force printing final URL.
  94         forcetitle:     Force printing title.
  95         simulate:       Do not download the video files.
  96         format:         Video format code.
  97         outtmpl:        Template for output names.
  98         ignoreerrors:   Do not stop on download errors.
  99         ratelimit:      Download speed limit, in bytes/sec.
 100         nooverwrites:   Prevent overwriting files.
 101         """
 102
 103         params = None
 104         _ies = []
 105         _pps = []
 106         _download_retcode = None
 107
 108         def __init__(self, params):
 109                 """Create a FileDownloader object with the given options."""
 110                 self._ies = []
 111                 self._pps = []
 112                 self._download_retcode = 0
 113                 self.params = params
 114
 115         @staticmethod
 116         def pmkdir(filename):
 117                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
 118                 components = filename.split(os.sep)
 119                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
 120                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
 121                 for dir in aggregate:
 122                         if not os.path.exists(dir):
 123                                 os.mkdir(dir)
 124
 125         @staticmethod
 126         def format_bytes(bytes):
 127                 if bytes is None:
 128                         return 'N/A'
 129                 if bytes == 0:
 130                         exponent = 0
 131                 else:
 132                         exponent = long(math.log(float(bytes), 1024.0))
 133                 suffix = 'bkMGTPEZY'[exponent]
 134                 converted = float(bytes) / float(1024**exponent)
 135                 return '%.2f%s' % (converted, suffix)
 136
 137         @staticmethod
 138         def calc_percent(byte_counter, data_len):
 139                 if data_len is None:
 140                         return '---.-%'
 141                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
 142
 143         @staticmethod
 144         def calc_eta(start, now, total, current):
 145                 if total is None:
 146                         return '--:--'
 147                 dif = now - start
 148                 if current == 0 or dif < 0.001: # One millisecond
 149                         return '--:--'
 150                 rate = float(current) / dif
 151                 eta = long((float(total) - float(current)) / rate)
 152                 (eta_mins, eta_secs) = divmod(eta, 60)
 153                 if eta_mins > 99:
 154                         return '--:--'
 155                 return '%02d:%02d' % (eta_mins, eta_secs)
 156
 157         @staticmethod
 158         def calc_speed(start, now, bytes):
 159                 dif = now - start
 160                 if bytes == 0 or dif < 0.001: # One millisecond
 161                         return '%10s' % '---b/s'
 162                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
 163
 164         @staticmethod
 165         def best_block_size(elapsed_time, bytes):
 166                 new_min = max(bytes / 2.0, 1.0)
 167                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
 168                 if elapsed_time < 0.001:
 169                         return int(new_max)
 170                 rate = bytes / elapsed_time
 171                 if rate > new_max:
 172                         return int(new_max)
 173                 if rate < new_min:
 174                         return int(new_min)
 175                 return int(rate)
 176
 177         @staticmethod
 178         def parse_bytes(bytestr):
 179                 """Parse a string indicating a byte quantity into a long integer."""
 180                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
 181                 if matchobj is None:
 182                         return None
 183                 number = float(matchobj.group(1))
 184                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
 185                 return long(round(number * multiplier))
 186
 187         def add_info_extractor(self, ie):
 188                 """Add an InfoExtractor object to the end of the list."""
 189                 self._ies.append(ie)
 190                 ie.set_downloader(self)
 191
 192         def add_post_processor(self, pp):
 193                 """Add a PostProcessor object to the end of the chain."""
 194                 self._pps.append(pp)
 195                 pp.set_downloader(self)
 196
 197         def to_stdout(self, message, skip_eol=False):
 198                 """Print message to stdout if not in quiet mode."""
 199                 if not self.params.get('quiet', False):
 200                         print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(locale.getpreferredencoding()),
 201                         sys.stdout.flush()
 202
 203         def to_stderr(self, message):
 204                 """Print message to stderr."""
 205                 print >>sys.stderr, message
 206
 207         def fixed_template(self):
 208                 """Checks if the output template is fixed."""
 209                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
 210
 211         def trouble(self, message=None):
 212                 """Determine action to take when a download problem appears.
 213
 214                 Depending on if the downloader has been configured to ignore
 215                 download errors or not, this method may throw an exception or
 216                 not when errors are found, after printing the message.
 217                 """
 218                 if message is not None:
 219                         self.to_stderr(message)
 220                 if not self.params.get('ignoreerrors', False):
 221                         raise DownloadError(message)
 222                 self._download_retcode = 1
 223
 224         def slow_down(self, start_time, byte_counter):
 225                 """Sleep if the download speed is over the rate limit."""
 226                 rate_limit = self.params.get('ratelimit', None)
 227                 if rate_limit is None or byte_counter == 0:
 228                         return
 229                 now = time.time()
 230                 elapsed = now - start_time
 231                 if elapsed <= 0.0:
 232                         return
 233                 speed = float(byte_counter) / elapsed
 234                 if speed > rate_limit:
 235                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
 236
 237         def report_destination(self, filename):
 238                 """Report destination filename."""
 239                 self.to_stdout(u'[download] Destination: %s' % filename)
 240
 241         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
 242                 """Report download progress."""
 243                 self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
 244                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
 245
 246         def report_finish(self):
 247                 """Report download finished."""
 248                 self.to_stdout(u'')
 249
 250         def process_info(self, info_dict):
 251                 """Process a single dictionary returned by an InfoExtractor."""
 252                 # Forced printings
 253                 if self.params.get('forcetitle', False):
 254                         print info_dict['title'].encode(locale.getpreferredencoding())
 255                 if self.params.get('forceurl', False):
 256                         print info_dict['url'].encode(locale.getpreferredencoding())
 257
 258                 # Do nothing else if in simulate mode
 259                 if self.params.get('simulate', False):
 260                         return
 261
 262                 try:
 263                         template_dict = dict(info_dict)
 264                         template_dict['epoch'] = unicode(long(time.time()))
 265                         filename = self.params['outtmpl'] % template_dict
 266                         self.report_destination(filename)
 267                 except (ValueError, KeyError), err:
 268                         self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
 269                 if self.params['nooverwrites'] and os.path.exists(filename):
 270                         self.to_stderr('WARNING: file exists: %s; skipping' % filename)
 271                         return
 272
 273                 try:
 274                         self.pmkdir(filename)
 275                 except (OSError, IOError), err:
 276                         self.trouble('ERROR: unable to create directories: %s' % str(err))
 277                         return
 278
 279                 try:
 280                         outstream = open(filename, 'wb')
 281                 except (OSError, IOError), err:
 282                         self.trouble('ERROR: unable to open for writing: %s' % str(err))
 283                         return
 284
 285                 try:
 286                         self._do_download(outstream, info_dict['url'])
 287                         outstream.close()
 288                 except (OSError, IOError), err:
 289                         os.remove(filename)
 290                         raise UnavailableFormatError
 291                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 292                         self.trouble('ERROR: unable to download video data: %s' % str(err))
 293                         return
 294
 295                 try:
 296                         self.post_process(filename, info_dict)
 297                 except (PostProcessingError), err:
 298                         self.trouble('ERROR: postprocessing: %s' % str(err))
 299                         return
 300
 301         def download(self, url_list):
 302                 """Download a given list of URLs."""
 303                 if len(url_list) > 1 and self.fixed_template():
 304                         raise SameFileError(self.params['outtmpl'])
 305
 306                 for url in url_list:
 307                         suitable_found = False
 308                         for ie in self._ies:
 309                                 # Go to next InfoExtractor if not suitable
 310                                 if not ie.suitable(url):
 311                                         continue
 312
 313                                 # Suitable InfoExtractor found
 314                                 suitable_found = True
 315
 316                                 # Extract information from URL and process it
 317                                 ie.extract(url)
 318
 319                                 # Suitable InfoExtractor had been found; go to next URL
 320                                 break
 321
 322                         if not suitable_found:
 323                                 self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
 324
 325                 return self._download_retcode
 326
 327         def post_process(self, filename, ie_info):
 328                 """Run the postprocessing chain on the given file."""
 329                 info = dict(ie_info)
 330                 info['filepath'] = filename
 331                 for pp in self._pps:
 332                         info = pp.run(info)
 333                         if info is None:
 334                                 break
 335
 336         def _do_download(self, stream, url):
 337                 request = urllib2.Request(url, None, std_headers)
 338                 data = urllib2.urlopen(request)
 339                 data_len = data.info().get('Content-length', None)
 340                 data_len_str = self.format_bytes(data_len)
 341                 byte_counter = 0
 342                 block_size = 1024
 343                 start = time.time()
 344                 while True:
 345                         # Progress message
 346                         percent_str = self.calc_percent(byte_counter, data_len)
 347                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
 348                         speed_str = self.calc_speed(start, time.time(), byte_counter)
 349                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
 350
 351                         # Download and write
 352                         before = time.time()
 353                         data_block = data.read(block_size)
 354                         after = time.time()
 355                         data_block_len = len(data_block)
 356                         if data_block_len == 0:
 357                                 break
 358                         byte_counter += data_block_len
 359                         stream.write(data_block)
 360                         block_size = self.best_block_size(after - before, data_block_len)
 361
 362                         # Apply rate limit
 363                         self.slow_down(start, byte_counter)
 364
 365                 self.report_finish()
 366                 if data_len is not None and str(byte_counter) != data_len:
 367                         raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
 368
 369 class InfoExtractor(object):
 370         """Information Extractor class.
 371
 372         Information extractors are the classes that, given a URL, extract
 373         information from the video (or videos) the URL refers to. This
 374         information includes the real video URL, the video title and simplified
 375         title, author and others. The information is stored in a dictionary
 376         which is then passed to the FileDownloader. The FileDownloader
 377         processes this information possibly downloading the video to the file
 378         system, among other possible outcomes. The dictionaries must include
 379         the following fields:
 380
 381         id:             Video identifier.
 382         url:            Final video URL.
 383         uploader:       Nickname of the video uploader.
 384         title:          Literal title.
 385         stitle:         Simplified title.
 386         ext:            Video filename extension.
 387
 388         Subclasses of this one should re-define the _real_initialize() and
 389         _real_extract() methods, as well as the suitable() static method.
 390         Probably, they should also be instantiated and added to the main
 391         downloader.
 392         """
 393
 394         _ready = False
 395         _downloader = None
 396
 397         def __init__(self, downloader=None):
 398                 """Constructor. Receives an optional downloader."""
 399                 self._ready = False
 400                 self.set_downloader(downloader)
 401
 402         @staticmethod
 403         def suitable(url):
 404                 """Receives a URL and returns True if suitable for this IE."""
 405                 return False
 406
 407         def initialize(self):
 408                 """Initializes an instance (authentication, etc)."""
 409                 if not self._ready:
 410                         self._real_initialize()
 411                         self._ready = True
 412
 413         def extract(self, url):
 414                 """Extracts URL information and returns it in list of dicts."""
 415                 self.initialize()
 416                 return self._real_extract(url)
 417
 418         def set_downloader(self, downloader):
 419                 """Sets the downloader for this IE."""
 420                 self._downloader = downloader
 421
 422         def _real_initialize(self):
 423                 """Real initialization process. Redefine in subclasses."""
 424                 pass
 425
 426         def _real_extract(self, url):
 427                 """Real extraction process. Redefine in subclasses."""
 428                 pass
 429
 430 class YoutubeIE(InfoExtractor):
 431         """Information extractor for youtube.com."""
 432
 433         _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
 434         _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 435         _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
 436         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 437         _NETRC_MACHINE = 'youtube'
 438         _available_formats = ['22', '35', '18', '17', '13'] # listed in order of priority for -b flag
 439         _video_extensions = {
 440                 '13': '3gp',
 441                 '17': 'mp4',
 442                 '18': 'mp4',
 443                 '22': 'mp4',
 444         }
 445
 446         @staticmethod
 447         def suitable(url):
 448                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
 449
 450         @staticmethod
 451         def htmlentity_transform(matchobj):
 452                 """Transforms an HTML entity to a Unicode character."""
 453                 entity = matchobj.group(1)
 454
 455                 # Known non-numeric HTML entity
 456                 if entity in htmlentitydefs.name2codepoint:
 457                         return unichr(htmlentitydefs.name2codepoint[entity])
 458
 459                 # Unicode character
 460                 mobj = re.match(ur'(?u)#(x?\d+)', entity)
 461                 if mobj is not None:
 462                         numstr = mobj.group(1)
 463                         if numstr.startswith(u'x'):
 464                                 base = 16
 465                                 numstr = u'0%s' % numstr
 466                         else:
 467                                 base = 10
 468                         return unichr(long(numstr, base))
 469
 470                 # Unknown entity in name, return its literal representation
 471                 return (u'&%s;' % entity)
 472
 473         def report_lang(self):
 474                 """Report attempt to set language."""
 475                 self._downloader.to_stdout(u'[youtube] Setting language')
 476
 477         def report_login(self):
 478                 """Report attempt to log in."""
 479                 self._downloader.to_stdout(u'[youtube] Logging in')
 480
 481         def report_age_confirmation(self):
 482                 """Report attempt to confirm age."""
 483                 self._downloader.to_stdout(u'[youtube] Confirming age')
 484
 485         def report_webpage_download(self, video_id):
 486                 """Report attempt to download webpage."""
 487                 self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)
 488
 489         def report_information_extraction(self, video_id):
 490                 """Report attempt to extract video information."""
 491                 self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
 492
 493         def report_video_url(self, video_id, video_real_url):
 494                 """Report extracted video URL."""
 495                 self._downloader.to_stdout(u'[youtube] %s: URL: %s' % (video_id, video_real_url))
 496
 497         def report_unavailable_format(self, video_id, format):
 498                 """Report extracted video URL."""
 499                 self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))
 500
 501         def _real_initialize(self):
 502                 if self._downloader is None:
 503                         return
 504
 505                 username = None
 506                 password = None
 507                 downloader_params = self._downloader.params
 508
 509                 # Attempt to use provided username and password or .netrc data
 510                 if downloader_params.get('username', None) is not None:
 511                         username = downloader_params['username']
 512                         password = downloader_params['password']
 513                 elif downloader_params.get('usenetrc', False):
 514                         try:
 515                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 516                                 if info is not None:
 517                                         username = info[0]
 518                                         password = info[2]
 519                                 else:
 520                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 521                         except (IOError, netrc.NetrcParseError), err:
 522                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
 523                                 return
 524
 525                 # Set language
 526                 request = urllib2.Request(self._LANG_URL, None, std_headers)
 527                 try:
 528                         self.report_lang()
 529                         urllib2.urlopen(request).read()
 530                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 531                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
 532                         return
 533
 534                 # No authentication to be performed
 535                 if username is None:
 536                         return
 537
 538                 # Log in
 539                 login_form = {
 540                                 'current_form': 'loginForm',
 541                                 'next':         '/',
 542                                 'action_login': 'Log In',
 543                                 'username':     username,
 544                                 'password':     password,
 545                                 }
 546                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
 547                 try:
 548                         self.report_login()
 549                         login_results = urllib2.urlopen(request).read()
 550                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 551                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
 552                                 return
 553                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 554                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
 555                         return
 556
 557                 # Confirm age
 558                 age_form = {
 559                                 'next_url':             '/',
 560                                 'action_confirm':       'Confirm',
 561                                 }
 562                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
 563                 try:
 564                         self.report_age_confirmation()
 565                         age_results = urllib2.urlopen(request).read()
 566                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 567                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
 568                         return
 569
 570         def _real_extract(self, url):
 571                 # Extract video id from URL
 572                 mobj = re.match(self._VALID_URL, url)
 573                 if mobj is None:
 574                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 575                         return
 576                 video_id = mobj.group(2)
 577
 578                 # Downloader parameters
 579                 best_quality = False
 580                 format_param = None
 581                 quality_index = 0
 582                 if self._downloader is not None:
 583                         params = self._downloader.params
 584                         format_param = params.get('format', None)
 585                         if format_param == '0':
 586                                 format_param = self._available_formats[quality_index]
 587                                 best_quality = True
 588
 589                 while True:
 590                         try:
 591                                 # Extension
 592                                 video_extension = self._video_extensions.get(format_param, 'flv')
 593
 594                                 # Normalize URL, including format
 595                                 normalized_url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id
 596                                 if format_param is not None:
 597                                         normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
 598                                 request = urllib2.Request(normalized_url, None, std_headers)
 599                                 try:
 600                                         self.report_webpage_download(video_id)
 601                                         video_webpage = urllib2.urlopen(request).read()
 602                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 603                                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
 604                                         return
 605                                 self.report_information_extraction(video_id)
 606
 607                                 # "t" param
 608                                 mobj = re.search(r', "t": "([^"]+)"', video_webpage)
 609                                 if mobj is None:
 610                                         self._downloader.trouble(u'ERROR: unable to extract "t" parameter')
 611                                         return
 612                                 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&el=detailpage&ps=' % (video_id, mobj.group(1))
 613                                 if format_param is not None:
 614                                         video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
 615                                 self.report_video_url(video_id, video_real_url)
 616
 617                                 # uploader
 618                                 mobj = re.search(r"var watchUsername = '([^']+)';", video_webpage)
 619                                 if mobj is None:
 620                                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 621                                         return
 622                                 video_uploader = mobj.group(1)
 623
 624                                 # title
 625                                 mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
 626                                 if mobj is None:
 627                                         self._downloader.trouble(u'ERROR: unable to extract video title')
 628                                         return
 629                                 video_title = mobj.group(1).decode('utf-8')
 630                                 video_title = re.sub(ur'(?u)&(.+?);', self.htmlentity_transform, video_title)
 631                                 video_title = video_title.replace(os.sep, u'%')
 632
 633                                 # simplified title
 634                                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
 635                                 simple_title = simple_title.strip(ur'_')
 636
 637                                 # Process video information
 638                                 self._downloader.process_info({
 639                                         'id':           video_id.decode('utf-8'),
 640                                         'url':          video_real_url.decode('utf-8'),
 641                                         'uploader':     video_uploader.decode('utf-8'),
 642                                         'title':        video_title,
 643                                         'stitle':       simple_title,
 644                                         'ext':          video_extension.decode('utf-8'),
 645                                 })
 646
 647                                 return
 648
 649                         except UnavailableFormatError, err:
 650                                 if best_quality:
 651                                         if quality_index == len(self._available_formats) - 1:
 652                                                 # I don't ever expect this to happen
 653                                                 self._downloader.trouble(u'ERROR: no known formats available for video')
 654                                                 return
 655                                         else:
 656                                                 self.report_unavailable_format(video_id, format_param)
 657                                                 quality_index += 1
 658                                                 format_param = self._available_formats[quality_index]
 659                                                 continue
 660                                 else:
 661                                         self._downloader.trouble('ERROR: format not available for video')
 662                                         return
 663
 664
 665 class MetacafeIE(InfoExtractor):
 666         """Information Extractor for metacafe.com."""
 667
 668         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 669         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 670         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 671         _youtube_ie = None
 672
 673         def __init__(self, youtube_ie, downloader=None):
 674                 InfoExtractor.__init__(self, downloader)
 675                 self._youtube_ie = youtube_ie
 676
 677         @staticmethod
 678         def suitable(url):
 679                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
 680
 681         def report_disclaimer(self):
 682                 """Report disclaimer retrieval."""
 683                 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
 684
 685         def report_age_confirmation(self):
 686                 """Report attempt to confirm age."""
 687                 self._downloader.to_stdout(u'[metacafe] Confirming age')
 688
 689         def report_download_webpage(self, video_id):
 690                 """Report webpage download."""
 691                 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
 692
 693         def report_extraction(self, video_id):
 694                 """Report information extraction."""
 695                 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
 696
 697         def _real_initialize(self):
 698                 # Retrieve disclaimer
 699                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
 700                 try:
 701                         self.report_disclaimer()
 702                         disclaimer = urllib2.urlopen(request).read()
 703                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 704                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
 705                         return
 706
 707                 # Confirm age
 708                 disclaimer_form = {
 709                         'filters': '0',
 710                         'submit': "Continue - I'm over 18",
 711                         }
 712                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
 713                 try:
 714                         self.report_age_confirmation()
 715                         disclaimer = urllib2.urlopen(request).read()
 716                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 717                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
 718                         return
 719
 720         def _real_extract(self, url):
 721                 # Extract id and simplified title from URL
 722                 mobj = re.match(self._VALID_URL, url)
 723                 if mobj is None:
 724                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 725                         return
 726
 727                 video_id = mobj.group(1)
 728
 729                 # Check if video comes from YouTube
 730                 mobj2 = re.match(r'^yt-(.*)$', video_id)
 731                 if mobj2 is not None:
 732                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
 733                         return
 734
 735                 simple_title = mobj.group(2).decode('utf-8')
 736                 video_extension = 'flv'
 737
 738                 # Retrieve video webpage to extract further information
 739                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
 740                 try:
 741                         self.report_download_webpage(video_id)
 742                         webpage = urllib2.urlopen(request).read()
 743                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 744                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
 745                         return
 746
 747                 # Extract URL, uploader and title from webpage
 748                 self.report_extraction(video_id)
 749                 mobj = re.search(r'(?m)&mediaURL=(http.*?\.flv)', webpage)
 750                 if mobj is None:
 751                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 752                         return
 753                 mediaURL = urllib.unquote(mobj.group(1))
 754
 755                 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 756                 if mobj is None:
 757                         self._downloader.trouble(u'ERROR: unable to extract gdaKey')
 758                         return
 759                 gdaKey = mobj.group(1)
 760
 761                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 762
 763                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 764                 if mobj is None:
 765                         self._downloader.trouble(u'ERROR: unable to extract title')
 766                         return
 767                 video_title = mobj.group(1).decode('utf-8')
 768
 769                 mobj = re.search(r'(?ms)<li id="ChnlUsr">.*?Submitter:.*?<a .*?>(.*?)<', webpage)
 770                 if mobj is None:
 771                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 772                         return
 773                 video_uploader = mobj.group(1)
 774
 775                 try:
 776                         # Process video information
 777                         self._downloader.process_info({
 778                                 'id':           video_id.decode('utf-8'),
 779                                 'url':          video_url.decode('utf-8'),
 780                                 'uploader':     video_uploader.decode('utf-8'),
 781                                 'title':        video_title,
 782                                 'stitle':       simple_title,
 783                                 'ext':          video_extension.decode('utf-8'),
 784                         })
 785                 except UnavailableFormatError:
 786                         self._downloader.trouble(u'ERROR: format not available for video')
 787
 788
 789 class YoutubeSearchIE(InfoExtractor):
 790         """Information Extractor for YouTube search queries."""
 791         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
 792         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
 793         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
 794         _MORE_PAGES_INDICATOR = r'>Next</a>'
 795         _youtube_ie = None
 796         _max_youtube_results = 1000
 797
 798         def __init__(self, youtube_ie, downloader=None):
 799                 InfoExtractor.__init__(self, downloader)
 800                 self._youtube_ie = youtube_ie
 801
 802         @staticmethod
 803         def suitable(url):
 804                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
 805
 806         def report_download_page(self, query, pagenum):
 807                 """Report attempt to download playlist page with given number."""
 808                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
 809
 810         def _real_initialize(self):
 811                 self._youtube_ie.initialize()
 812
 813         def _real_extract(self, query):
 814                 mobj = re.match(self._VALID_QUERY, query)
 815                 if mobj is None:
 816                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
 817                         return
 818
 819                 prefix, query = query.split(':')
 820                 prefix = prefix[8:]
 821                 if prefix == '':
 822                         self._download_n_results(query, 1)
 823                         return
 824                 elif prefix == 'all':
 825                         self._download_n_results(query, self._max_youtube_results)
 826                         return
 827                 else:
 828                         try:
 829                                 n = int(prefix)
 830                                 if n <= 0:
 831                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
 832                                         return
 833                                 elif n > self._max_youtube_results:
 834                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
 835                                         n = self._max_youtube_results
 836                                 self._download_n_results(query, n)
 837                                 return
 838                         except ValueError: # parsing prefix as int fails
 839                                 self._download_n_results(query, 1)
 840                                 return
 841
 842         def _download_n_results(self, query, n):
 843                 """Downloads a specified number of results for a query"""
 844
 845                 video_ids = []
 846                 already_seen = set()
 847                 pagenum = 1
 848
 849                 while True:
 850                         self.report_download_page(query, pagenum)
 851                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
 852                         request = urllib2.Request(result_url, None, std_headers)
 853                         try:
 854                                 page = urllib2.urlopen(request).read()
 855                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 856                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
 857                                 return
 858
 859                         # Extract video identifiers
 860                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
 861                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
 862                                 if video_id not in already_seen:
 863                                         video_ids.append(video_id)
 864                                         already_seen.add(video_id)
 865                                         if len(video_ids) == n:
 866                                                 # Specified n videos reached
 867                                                 for id in video_ids:
 868                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
 869                                                 return
 870
 871                         if self._MORE_PAGES_INDICATOR not in page:
 872                                 for id in video_ids:
 873                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
 874                                 return
 875
 876                         pagenum = pagenum + 1
 877
 878 class YoutubePlaylistIE(InfoExtractor):
 879         """Information Extractor for YouTube playlists."""
 880
 881         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/view_play_list\?p=(.+)'
 882         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
 883         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
 884         _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&amp;page=%s'
 885         _youtube_ie = None
 886
 887         def __init__(self, youtube_ie, downloader=None):
 888                 InfoExtractor.__init__(self, downloader)
 889                 self._youtube_ie = youtube_ie
 890
 891         @staticmethod
 892         def suitable(url):
 893                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
 894
 895         def report_download_page(self, playlist_id, pagenum):
 896                 """Report attempt to download playlist page with given number."""
 897                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
 898
 899         def _real_initialize(self):
 900                 self._youtube_ie.initialize()
 901
 902         def _real_extract(self, url):
 903                 # Extract playlist id
 904                 mobj = re.match(self._VALID_URL, url)
 905                 if mobj is None:
 906                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
 907                         return
 908
 909                 # Download playlist pages
 910                 playlist_id = mobj.group(1)
 911                 video_ids = []
 912                 pagenum = 1
 913
 914                 while True:
 915                         self.report_download_page(playlist_id, pagenum)
 916                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
 917                         try:
 918                                 page = urllib2.urlopen(request).read()
 919                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 920                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
 921                                 return
 922
 923                         # Extract video identifiers
 924                         ids_in_page = []
 925                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
 926                                 if mobj.group(1) not in ids_in_page:
 927                                         ids_in_page.append(mobj.group(1))
 928                         video_ids.extend(ids_in_page)
 929
 930                         if (self._MORE_PAGES_INDICATOR % (playlist_id, pagenum + 1)) not in page:
 931                                 break
 932                         pagenum = pagenum + 1
 933
 934                 for id in video_ids:
 935                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
 936                 return
 937
 938 class PostProcessor(object):
 939         """Post Processor class.
 940
 941         PostProcessor objects can be added to downloaders with their
 942         add_post_processor() method. When the downloader has finished a
 943         successful download, it will take its internal chain of PostProcessors
 944         and start calling the run() method on each one of them, first with
 945         an initial argument and then with the returned value of the previous
 946         PostProcessor.
 947
 948         The chain will be stopped if one of them ever returns None or the end
 949         of the chain is reached.
 950
 951         PostProcessor objects follow a "mutual registration" process similar
 952         to InfoExtractor objects.
 953         """
 954
 955         _downloader = None
 956
 957         def __init__(self, downloader=None):
 958                 self._downloader = downloader
 959
 960         def set_downloader(self, downloader):
 961                 """Sets the downloader for this PP."""
 962                 self._downloader = downloader
 963
 964         def run(self, information):
 965                 """Run the PostProcessor.
 966
 967                 The "information" argument is a dictionary like the ones
 968                 composed by InfoExtractors. The only difference is that this
 969                 one has an extra field called "filepath" that points to the
 970                 downloaded file.
 971
 972                 When this method returns None, the postprocessing chain is
 973                 stopped. However, this method may return an information
 974                 dictionary that will be passed to the next postprocessing
 975                 object in the chain. It can be the one it received after
 976                 changing some fields.
 977
 978                 In addition, this method may raise a PostProcessingError
 979                 exception that will be taken into account by the downloader
 980                 it was called from.
 981                 """
 982                 return information # by default, do nothing
 983
 984 ### MAIN PROGRAM ###
 985 if __name__ == '__main__':
 986         try:
 987                 # Modules needed only when running the main program
 988                 import getpass
 989                 import optparse
 990
 991                 # General configuration
 992                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
 993                 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
 994                 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
 995
 996                 # Parse command line
 997                 parser = optparse.OptionParser(
 998                         usage='Usage: %prog [options] url...',
 999                         version='INTERNAL',
1000                         conflict_handler='resolve',
1001                 )
1002
1003                 parser.add_option('-h', '--help',
1004                                 action='help', help='print this help text and exit')
1005                 parser.add_option('-v', '--version',
1006                                 action='version', help='print program version and exit')
1007                 parser.add_option('-i', '--ignore-errors',
1008                                 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
1009                 parser.add_option('-r', '--rate-limit',
1010                                 dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
1011
1012                 authentication = optparse.OptionGroup(parser, 'Authentication Options')
1013                 authentication.add_option('-u', '--username',
1014                                 dest='username', metavar='UN', help='account username')
1015                 authentication.add_option('-p', '--password',
1016                                 dest='password', metavar='PW', help='account password')
1017                 authentication.add_option('-n', '--netrc',
1018                                 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
1019                 parser.add_option_group(authentication)
1020
1021                 video_format = optparse.OptionGroup(parser, 'Video Format Options')
1022                 video_format.add_option('-f', '--format',
1023                                 action='append', dest='format', metavar='FMT', help='video format code')
1024                 video_format.add_option('-b', '--best-quality',
1025                                 action='append_const', dest='format', help='download the best quality video possible', const='0')
1026                 video_format.add_option('-m', '--mobile-version',
1027                                 action='append_const', dest='format', help='alias for -f 17', const='17')
1028                 video_format.add_option('-d', '--high-def',
1029                                 action='append_const', dest='format', help='alias for -f 22', const='22')
1030                 parser.add_option_group(video_format)
1031
1032                 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
1033                 verbosity.add_option('-q', '--quiet',
1034                                 action='store_true', dest='quiet', help='activates quiet mode', default=False)
1035                 verbosity.add_option('-s', '--simulate',
1036                                 action='store_true', dest='simulate', help='do not download video', default=False)
1037                 verbosity.add_option('-g', '--get-url',
1038                                 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
1039                 verbosity.add_option('-e', '--get-title',
1040                                 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
1041                 parser.add_option_group(verbosity)
1042
1043                 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
1044                 filesystem.add_option('-t', '--title',
1045                                 action='store_true', dest='usetitle', help='use title in file name', default=False)
1046                 filesystem.add_option('-l', '--literal',
1047                                 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
1048                 filesystem.add_option('-o', '--output',
1049                                 dest='outtmpl', metavar='TPL', help='output filename template')
1050                 filesystem.add_option('-a', '--batch-file',
1051                                 dest='batchfile', metavar='F', help='file containing URLs to download')
1052                 filesystem.add_option('-w', '--no-overwrites',
1053                                 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
1054                 parser.add_option_group(filesystem)
1055
1056                 (opts, args) = parser.parse_args()
1057
1058                 # Batch file verification
1059                 batchurls = []
1060                 if opts.batchfile is not None:
1061                         try:
1062                                 batchurls = open(opts.batchfile, 'r').readlines()
1063                                 batchurls = [x.strip() for x in batchurls]
1064                                 batchurls = [x for x in batchurls if len(x) > 0]
1065                         except IOError:
1066                                 sys.exit(u'ERROR: batch file could not be read')
1067                 all_urls = batchurls + args
1068
1069                 # Conflicting, missing and erroneous options
1070                 if len(all_urls) < 1:
1071                         parser.error(u'you must provide at least one URL')
1072                 if opts.usenetrc and (opts.username is not None or opts.password is not None):
1073                         parser.error(u'using .netrc conflicts with giving username/password')
1074                 if opts.password is not None and opts.username is None:
1075                         parser.error(u'account username missing')
1076                 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
1077                         parser.error(u'using output template conflicts with using title or literal title')
1078                 if opts.usetitle and opts.useliteral:
1079                         parser.error(u'using title conflicts with using literal title')
1080                 if opts.username is not None and opts.password is None:
1081                         opts.password = getpass.getpass(u'Type account password and press return:')
1082                 if opts.ratelimit is not None:
1083                         numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
1084                         if numeric_limit is None:
1085                                 parser.error(u'invalid rate limit specified')
1086                         opts.ratelimit = numeric_limit
1087                 if opts.format is not None and len(opts.format) > 1:
1088                         parser.error(u'pass at most one of the video format option flags (-f, -b, -m, -d)')
1089                 if opts.format is None:
1090                         real_format = None
1091                 else:
1092                         real_format = opts.format[0]
1093
1094
1095                 # Information extractors
1096                 youtube_ie = YoutubeIE()
1097                 metacafe_ie = MetacafeIE(youtube_ie)
1098                 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
1099                 youtube_search_ie = YoutubeSearchIE(youtube_ie)
1100
1101                 # File downloader
1102                 fd = FileDownloader({
1103                         'usenetrc': opts.usenetrc,
1104                         'username': opts.username,
1105                         'password': opts.password,
1106                         'quiet': (opts.quiet or opts.geturl or opts.gettitle),
1107                         'forceurl': opts.geturl,
1108                         'forcetitle': opts.gettitle,
1109                         'simulate': (opts.simulate or opts.geturl or opts.gettitle),
1110                         'format': real_format,
1111                         'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(locale.getpreferredencoding()))
1112                                 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
1113                                 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
1114                                 or u'%(id)s.%(ext)s'),
1115                         'ignoreerrors': opts.ignoreerrors,
1116                         'ratelimit': opts.ratelimit,
1117                         'nooverwrites': opts.nooverwrites,
1118                         })
1119                 fd.add_info_extractor(youtube_search_ie)
1120                 fd.add_info_extractor(youtube_pl_ie)
1121                 fd.add_info_extractor(metacafe_ie)
1122                 fd.add_info_extractor(youtube_ie)
1123                 retcode = fd.download(all_urls)
1124                 sys.exit(retcode)
1125
1126         except DownloadError:
1127                 sys.exit(1)
1128         except SameFileError:
1129                 sys.exit(u'ERROR: fixed output name but more than one file to download')
1130         except KeyboardInterrupt:
1131                 sys.exit(u'\nERROR: Interrupted by user')