youtube-dl

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 # Author: Ricardo Garcia Gonzalez
   4 # Author: Danny Colligan
   5 # License: Public domain code
   6 import htmlentitydefs
   7 import httplib
   8 import locale
   9 import math
  10 import netrc
  11 import os
  12 import os.path
  13 import re
  14 import socket
  15 import string
  16 import sys
  17 import time
  18 import urllib
  19 import urllib2
  20
# Default HTTP request headers; the urllib2.Request objects built in the
# visible code all pass this dictionary as their headers argument.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.5) Gecko/2008120122 Firefox/3.0.5',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
        'Accept-Language': 'en-us,en;q=0.5',
}

# Unicode string holding the ASCII letters and digits. YoutubeIE replaces every
# run of characters outside this set with '_' to build the simplified title.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
  29
  30 class DownloadError(Exception):
  31         """Download Error exception.
  32
  33         This exception may be thrown by FileDownloader objects if they are not
  34         configured to continue on errors. They will contain the appropriate
  35         error message.
  36         """
  37         pass
  38
  39 class SameFileError(Exception):
  40         """Same File exception.
  41
  42         This exception will be thrown by FileDownloader objects if they detect
  43         multiple files would have to be downloaded to the same file on disk.
  44         """
  45         pass
  46
  47 class PostProcessingError(Exception):
  48         """Post Processing exception.
  49
  50         This exception may be raised by PostProcessor's .run() method to
  51         indicate an error in the postprocessing task.
  52         """
  53         pass
  54
  55 class FileDownloader(object):
  56         """File Downloader class.
  57
        File downloader objects are the ones responsible for downloading the
        actual video file and writing it to disk if the user has requested
        it, among some other tasks. In most cases there should be one per
        program. Since, given a video URL, the downloader doesn't know how to
        extract all the needed information (a task InfoExtractors perform), it
        has to pass the URL to one of them.
  64
        For this, file downloader objects have a method that allows
        InfoExtractors to be registered in a given order. When it is passed
        a URL, the file downloader hands it to the first InfoExtractor it
        finds that reports being able to handle it. The InfoExtractor returns
        all the information to the FileDownloader and the latter downloads the
        file or does whatever it's instructed to do.
  71
  72         File downloaders accept a lot of parameters. In order not to saturate
  73         the object constructor with arguments, it receives a dictionary of
  74         options instead. These options are available through the get_params()
  75         method for the InfoExtractors to use. The FileDownloader also registers
  76         itself as the downloader in charge for the InfoExtractors that are
  77         added to it, so this is a "mutual registration".
  78
  79         Available options:
  80
  81         username:       Username for authentication purposes.
  82         password:       Password for authentication purposes.
  83         usenetrc:       Use netrc for authentication instead.
  84         quiet:          Do not print messages to stdout.
  85         forceurl:       Force printing final URL.
  86         forcetitle:     Force printing title.
  87         simulate:       Do not download the video files.
  88         format:         Video format code.
  89         outtmpl:        Template for output names.
  90         ignoreerrors:   Do not stop on download errors.
  91         ratelimit:      Download speed limit, in bytes/sec.
  92         nooverwrites:   Prevent overwriting files.
  93         """
  94
  95         _params = None
  96         _ies = []
  97         _pps = []
  98
  99         def __init__(self, params):
 100                 """Create a FileDownloader object with the given options."""
 101                 self._ies = []
 102                 self._pps = []
 103                 self.set_params(params)
 104
 105         @staticmethod
 106         def pmkdir(filename):
 107                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
 108                 components = filename.split(os.sep)
 109                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
 110                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
 111                 for dir in aggregate:
 112                         if not os.path.exists(dir):
 113                                 os.mkdir(dir)
 114
 115         @staticmethod
 116         def format_bytes(bytes):
 117                 if bytes is None:
 118                         return 'N/A'
 119                 if bytes == 0:
 120                         exponent = 0
 121                 else:
 122                         exponent = long(math.log(float(bytes), 1024.0))
 123                 suffix = 'bkMGTPEZY'[exponent]
 124                 converted = float(bytes) / float(1024**exponent)
 125                 return '%.2f%s' % (converted, suffix)
 126
 127         @staticmethod
 128         def calc_percent(byte_counter, data_len):
 129                 if data_len is None:
 130                         return '---.-%'
 131                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
 132
 133         @staticmethod
 134         def calc_eta(start, now, total, current):
 135                 if total is None:
 136                         return '--:--'
 137                 dif = now - start
 138                 if current == 0 or dif < 0.001: # One millisecond
 139                         return '--:--'
 140                 rate = float(current) / dif
 141                 eta = long((float(total) - float(current)) / rate)
 142                 (eta_mins, eta_secs) = divmod(eta, 60)
 143                 if eta_mins > 99:
 144                         return '--:--'
 145                 return '%02d:%02d' % (eta_mins, eta_secs)
 146
 147         @staticmethod
 148         def calc_speed(start, now, bytes):
 149                 dif = now - start
 150                 if bytes == 0 or dif < 0.001: # One millisecond
 151                         return '%10s' % '---b/s'
 152                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
 153
 154         @staticmethod
 155         def best_block_size(elapsed_time, bytes):
 156                 new_min = max(bytes / 2.0, 1.0)
 157                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
 158                 if elapsed_time < 0.001:
 159                         return int(new_max)
 160                 rate = bytes / elapsed_time
 161                 if rate > new_max:
 162                         return int(new_max)
 163                 if rate < new_min:
 164                         return int(new_min)
 165                 return int(rate)
 166
 167         @staticmethod
 168         def parse_bytes(bytestr):
 169                 """Parse a string indicating a byte quantity into a long integer."""
 170                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
 171                 if matchobj is None:
 172                         return None
 173                 number = float(matchobj.group(1))
 174                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
 175                 return long(round(number * multiplier))
 176
 177         def set_params(self, params):
 178                 """Sets parameters."""
 179                 if type(params) != dict:
 180                         raise ValueError('params: dictionary expected')
 181                 self._params = params
 182
 183         def get_params(self):
 184                 """Get parameters."""
 185                 return self._params
 186
 187         def add_info_extractor(self, ie):
 188                 """Add an InfoExtractor object to the end of the list."""
 189                 self._ies.append(ie)
 190                 ie.set_downloader(self)
 191
 192         def add_post_processor(self, pp):
 193                 """Add a PostProcessor object to the end of the chain."""
 194                 self._pps.append(pp)
 195                 pp.set_downloader(self)
 196
 197         def to_stdout(self, message, skip_eol=False):
 198                 """Print message to stdout if not in quiet mode."""
 199                 if not self._params.get('quiet', False):
 200                         print u'%s%s' % (message, [u'\n', u''][skip_eol]),
 201                         sys.stdout.flush()
 202
 203         def to_stderr(self, message):
 204                 """Print message to stderr."""
 205                 print >>sys.stderr, message
 206
 207         def fixed_template(self):
 208                 """Checks if the output template is fixed."""
 209                 return (re.search(ur'(?u)%\(.+?\)s', self._params['outtmpl']) is None)
 210
 211         def trouble(self, message=None):
 212                 """Determine action to take when a download problem appears.
 213
 214                 Depending on if the downloader has been configured to ignore
 215                 download errors or not, this method may throw an exception or
 216                 not when errors are found, after printing the message. If it
 217                 doesn't raise, it returns an error code suitable to be returned
 218                 later as a program exit code to indicate error.
 219                 """
 220                 if message is not None:
 221                         self.to_stderr(message)
 222                 if not self._params.get('ignoreerrors', False):
 223                         raise DownloadError(message)
 224                 return 1
 225
 226         def slow_down(self, start_time, byte_counter):
 227                 """Sleep if the download speed is over the rate limit."""
 228                 rate_limit = self._params.get('ratelimit', None)
 229                 if rate_limit is None or byte_counter == 0:
 230                         return
 231                 now = time.time()
 232                 elapsed = now - start_time
 233                 if elapsed <= 0.0:
 234                         return
 235                 speed = float(byte_counter) / elapsed
 236                 if speed > rate_limit:
 237                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
 238
 239         def report_destination(self, filename):
 240                 """Report destination filename."""
 241                 self.to_stdout(u'[download] Destination: %s' % filename)
 242
 243         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
 244                 """Report download progress."""
 245                 self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
 246                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
 247
 248         def report_finish(self):
 249                 """Report download finished."""
 250                 self.to_stdout(u'')
 251
 252         def download(self, url_list):
 253                 """Download a given list of URLs."""
 254                 retcode = 0
 255                 if len(url_list) > 1 and self.fixed_template():
 256                         raise SameFileError(self._params['outtmpl'])
 257
 258                 for url in url_list:
 259                         suitable_found = False
 260                         for ie in self._ies:
 261                                 if not ie.suitable(url):
 262                                         continue
 263                                 # Suitable InfoExtractor found
 264                                 suitable_found = True
 265                                 all_results = ie.extract(url)
 266                                 results = [x for x in all_results if x is not None]
 267                                 if len(results) != len(all_results):
 268                                         retcode = self.trouble()
 269
 270                                 if len(results) > 1 and self.fixed_template():
 271                                         raise SameFileError(self._params['outtmpl'])
 272
 273                                 for result in results:
 274                                         # Forced printings
 275                                         if self._params.get('forcetitle', False):
 276                                                 print result['title']
 277                                         if self._params.get('forceurl', False):
 278                                                 print result['url']
 279
 280                                         # Do nothing else if in simulate mode
 281                                         if self._params.get('simulate', False):
 282                                                 continue
 283
 284                                         try:
 285                                                 filename = self._params['outtmpl'] % result
 286                                                 self.report_destination(filename)
 287                                         except (ValueError, KeyError), err:
 288                                                 retcode = self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
 289                                                 continue
 290                                         if self._params['nooverwrites'] and os.path.exists(filename):
 291                                                 self.to_stderr('WARNING: file exists: %s; skipping' % filename)
 292                                                 continue
 293                                         try:
 294                                                 self.pmkdir(filename)
 295                                         except (OSError, IOError), err:
 296                                                 retcode = self.trouble('ERROR: unable to create directories: %s' % str(err))
 297                                                 continue
 298                                         try:
 299                                                 outstream = open(filename, 'wb')
 300                                         except (OSError, IOError), err:
 301                                                 retcode = self.trouble('ERROR: unable to open for writing: %s' % str(err))
 302                                                 continue
 303                                         try:
 304                                                 self._do_download(outstream, result['url'])
 305                                                 outstream.close()
 306                                         except (OSError, IOError), err:
 307                                                 retcode = self.trouble('ERROR: unable to write video data: %s' % str(err))
 308                                                 continue
 309                                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 310                                                 retcode = self.trouble('ERROR: unable to download video data: %s' % str(err))
 311                                                 continue
 312                                         try:
 313                                                 self.post_process(filename, result)
 314                                         except (PostProcessingError), err:
 315                                                 retcode = self.trouble('ERROR: postprocessing: %s' % str(err))
 316                                                 continue
 317
 318                                 break
 319                         if not suitable_found:
 320                                 retcode = self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
 321
 322                 return retcode
 323
 324         def post_process(self, filename, ie_info):
 325                 """Run the postprocessing chain on the given file."""
 326                 info = dict(ie_info)
 327                 info['filepath'] = filename
 328                 for pp in self._pps:
 329                         info = pp.run(info)
 330                         if info is None:
 331                                 break
 332
 333         def _do_download(self, stream, url):
 334                 request = urllib2.Request(url, None, std_headers)
 335                 data = urllib2.urlopen(request)
 336                 data_len = data.info().get('Content-length', None)
 337                 data_len_str = self.format_bytes(data_len)
 338                 byte_counter = 0
 339                 block_size = 1024
 340                 start = time.time()
 341                 while True:
 342                         # Progress message
 343                         percent_str = self.calc_percent(byte_counter, data_len)
 344                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
 345                         speed_str = self.calc_speed(start, time.time(), byte_counter)
 346                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
 347
 348                         # Download and write
 349                         before = time.time()
 350                         data_block = data.read(block_size)
 351                         after = time.time()
 352                         data_block_len = len(data_block)
 353                         if data_block_len == 0:
 354                                 break
 355                         byte_counter += data_block_len
 356                         stream.write(data_block)
 357                         block_size = self.best_block_size(after - before, data_block_len)
 358
 359                         # Apply rate limit
 360                         self.slow_down(start, byte_counter)
 361
 362                 self.report_finish()
 363                 if data_len is not None and str(byte_counter) != data_len:
 364                         raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
 365
 366 class InfoExtractor(object):
 367         """Information Extractor class.
 368
 369         Information extractors are the classes that, given a URL, extract
 370         information from the video (or videos) the URL refers to. This
 371         information includes the real video URL, the video title and simplified
 372         title, author and others. It is returned in a list of dictionaries when
 373         calling its extract() method. It is a list because a URL can refer to
 374         more than one video (think of playlists). The dictionaries must include
 375         the following fields:
 376
 377         id:             Video identifier.
 378         url:            Final video URL.
 379         uploader:       Nickname of the video uploader.
 380         title:          Literal title.
 381         stitle:         Simplified title.
 382         ext:            Video filename extension.
 383
 384         Subclasses of this one should re-define the _real_initialize() and
 385         _real_extract() methods, as well as the suitable() static method.
 386         Probably, they should also be instantiated and added to the main
 387         downloader.
 388         """
 389
 390         _ready = False
 391         _downloader = None
 392
 393         def __init__(self, downloader=None):
 394                 """Constructor. Receives an optional downloader."""
 395                 self._ready = False
 396                 self.set_downloader(downloader)
 397
 398         @staticmethod
 399         def suitable(url):
 400                 """Receives a URL and returns True if suitable for this IE."""
 401                 return False
 402
 403         def initialize(self):
 404                 """Initializes an instance (authentication, etc)."""
 405                 if not self._ready:
 406                         self._real_initialize()
 407                         self._ready = True
 408
 409         def extract(self, url):
 410                 """Extracts URL information and returns it in list of dicts."""
 411                 self.initialize()
 412                 return self._real_extract(url)
 413
 414         def set_downloader(self, downloader):
 415                 """Sets the downloader for this IE."""
 416                 self._downloader = downloader
 417
 418         def to_stdout(self, message):
 419                 """Print message to stdout if downloader is not in quiet mode."""
 420                 if self._downloader is None or not self._downloader.get_params().get('quiet', False):
 421                         print message
 422
 423         def to_stderr(self, message):
 424                 """Print message to stderr."""
 425                 print >>sys.stderr, message
 426
 427         def _real_initialize(self):
 428                 """Real initialization process. Redefine in subclasses."""
 429                 pass
 430
 431         def _real_extract(self, url):
 432                 """Real extraction process. Redefine in subclasses."""
 433                 pass
 434
 435 class YoutubeIE(InfoExtractor):
 436         """Information extractor for youtube.com."""
 437
 438         _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
 439         _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 440         _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
 441         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 442         _NETRC_MACHINE = 'youtube'
 443
 444         @staticmethod
 445         def suitable(url):
 446                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
 447
 448         def report_lang(self):
 449                 """Report attempt to set language."""
 450                 self.to_stdout(u'[youtube] Setting language')
 451
 452         def report_login(self):
 453                 """Report attempt to log in."""
 454                 self.to_stdout(u'[youtube] Logging in')
 455
 456         def report_age_confirmation(self):
 457                 """Report attempt to confirm age."""
 458                 self.to_stdout(u'[youtube] Confirming age')
 459
 460         def report_webpage_download(self, video_id):
 461                 """Report attempt to download webpage."""
 462                 self.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)
 463
 464         def report_information_extraction(self, video_id):
 465                 """Report attempt to extract video information."""
 466                 self.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
 467
 468         def report_video_url(self, video_id, video_real_url):
 469                 """Report extracted video URL."""
 470                 self.to_stdout(u'[youtube] %s: URL: %s' % (video_id, video_real_url))
 471
 472         def _real_initialize(self):
 473                 if self._downloader is None:
 474                         return
 475
 476                 username = None
 477                 password = None
 478                 downloader_params = self._downloader.get_params()
 479
 480                 # Attempt to use provided username and password or .netrc data
 481                 if downloader_params.get('username', None) is not None:
 482                         username = downloader_params['username']
 483                         password = downloader_params['password']
 484                 elif downloader_params.get('usenetrc', False):
 485                         try:
 486                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 487                                 if info is not None:
 488                                         username = info[0]
 489                                         password = info[2]
 490                                 else:
 491                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 492                         except (IOError, netrc.NetrcParseError), err:
 493                                 self.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
 494                                 return
 495
 496                 # No authentication to be performed
 497                 if username is None:
 498                         return
 499
 500                 # Set language
 501                 request = urllib2.Request(self._LOGIN_URL, None, std_headers)
 502                 try:
 503                         self.report_lang()
 504                         urllib2.urlopen(request).read()
 505                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 506                         self.to_stderr(u'WARNING: unable to set language: %s' % str(err))
 507                         return
 508
 509                 # Log in
 510                 login_form = {
 511                                 'current_form': 'loginForm',
 512                                 'next':         '/',
 513                                 'action_login': 'Log In',
 514                                 'username':     username,
 515                                 'password':     password,
 516                                 }
 517                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
 518                 try:
 519                         self.report_login()
 520                         login_results = urllib2.urlopen(request).read()
 521                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 522                                 self.to_stderr(u'WARNING: unable to log in: bad username or password')
 523                                 return
 524                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 525                         self.to_stderr(u'WARNING: unable to log in: %s' % str(err))
 526                         return
 527
 528                 # Confirm age
 529                 age_form = {
 530                                 'next_url':             '/',
 531                                 'action_confirm':       'Confirm',
 532                                 }
 533                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
 534                 try:
 535                         self.report_age_confirmation()
 536                         age_results = urllib2.urlopen(request).read()
 537                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 538                         self.to_stderr(u'ERROR: unable to confirm age: %s' % str(err))
 539                         return
 540
 541         def _real_extract(self, url):
 542                 # Extract video id from URL
 543                 mobj = re.match(self._VALID_URL, url)
 544                 if mobj is None:
 545                         self.to_stderr(u'ERROR: invalid URL: %s' % url)
 546                         return [None]
 547                 video_id = mobj.group(2)
 548
 549                 # Downloader parameters
 550                 format_param = None
 551                 if self._downloader is not None:
 552                         params = self._downloader.get_params()
 553                         format_param = params.get('format', None)
 554
 555                 # Extension
 556                 video_extension = {'18': 'mp4', '17': '3gp'}.get(format_param, 'flv')
 557
 558                 # Normalize URL, including format
 559                 normalized_url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id
 560                 if format_param is not None:
 561                         normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
 562                 request = urllib2.Request(normalized_url, None, std_headers)
 563                 try:
 564                         self.report_webpage_download(video_id)
 565                         video_webpage = urllib2.urlopen(request).read()
 566                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 567                         self.to_stderr(u'ERROR: unable to download video webpage: %s' % str(err))
 568                         return [None]
 569                 self.report_information_extraction(video_id)
 570
 571                 # "t" param
 572                 mobj = re.search(r', "t": "([^"]+)"', video_webpage)
 573                 if mobj is None:
 574                         self.to_stderr(u'ERROR: unable to extract "t" parameter')
 575                         return [None]
 576                 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
 577                 if format_param is not None:
 578                         video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
 579                 self.report_video_url(video_id, video_real_url)
 580
 581                 # uploader
 582                 mobj = re.search(r"var watchUsername = '([^']+)';", video_webpage)
 583                 if mobj is None:
 584                         self.to_stderr(u'ERROR: unable to extract uploader nickname')
 585                         return [None]
 586                 video_uploader = mobj.group(1)
 587
 588                 # title
 589                 mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
 590                 if mobj is None:
 591                         self.to_stderr(u'ERROR: unable to extract video title')
 592                         return [None]
 593                 video_title = mobj.group(1).decode('utf-8')
 594                 video_title = re.sub(ur'(?u)&(.+?);', lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), video_title)
 595                 video_title = video_title.replace(os.sep, u'%')
 596
 597                 # simplified title
 598                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
 599                 simple_title = simple_title.strip(ur'_')
 600
 601                 # Return information
 602                 return [{
 603                         'id':           video_id.decode('utf-8'),
 604                         'url':          video_real_url.decode('utf-8'),
 605                         'uploader':     video_uploader.decode('utf-8'),
 606                         'title':        video_title,
 607                         'stitle':       simple_title,
 608                         'ext':          video_extension.decode('utf-8'),
 609                         }]
 610
 611 class MetacafeIE(InfoExtractor):
 612         """Information Extractor for metacafe.com."""
 613
 614         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 615         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 616         _youtube_ie = None
 617
 618         def __init__(self, youtube_ie, downloader=None):
 619                 InfoExtractor.__init__(self, downloader)
 620                 self._youtube_ie = youtube_ie
 621
 622         @staticmethod
 623         def suitable(url):
 624                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
 625
 626         def report_disclaimer(self):
 627                 """Report disclaimer retrieval."""
 628                 self.to_stdout(u'[metacafe] Retrieving disclaimer')
 629
 630         def report_age_confirmation(self):
 631                 """Report attempt to confirm age."""
 632                 self.to_stdout(u'[metacafe] Confirming age')
 633
 634         def report_download_webpage(self, video_id):
 635                 """Report webpage download."""
 636                 self.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
 637
 638         def report_extraction(self, video_id):
 639                 """Report information extraction."""
 640                 self.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
 641
 642         def _real_initialize(self):
 643                 # Retrieve disclaimer
 644                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
 645                 try:
 646                         self.report_disclaimer()
 647                         disclaimer = urllib2.urlopen(request).read()
 648                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 649                         self.to_stderr(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
 650                         return
 651
 652                 # Confirm age
 653                 disclaimer_form = {
 654                         'filters': '0',
 655                         'submit': "Continue - I'm over 18",
 656                         }
 657                 request = urllib2.Request('http://www.metacafe.com/', urllib.urlencode(disclaimer_form), std_headers)
 658                 try:
 659                         self.report_age_confirmation()
 660                         disclaimer = urllib2.urlopen(request).read()
 661                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 662                         self.to_stderr(u'ERROR: unable to confirm age: %s' % str(err))
 663                         return
 664
 665         def _real_extract(self, url):
 666                 # Extract id and simplified title from URL
 667                 mobj = re.match(self._VALID_URL, url)
 668                 if mobj is None:
 669                         self.to_stderr(u'ERROR: invalid URL: %s' % url)
 670                         return [None]
 671
 672                 video_id = mobj.group(1)
 673
 674                 # Check if video comes from YouTube
 675                 mobj2 = re.match(r'^yt-(.*)$', video_id)
 676                 if mobj2 is not None:
 677                         return self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
 678
 679                 simple_title = mobj.group(2).decode('utf-8')
 680                 video_extension = 'flv'
 681
 682                 # Retrieve video webpage to extract further information
 683                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
 684                 try:
 685                         self.report_download_webpage(video_id)
 686                         webpage = urllib2.urlopen(request).read()
 687                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 688                         self.to_stderr(u'ERROR: unable retrieve video webpage: %s' % str(err))
 689                         return [None]
 690
 691                 # Extract URL, uploader and title from webpage
 692                 self.report_extraction(video_id)
 693                 mobj = re.search(r'(?m)"mediaURL":"(http.*?\.flv)"', webpage)
 694                 if mobj is None:
 695                         self.to_stderr(u'ERROR: unable to extract media URL')
 696                         return [None]
 697                 mediaURL = mobj.group(1).replace('\\', '')
 698
 699                 mobj = re.search(r'(?m)"gdaKey":"(.*?)"', webpage)
 700                 if mobj is None:
 701                         self.to_stderr(u'ERROR: unable to extract gdaKey')
 702                         return [None]
 703                 gdaKey = mobj.group(1)
 704
 705                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 706
 707                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 708                 if mobj is None:
 709                         self.to_stderr(u'ERROR: unable to extract title')
 710                         return [None]
 711                 video_title = mobj.group(1).decode('utf-8')
 712
 713                 mobj = re.search(r'(?m)<li id="ChnlUsr">.*?Submitter:<br />(.*?)</li>', webpage)
 714                 if mobj is None:
 715                         self.to_stderr(u'ERROR: unable to extract uploader nickname')
 716                         return [None]
 717                 video_uploader = re.sub(r'<.*?>', '', mobj.group(1))
 718
 719                 # Return information
 720                 return [{
 721                         'id':           video_id.decode('utf-8'),
 722                         'url':          video_url.decode('utf-8'),
 723                         'uploader':     video_uploader.decode('utf-8'),
 724                         'title':        video_title,
 725                         'stitle':       simple_title,
 726                         'ext':          video_extension.decode('utf-8'),
 727                         }]
 728
 729
 730 class YoutubeSearchIE(InfoExtractor):
 731         """Information Extractor for YouTube search queries."""
 732         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
 733         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
 734         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
 735         _MORE_PAGES_INDICATOR = r'>Next</a>'
 736         _youtube_ie = None
 737
 738         def __init__(self, youtube_ie, downloader=None):
 739                 InfoExtractor.__init__(self, downloader)
 740                 self._youtube_ie = youtube_ie
 741
 742         @staticmethod
 743         def suitable(url):
 744                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
 745
 746         def report_download_page(self, query, pagenum):
 747                 """Report attempt to download playlist page with given number."""
 748                 self.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
 749
 750         def _real_initialize(self):
 751                 self._youtube_ie.initialize()
 752
 753         def _real_extract(self, query):
 754                 mobj = re.match(self._VALID_QUERY, query)
 755                 if mobj is None:
 756                         self.to_stderr(u'ERROR: invalid search query "%s"' % query)
 757                         return [None]
 758
 759                 prefix, query = query.split(':')
 760                 prefix = prefix[8:]
 761                 if prefix == '':
 762                         return self._download_n_results(query, 1)
 763                 elif prefix == 'all':
 764                         return self._download_n_results(query, -1)
 765                 else:
 766                         try:
 767                                 n = int(prefix)
 768                                 if n <= 0:
 769                                         self.to_stderr(u'ERROR: invalid download number %s for query "%s"' % (n, query))
 770                                         return [None]
 771                                 return self._download_n_results(query, n)
 772                         except ValueError: # parsing prefix as int fails
 773                                 return self._download_n_results(query, 1)
 774
 775         def _download_n_results(self, query, n):
 776                 """Downloads a specified number of results for a query"""
 777
 778                 video_ids = []
 779                 already_seen = set()
 780                 pagenum = 1
 781
 782                 while True:
 783                         self.report_download_page(query, pagenum)
 784                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
 785                         request = urllib2.Request(result_url, None, std_headers)
 786                         try:
 787                                 page = urllib2.urlopen(request).read()
 788                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 789                                 self.to_stderr(u'ERROR: unable to download webpage: %s' % str(err))
 790                                 return [None]
 791
 792                         # Extract video identifiers
 793                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
 794                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
 795                                 if video_id not in already_seen:
 796                                         video_ids.append(video_id)
 797                                         already_seen.add(video_id)
 798                                         if len(video_ids) == n:
 799                                                 # Specified n videos reached
 800                                                 information = []
 801                                                 for id in video_ids:
 802                                                         information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
 803                                                 return information
 804
 805                         if self._MORE_PAGES_INDICATOR not in page:
 806                                 information = []
 807                                 for id in video_ids:
 808                                         information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
 809                                 return information
 810
 811                         pagenum = pagenum + 1
 812
 813 class YoutubePlaylistIE(InfoExtractor):
 814         """Information Extractor for YouTube playlists."""
 815
 816         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/view_play_list\?p=(.+)'
 817         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
 818         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
 819         _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&amp;page=%s'
 820         _youtube_ie = None
 821
 822         def __init__(self, youtube_ie, downloader=None):
 823                 InfoExtractor.__init__(self, downloader)
 824                 self._youtube_ie = youtube_ie
 825
 826         @staticmethod
 827         def suitable(url):
 828                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
 829
 830         def report_download_page(self, playlist_id, pagenum):
 831                 """Report attempt to download playlist page with given number."""
 832                 self.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
 833
 834         def _real_initialize(self):
 835                 self._youtube_ie.initialize()
 836
 837         def _real_extract(self, url):
 838                 # Extract playlist id
 839                 mobj = re.match(self._VALID_URL, url)
 840                 if mobj is None:
 841                         self.to_stderr(u'ERROR: invalid url: %s' % url)
 842                         return [None]
 843
 844                 # Download playlist pages
 845                 playlist_id = mobj.group(1)
 846                 video_ids = []
 847                 pagenum = 1
 848
 849                 while True:
 850                         self.report_download_page(playlist_id, pagenum)
 851                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
 852                         try:
 853                                 page = urllib2.urlopen(request).read()
 854                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 855                                 self.to_stderr(u'ERROR: unable to download webpage: %s' % str(err))
 856                                 return [None]
 857
 858                         # Extract video identifiers
 859                         ids_in_page = []
 860                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
 861                                 if mobj.group(1) not in ids_in_page:
 862                                         ids_in_page.append(mobj.group(1))
 863                         video_ids.extend(ids_in_page)
 864
 865                         if (self._MORE_PAGES_INDICATOR % (playlist_id, pagenum + 1)) not in page:
 866                                 break
 867                         pagenum = pagenum + 1
 868
 869                 information = []
 870                 for id in video_ids:
 871                         information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
 872                 return information
 873
 874 class PostProcessor(object):
 875         """Post Processor class.
 876
 877         PostProcessor objects can be added to downloaders with their
 878         add_post_processor() method. When the downloader has finished a
 879         successful download, it will take its internal chain of PostProcessors
 880         and start calling the run() method on each one of them, first with
 881         an initial argument and then with the returned value of the previous
 882         PostProcessor.
 883
 884         The chain will be stopped if one of them ever returns None or the end
 885         of the chain is reached.
 886
 887         PostProcessor objects follow a "mutual registration" process similar
 888         to InfoExtractor objects.
 889         """
 890
 891         _downloader = None
 892
 893         def __init__(self, downloader=None):
 894                 self._downloader = downloader
 895
 896         def to_stdout(self, message):
 897                 """Print message to stdout if downloader is not in quiet mode."""
 898                 if self._downloader is None or not self._downloader.get_params().get('quiet', False):
 899                         print message
 900
 901         def to_stderr(self, message):
 902                 """Print message to stderr."""
 903                 print >>sys.stderr, message
 904
 905         def set_downloader(self, downloader):
 906                 """Sets the downloader for this PP."""
 907                 self._downloader = downloader
 908
 909         def run(self, information):
 910                 """Run the PostProcessor.
 911
 912                 The "information" argument is a dictionary like the ones
 913                 returned by InfoExtractors. The only difference is that this
 914                 one has an extra field called "filepath" that points to the
 915                 downloaded file.
 916
 917                 When this method returns None, the postprocessing chain is
 918                 stopped. However, this method may return an information
 919                 dictionary that will be passed to the next postprocessing
 920                 object in the chain. It can be the one it received after
 921                 changing some fields.
 922
 923                 In addition, this method may raise a PostProcessingError
 924                 exception that will be taken into account by the downloader
 925                 it was called from.
 926                 """
 927                 return information # by default, do nothing
 928
 929 ### MAIN PROGRAM ###
 930 if __name__ == '__main__':
 931         try:
 932                 # Modules needed only when running the main program
 933                 import getpass
 934                 import optparse
 935
 936                 # General configuration
 937                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
 938                 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
 939                 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
 940
 941                 # Parse command line
 942                 parser = optparse.OptionParser(
 943                                 usage='Usage: %prog [options] url...',
 944                                 version='2009.02.07',
 945                                 conflict_handler='resolve',
 946                                 )
 947                 parser.add_option('-h', '--help',
 948                                 action='help', help='print this help text and exit')
 949                 parser.add_option('-v', '--version',
 950                                 action='version', help='print program version and exit')
 951                 parser.add_option('-u', '--username',
 952                                 dest='username', metavar='UN', help='account username')
 953                 parser.add_option('-p', '--password',
 954                                 dest='password', metavar='PW', help='account password')
 955                 parser.add_option('-o', '--output',
 956                                 dest='outtmpl', metavar='TPL', help='output filename template')
 957                 parser.add_option('-q', '--quiet',
 958                                 action='store_true', dest='quiet', help='activates quiet mode', default=False)
 959                 parser.add_option('-s', '--simulate',
 960                                 action='store_true', dest='simulate', help='do not download video', default=False)
 961                 parser.add_option('-t', '--title',
 962                                 action='store_true', dest='usetitle', help='use title in file name', default=False)
 963                 parser.add_option('-l', '--literal',
 964                                 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
 965                 parser.add_option('-n', '--netrc',
 966                                 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
 967                 parser.add_option('-g', '--get-url',
 968                                 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
 969                 parser.add_option('-e', '--get-title',
 970                                 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
 971                 parser.add_option('-f', '--format',
 972                                 dest='format', metavar='FMT', help='video format code')
 973                 parser.add_option('-b', '--best-quality',
 974                                 action='store_const', dest='format', help='alias for -f 18', const='18')
 975                 parser.add_option('-m', '--mobile-version',
 976                                 action='store_const', dest='format', help='alias for -f 17', const='17')
 977                 parser.add_option('-i', '--ignore-errors',
 978                                 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
 979                 parser.add_option('-r', '--rate-limit',
 980                                 dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
 981                 parser.add_option('-a', '--batch-file',
 982                                 dest='batchfile', metavar='F', help='file containing URLs to download')
 983                 parser.add_option('-w', '--no-overwrites',
 984                                 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
 985                 (opts, args) = parser.parse_args()
 986
 987                 # Batch file verification
 988                 batchurls = []
 989                 if opts.batchfile is not None:
 990                         try:
 991                                 batchurls = [line.strip() for line in open(opts.batchfile, 'r')]
 992                         except IOError:
 993                                 sys.exit(u'ERROR: batch file could not be read')
 994                 all_urls = batchurls + args
 995
 996                 # Conflicting, missing and erroneous options
 997                 if len(all_urls) < 1:
 998                         sys.exit(u'ERROR: you must provide at least one URL')
 999                 if opts.usenetrc and (opts.username is not None or opts.password is not None):
1000                         sys.exit(u'ERROR: using .netrc conflicts with giving username/password')
1001                 if opts.password is not None and opts.username is None:
1002                         sys.exit(u'ERROR: account username missing')
1003                 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
1004                         sys.exit(u'ERROR: using output template conflicts with using title or literal title')
1005                 if opts.usetitle and opts.useliteral:
1006                         sys.exit(u'ERROR: using title conflicts with using literal title')
1007                 if opts.username is not None and opts.password is None:
1008                         opts.password = getpass.getpass(u'Type account password and press return:')
1009                 if opts.ratelimit is not None:
1010                         numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
1011                         if numeric_limit is None:
1012                                 sys.exit(u'ERROR: invalid rate limit specified')
1013                         opts.ratelimit = numeric_limit
1014
1015                 # Information extractors
1016                 youtube_ie = YoutubeIE()
1017                 metacafe_ie = MetacafeIE(youtube_ie)
1018                 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
1019                 youtube_search_ie = YoutubeSearchIE(youtube_ie)
1020
1021                 # File downloader
1022                 charset = locale.getdefaultlocale()[1]
1023                 if charset is None:
1024                         charset = 'ascii'
1025                 fd = FileDownloader({
1026                         'usenetrc': opts.usenetrc,
1027                         'username': opts.username,
1028                         'password': opts.password,
1029                         'quiet': (opts.quiet or opts.geturl or opts.gettitle),
1030                         'forceurl': opts.geturl,
1031                         'forcetitle': opts.gettitle,
1032                         'simulate': (opts.simulate or opts.geturl or opts.gettitle),
1033                         'format': opts.format,
1034                         'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(charset))
1035                                 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
1036                                 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
1037                                 or u'%(id)s.%(ext)s'),
1038                         'ignoreerrors': opts.ignoreerrors,
1039                         'ratelimit': opts.ratelimit,
1040                         'nooverwrites': opts.nooverwrites,
1041                         })
1042                 fd.add_info_extractor(youtube_search_ie)
1043                 fd.add_info_extractor(youtube_pl_ie)
1044                 fd.add_info_extractor(metacafe_ie)
1045                 fd.add_info_extractor(youtube_ie)
1046                 retcode = fd.download(all_urls)
1047                 sys.exit(retcode)
1048
1049         except DownloadError:
1050                 sys.exit(1)
1051         except SameFileError:
1052                 sys.exit(u'ERROR: fixed output name but more than one file to download')
1053         except KeyboardInterrupt:
1054                 sys.exit(u'\nERROR: Interrupted by user')