Bump version number
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # License: Public domain code
6 import htmlentitydefs
7 import httplib
8 import locale
9 import math
10 import netrc
11 import os
12 import os.path
13 import re
14 import socket
15 import string
16 import sys
17 import time
18 import urllib
19 import urllib2
20
21 std_headers = {
22         'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.8) Gecko/2009032609 Firefox/3.0.8',
23         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
24         'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
25         'Accept-Language': 'en-us,en;q=0.5',
26 }
27
28 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
29
30 class DownloadError(Exception):
31         """Download Error exception.
32         
33         This exception may be thrown by FileDownloader objects if they are not
34         configured to continue on errors. They will contain the appropriate
35         error message.
36         """
37         pass
38
39 class SameFileError(Exception):
40         """Same File exception.
41
42         This exception will be thrown by FileDownloader objects if they detect
43         multiple files would have to be downloaded to the same file on disk.
44         """
45         pass
46
47 class PostProcessingError(Exception):
48         """Post Processing exception.
49
50         This exception may be raised by PostProcessor's .run() method to
51         indicate an error in the postprocessing task.
52         """
53         pass
54
55 class UnavailableFormatError(Exception):
56         """Unavailable Format exception.
57
58         This exception will be thrown when a video is requested
59         in a format that is not available for that video.
60         """
61         pass
62
63 class ContentTooShortError(Exception):
64         """Content Too Short exception.
65
66         This exception may be raised by FileDownloader objects when a file they
67         download is smaller than what the server first announced, indicating
68         that the connection was probably interrupted.
69         """
70         # Both in bytes
71         downloaded = None
72         expected = None
73
74         def __init__(self, downloaded, expected):
75                 self.downloaded = downloaded
76                 self.expected = expected
77
78 class FileDownloader(object):
79         """File Downloader class.
80
81         File downloader objects are responsible for downloading the actual
82         video file and writing it to disk if the user has requested it,
83         among other tasks. In most cases there should be one per program.
84         Given a video URL, the downloader does not know how to extract the
85         needed information; that is the task of the InfoExtractors, so it
86         has to pass the URL to one of them.
87
88         For this, file downloader objects have a method that allows
89         InfoExtractors to be registered in a given order. When it is passed
90         a URL, the file downloader hands it to the first InfoExtractor it
91         finds that reports being able to handle it. The InfoExtractor extracts
92         all the information about the video or videos the URL refers to, and
93         asks the FileDownloader to process the video information, possibly
94         downloading the video.
95
96         A file downloader accepts a lot of parameters. To avoid saturating
97         the object constructor with arguments, it receives a dictionary of
98         options instead. These options are available through the params
99         attribute for the InfoExtractors to use. The FileDownloader also
100         registers itself as the downloader in charge of the InfoExtractors
101         that are added to it, so this is a "mutual registration".
102
103         Available options:
104
105         username:       Username for authentication purposes.
106         password:       Password for authentication purposes.
107         usenetrc:       Use netrc for authentication instead.
108         quiet:          Do not print messages to stdout.
109         forceurl:       Force printing final URL.
110         forcetitle:     Force printing title.
111         simulate:       Do not download the video files.
112         format:         Video format code.
113         outtmpl:        Template for output names.
114         ignoreerrors:   Do not stop on download errors.
115         ratelimit:      Download speed limit, in bytes/sec.
116         nooverwrites:   Prevent overwriting files.
117         continuedl:     Try to continue downloads if possible.
118         """
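            # Illustrative usage sketch (not part of the original source). The option
            # names mirror the "Available options" list above; the video URL is a
            # hypothetical placeholder.
            #
            #   fd = FileDownloader({
            #           'outtmpl': u'%(stitle)s-%(id)s.%(ext)s',
            #           'quiet': False,
            #           'ignoreerrors': False,
            #           'nooverwrites': True,
            #           'continuedl': True,
            #           })
            #   fd.add_info_extractor(YoutubeIE())
            #   retcode = fd.download(['http://www.youtube.com/watch?v=EXAMPLE_ID'])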
119
120         params = None
121         _ies = []
122         _pps = []
123         _download_retcode = None
124
125         def __init__(self, params):
126                 """Create a FileDownloader object with the given options."""
127                 self._ies = []
128                 self._pps = []
129                 self._download_retcode = 0
130                 self.params = params
131         
132         @staticmethod
133         def pmkdir(filename):
134                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
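                    # For example (illustrative, POSIX paths assumed):
                    #   pmkdir('/tmp/videos/clip.flv') creates '/tmp/' and '/tmp/videos/'
                    #   as needed; the last component is treated as the file name and
                    #   is not created.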
135                 components = filename.split(os.sep)
136                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
137                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
138                 for dir in aggregate:
139                         if not os.path.exists(dir):
140                                 os.mkdir(dir)
141         
142         @staticmethod
143         def format_bytes(bytes):
144                 if bytes is None:
145                         return 'N/A'
146                 if bytes == 0:
147                         exponent = 0
148                 else:
149                         exponent = long(math.log(float(bytes), 1024.0))
150                 suffix = 'bkMGTPEZY'[exponent]
151                 converted = float(bytes) / float(1024**exponent)
152                 return '%.2f%s' % (converted, suffix)
153
154         @staticmethod
155         def calc_percent(byte_counter, data_len):
156                 if data_len is None:
157                         return '---.-%'
158                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
159
160         @staticmethod
161         def calc_eta(start, now, total, current):
162                 if total is None:
163                         return '--:--'
164                 dif = now - start
165                 if current == 0 or dif < 0.001: # One millisecond
166                         return '--:--'
167                 rate = float(current) / dif
168                 eta = long((float(total) - float(current)) / rate)
169                 (eta_mins, eta_secs) = divmod(eta, 60)
170                 if eta_mins > 99:
171                         return '--:--'
172                 return '%02d:%02d' % (eta_mins, eta_secs)
173
174         @staticmethod
175         def calc_speed(start, now, bytes):
176                 dif = now - start
177                 if bytes == 0 or dif < 0.001: # One millisecond
178                         return '%10s' % '---b/s'
179                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
180
181         @staticmethod
182         def best_block_size(elapsed_time, bytes):
183                 new_min = max(bytes / 2.0, 1.0)
184                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
185                 if elapsed_time < 0.001:
186                         return long(new_max)
187                 rate = bytes / elapsed_time
188                 if rate > new_max:
189                         return long(new_max)
190                 if rate < new_min:
191                         return long(new_min)
192                 return long(rate)
193
194         @staticmethod
195         def parse_bytes(bytestr):
196                 """Parse a string indicating a byte quantity into a long integer."""
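                    # Illustrative values, assuming binary (1024-based) multiples:
                    #   parse_bytes('50k')  -> 51200L
                    #   parse_bytes('0.5m') -> 524288L
                    #   parse_bytes('oops') -> None  (no match)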
197                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
198                 if matchobj is None:
199                         return None
200                 number = float(matchobj.group(1))
201                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
202                 return long(round(number * multiplier))
203
204         @staticmethod
205         def verify_url(url):
206                 """Verify a URL is valid and data could be downloaded."""
207                 request = urllib2.Request(url, None, std_headers)
208                 data = urllib2.urlopen(request)
209                 data.read(1)
210                 data.close()
211
212         def add_info_extractor(self, ie):
213                 """Add an InfoExtractor object to the end of the list."""
214                 self._ies.append(ie)
215                 ie.set_downloader(self)
216         
217         def add_post_processor(self, pp):
218                 """Add a PostProcessor object to the end of the chain."""
219                 self._pps.append(pp)
220                 pp.set_downloader(self)
221         
222         def to_stdout(self, message, skip_eol=False):
223                 """Print message to stdout if not in quiet mode."""
224                 if not self.params.get('quiet', False):
225                         print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(locale.getpreferredencoding()),
226                         sys.stdout.flush()
227         
228         def to_stderr(self, message):
229                 """Print message to stderr."""
230                 print >>sys.stderr, message
231         
232         def fixed_template(self):
233                 """Checks if the output template is fixed."""
234                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
235
236         def trouble(self, message=None):
237                 """Determine action to take when a download problem appears.
238
239                 Depending on whether the downloader has been configured to ignore
240                 download errors, this method may raise an exception after printing
241                 the error message.
242                 """
243                 if message is not None:
244                         self.to_stderr(message)
245                 if not self.params.get('ignoreerrors', False):
246                         raise DownloadError(message)
247                 self._download_retcode = 1
248
249         def slow_down(self, start_time, byte_counter):
250                 """Sleep if the download speed is over the rate limit."""
251                 rate_limit = self.params.get('ratelimit', None)
252                 if rate_limit is None or byte_counter == 0:
253                         return
254                 now = time.time()
255                 elapsed = now - start_time
256                 if elapsed <= 0.0:
257                         return
258                 speed = float(byte_counter) / elapsed
259                 if speed > rate_limit:
260                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
261
262         def report_destination(self, filename):
263                 """Report destination filename."""
264                 self.to_stdout(u'[download] Destination: %s' % filename)
265         
266         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
267                 """Report download progress."""
268                 self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
269                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
270
271         def report_resuming_byte(self, resume_len):
272                 """Report attempt to resume at given byte."""
273                 self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)
274         
275         def report_file_already_downloaded(self, file_name):
276                 """Report file has already been fully downloaded."""
277                 self.to_stdout(u'[download] %s has already been downloaded' % file_name)
278         
279         def report_unable_to_resume(self):
280                 """Report it was impossible to resume download."""
281                 self.to_stdout(u'[download] Unable to resume')
282         
283         def report_finish(self):
284                 """Report download finished."""
285                 self.to_stdout(u'')
286
287         def process_info(self, info_dict):
288                 """Process a single dictionary returned by an InfoExtractor."""
289                 # Do nothing else if in simulate mode
290                 if self.params.get('simulate', False):
291                         try:
292                                 self.verify_url(info_dict['url'])
293                         except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
294                                 raise UnavailableFormatError
295
296                         # Forced printings
297                         if self.params.get('forcetitle', False):
298                                 print info_dict['title'].encode(locale.getpreferredencoding())
299                         if self.params.get('forceurl', False):
300                                 print info_dict['url'].encode(locale.getpreferredencoding())
301
302                         return
303                         
304                 try:
305                         template_dict = dict(info_dict)
306                         template_dict['epoch'] = unicode(long(time.time()))
307                         filename = self.params['outtmpl'] % template_dict
308                         self.report_destination(filename)
309                 except (ValueError, KeyError), err:
310                         self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
                            return
311                 if self.params['nooverwrites'] and os.path.exists(filename):
312                         self.to_stderr('WARNING: file exists: %s; skipping' % filename)
313                         return
314
315                 try:
316                         self.pmkdir(filename)
317                 except (OSError, IOError), err:
318                         self.trouble('ERROR: unable to create directories: %s' % str(err))
319                         return
320
321                 try:
322                         outstream = open(filename, 'ab')
323                 except (OSError, IOError), err:
324                         self.trouble('ERROR: unable to open for writing: %s' % str(err))
325                         return
326
327                 try:
328                         self._do_download(outstream, info_dict['url'])
329                         outstream.close()
330                 except (OSError, IOError), err:
331                         outstream.close()
332                         os.remove(filename)
333                         raise UnavailableFormatError
334                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
335                         self.trouble('ERROR: unable to download video data: %s' % str(err))
336                         return
337                 except (ContentTooShortError, ), err:
338                         self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
339                         return
340
341                 try:
342                         self.post_process(filename, info_dict)
343                 except (PostProcessingError), err:
344                         self.trouble('ERROR: postprocessing: %s' % str(err))
345                         return
346
347         def download(self, url_list):
348                 """Download a given list of URLs."""
349                 if len(url_list) > 1 and self.fixed_template():
350                         raise SameFileError(self.params['outtmpl'])
351
352                 for url in url_list:
353                         suitable_found = False
354                         for ie in self._ies:
355                                 # Go to next InfoExtractor if not suitable
356                                 if not ie.suitable(url):
357                                         continue
358
359                                 # Suitable InfoExtractor found
360                                 suitable_found = True
361
362                                 # Extract information from URL and process it
363                                 ie.extract(url)
364
365                                 # Suitable InfoExtractor had been found; go to next URL
366                                 break
367
368                         if not suitable_found:
369                                 self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
370
371                 return self._download_retcode
372
373         def post_process(self, filename, ie_info):
374                 """Run the postprocessing chain on the given file."""
375                 info = dict(ie_info)
376                 info['filepath'] = filename
377                 for pp in self._pps:
378                         info = pp.run(info)
379                         if info is None:
380                                 break
381         
382         def _do_download(self, stream, url):
383                 basic_request = urllib2.Request(url, None, std_headers)
384                 request = urllib2.Request(url, None, std_headers)
385
386                 # Resume transfer if filesize is non-zero
387                 resume_len = stream.tell()
388                 if self.params['continuedl'] and resume_len != 0:
389                         self.report_resuming_byte(resume_len)
390                         request.add_header('Range','bytes=%d-' % resume_len)
391                 else:
392                         stream.close()
393                         stream = open(stream.name,'wb')
394                 try:
395                         data = urllib2.urlopen(request)
396                 except urllib2.HTTPError, e:
397                         if not e.code == 416: #  416 is 'Requested range not satisfiable'
398                                 raise
399                         data = urllib2.urlopen(basic_request)
400                         content_length = data.info()['Content-Length']
401                         if content_length is not None and long(content_length) == resume_len:
402                                 self.report_file_already_downloaded(stream.name)
403                                 return
404                         else:
405                                 self.report_unable_to_resume()
406                                 stream.close()
407                                 stream = open(stream.name,'wb')
408
409                 data_len = data.info().get('Content-length', None)
410                 data_len_str = self.format_bytes(data_len)
411                 byte_counter = 0
412                 block_size = 1024
413                 start = time.time()
414                 while True:
415                         # Progress message
416                         percent_str = self.calc_percent(byte_counter, data_len)
417                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
418                         speed_str = self.calc_speed(start, time.time(), byte_counter)
419                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
420
421                         # Download and write
422                         before = time.time()
423                         data_block = data.read(block_size)
424                         after = time.time()
425                         data_block_len = len(data_block)
426                         if data_block_len == 0:
427                                 break
428                         byte_counter += data_block_len
429                         stream.write(data_block)
430                         block_size = self.best_block_size(after - before, data_block_len)
431
432                         # Apply rate limit
433                         self.slow_down(start, byte_counter)
434
435                 self.report_finish()
436                 if data_len is not None and str(byte_counter) != data_len:
437                         raise ContentTooShortError(byte_counter, long(data_len))
438
439 class InfoExtractor(object):
440         """Information Extractor class.
441
442         Information extractors are the classes that, given a URL, extract
443         information from the video (or videos) the URL refers to. This
444         information includes the real video URL, the video title, the simplified
445         title, the uploader and others. The information is stored in a dictionary
446         which is then passed to the FileDownloader. The FileDownloader processes
447         this information, possibly downloading the video to the file system,
448         among other possible outcomes. The dictionaries must include
449         the following fields:
450
451         id:             Video identifier.
452         url:            Final video URL.
453         uploader:       Nickname of the video uploader.
454         title:          Literal title.
455         stitle:         Simplified title.
456         ext:            Video filename extension.
457
458         Subclasses of this one should re-define the _real_initialize() and
459         _real_extract() methods, as well as the suitable() static method.
460         They will usually also be instantiated and added to the main
461         downloader.
462         """
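            # Minimal subclass sketch (illustrative only; "SomeSiteIE" and its URL
            # pattern are hypothetical and not part of this program):
            #
            #   class SomeSiteIE(InfoExtractor):
            #           @staticmethod
            #           def suitable(url):
            #                   return (re.match(r'(?:http://)?(?:www\.)?somesite\.example/.+', url) is not None)
            #
            #           def _real_extract(self, url):
            #                   self._downloader.process_info({
            #                           'id':           u'12345',
            #                           'url':          u'http://somesite.example/video.flv',
            #                           'uploader':     u'someone',
            #                           'title':        u'Some title',
            #                           'stitle':       u'Some_title',
            #                           'ext':          u'flv',
            #                   })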
463
464         _ready = False
465         _downloader = None
466
467         def __init__(self, downloader=None):
468                 """Constructor. Receives an optional downloader."""
469                 self._ready = False
470                 self.set_downloader(downloader)
471
472         @staticmethod
473         def suitable(url):
474                 """Receives a URL and returns True if suitable for this IE."""
475                 return False
476
477         def initialize(self):
478                 """Initializes an instance (authentication, etc)."""
479                 if not self._ready:
480                         self._real_initialize()
481                         self._ready = True
482
483         def extract(self, url):
484                 """Extracts URL information and returns it as a list of dicts."""
485                 self.initialize()
486                 return self._real_extract(url)
487
488         def set_downloader(self, downloader):
489                 """Sets the downloader for this IE."""
490                 self._downloader = downloader
491         
492         def _real_initialize(self):
493                 """Real initialization process. Redefine in subclasses."""
494                 pass
495
496         def _real_extract(self, url):
497                 """Real extraction process. Redefine in subclasses."""
498                 pass
499
500 class YoutubeIE(InfoExtractor):
501         """Information extractor for youtube.com."""
502
503         _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
504         _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
505         _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
506         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
507         _NETRC_MACHINE = 'youtube'
508         _available_formats = ['22', '35', '18', '17', '13'] # listed in order of priority for -b flag
509         _video_extensions = {
510                 '13': '3gp',
511                 '17': 'mp4',
512                 '18': 'mp4',
513                 '22': 'mp4',
514         }
515
516         @staticmethod
517         def suitable(url):
518                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
519
520         @staticmethod
521         def htmlentity_transform(matchobj):
522                 """Transforms an HTML entity to a Unicode character."""
523                 entity = matchobj.group(1)
524
525                 # Known non-numeric HTML entity
526                 if entity in htmlentitydefs.name2codepoint:
527                         return unichr(htmlentitydefs.name2codepoint[entity])
528
529                 # Unicode character
530                 mobj = re.match(ur'(?u)#(x?\d+)', entity)
531                 if mobj is not None:
532                         numstr = mobj.group(1)
533                         if numstr.startswith(u'x'):
534                                 base = 16
535                                 numstr = u'0%s' % numstr
536                         else:
537                                 base = 10
538                         return unichr(long(numstr, base))
539
540                 # Unknown entity in name, return its literal representation
541                 return (u'&%s;' % entity)
542
543         def report_lang(self):
544                 """Report attempt to set language."""
545                 self._downloader.to_stdout(u'[youtube] Setting language')
546
547         def report_login(self):
548                 """Report attempt to log in."""
549                 self._downloader.to_stdout(u'[youtube] Logging in')
550         
551         def report_age_confirmation(self):
552                 """Report attempt to confirm age."""
553                 self._downloader.to_stdout(u'[youtube] Confirming age')
554         
555         def report_webpage_download(self, video_id):
556                 """Report attempt to download webpage."""
557                 self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)
558         
559         def report_information_extraction(self, video_id):
560                 """Report attempt to extract video information."""
561                 self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
562         
563         def report_video_url(self, video_id, video_real_url):
564                 """Report extracted video URL."""
565                 self._downloader.to_stdout(u'[youtube] %s: URL: %s' % (video_id, video_real_url))
566         
567         def report_unavailable_format(self, video_id, format):
568                 """Report that the given format is not available."""
569                 self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))
570         
571         def _real_initialize(self):
572                 if self._downloader is None:
573                         return
574
575                 username = None
576                 password = None
577                 downloader_params = self._downloader.params
578
579                 # Attempt to use provided username and password or .netrc data
580                 if downloader_params.get('username', None) is not None:
581                         username = downloader_params['username']
582                         password = downloader_params['password']
583                 elif downloader_params.get('usenetrc', False):
584                         try:
585                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
586                                 if info is not None:
587                                         username = info[0]
588                                         password = info[2]
589                                 else:
590                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
591                         except (IOError, netrc.NetrcParseError), err:
592                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
593                                 return
594
595                 # Set language
596                 request = urllib2.Request(self._LANG_URL, None, std_headers)
597                 try:
598                         self.report_lang()
599                         urllib2.urlopen(request).read()
600                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
601                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
602                         return
603
604                 # No authentication to be performed
605                 if username is None:
606                         return
607
608                 # Log in
609                 login_form = {
610                                 'current_form': 'loginForm',
611                                 'next':         '/',
612                                 'action_login': 'Log In',
613                                 'username':     username,
614                                 'password':     password,
615                                 }
616                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
617                 try:
618                         self.report_login()
619                         login_results = urllib2.urlopen(request).read()
620                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
621                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
622                                 return
623                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
624                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
625                         return
626         
627                 # Confirm age
628                 age_form = {
629                                 'next_url':             '/',
630                                 'action_confirm':       'Confirm',
631                                 }
632                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
633                 try:
634                         self.report_age_confirmation()
635                         age_results = urllib2.urlopen(request).read()
636                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
637                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
638                         return
639
640         def _real_extract(self, url):
641                 # Extract video id from URL
642                 mobj = re.match(self._VALID_URL, url)
643                 if mobj is None:
644                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
645                         return
646                 video_id = mobj.group(2)
647
648                 # Downloader parameters
649                 best_quality = False
650                 format_param = None
651                 quality_index = 0
652                 if self._downloader is not None:
653                         params = self._downloader.params
654                         format_param = params.get('format', None)
655                         if format_param == '0':
656                                 format_param = self._available_formats[quality_index]
657                                 best_quality = True
658
659                 while True:
660                         # Extension
661                         video_extension = self._video_extensions.get(format_param, 'flv')
662
663                         # Normalize URL, including format
664                         normalized_url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id
665                         if format_param is not None:
666                                 normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
667                         request = urllib2.Request(normalized_url, None, std_headers)
668                         try:
669                                 self.report_webpage_download(video_id)
670                                 video_webpage = urllib2.urlopen(request).read()
671                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
672                                 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
673                                 return
674                         self.report_information_extraction(video_id)
675                         
676                         # "t" param
677                         mobj = re.search(r', "t": "([^"]+)"', video_webpage)
678                         if mobj is None:
679                                 self._downloader.trouble(u'ERROR: unable to extract "t" parameter')
680                                 return
681                         video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&el=detailpage&ps=' % (video_id, mobj.group(1))
682                         if format_param is not None:
683                                 video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
684                         self.report_video_url(video_id, video_real_url)
685
686                         # uploader
687                         mobj = re.search(r"var watchUsername = '([^']+)';", video_webpage)
688                         if mobj is None:
689                                 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
690                                 return
691                         video_uploader = mobj.group(1)
692
693                         # title
694                         mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
695                         if mobj is None:
696                                 self._downloader.trouble(u'ERROR: unable to extract video title')
697                                 return
698                         video_title = mobj.group(1).decode('utf-8')
699                         video_title = re.sub(ur'(?u)&(.+?);', self.htmlentity_transform, video_title)
700                         video_title = video_title.replace(os.sep, u'%')
701
702                         # simplified title
703                         simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
704                         simple_title = simple_title.strip(ur'_')
705
706                         try:
707                                 # Process video information
708                                 self._downloader.process_info({
709                                         'id':           video_id.decode('utf-8'),
710                                         'url':          video_real_url.decode('utf-8'),
711                                         'uploader':     video_uploader.decode('utf-8'),
712                                         'title':        video_title,
713                                         'stitle':       simple_title,
714                                         'ext':          video_extension.decode('utf-8'),
715                                 })
716
717                                 return
718
719                         except UnavailableFormatError, err:
720                                 if best_quality:
721                                         if quality_index == len(self._available_formats) - 1:
722                                                 # I don't ever expect this to happen
723                                                 self._downloader.trouble(u'ERROR: no known formats available for video')
724                                                 return
725                                         else:
726                                                 self.report_unavailable_format(video_id, format_param)
727                                                 quality_index += 1
728                                                 format_param = self._available_formats[quality_index]
729                                                 continue
730                                 else: 
731                                         self._downloader.trouble('ERROR: format not available for video')
732                                         return
733
734
735 class MetacafeIE(InfoExtractor):
736         """Information Extractor for metacafe.com."""
737
738         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
739         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
740         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
741         _youtube_ie = None
742
743         def __init__(self, youtube_ie, downloader=None):
744                 InfoExtractor.__init__(self, downloader)
745                 self._youtube_ie = youtube_ie
746
747         @staticmethod
748         def suitable(url):
749                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
750
751         def report_disclaimer(self):
752                 """Report disclaimer retrieval."""
753                 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
754
755         def report_age_confirmation(self):
756                 """Report attempt to confirm age."""
757                 self._downloader.to_stdout(u'[metacafe] Confirming age')
758         
759         def report_download_webpage(self, video_id):
760                 """Report webpage download."""
761                 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
762         
763         def report_extraction(self, video_id):
764                 """Report information extraction."""
765                 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
766
767         def _real_initialize(self):
768                 # Retrieve disclaimer
769                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
770                 try:
771                         self.report_disclaimer()
772                         disclaimer = urllib2.urlopen(request).read()
773                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
774                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
775                         return
776
777                 # Confirm age
778                 disclaimer_form = {
779                         'filters': '0',
780                         'submit': "Continue - I'm over 18",
781                         }
782                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
783                 try:
784                         self.report_age_confirmation()
785                         disclaimer = urllib2.urlopen(request).read()
786                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
787                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
788                         return
789         
790         def _real_extract(self, url):
791                 # Extract id and simplified title from URL
792                 mobj = re.match(self._VALID_URL, url)
793                 if mobj is None:
794                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
795                         return
796
797                 video_id = mobj.group(1)
798
799                 # Check if video comes from YouTube
800                 mobj2 = re.match(r'^yt-(.*)$', video_id)
801                 if mobj2 is not None:
802                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
803                         return
804
805                 simple_title = mobj.group(2).decode('utf-8')
806                 video_extension = 'flv'
807
808                 # Retrieve video webpage to extract further information
809                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
810                 try:
811                         self.report_download_webpage(video_id)
812                         webpage = urllib2.urlopen(request).read()
813                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
814                         self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % str(err))
815                         return
816
817                 # Extract URL, uploader and title from webpage
818                 self.report_extraction(video_id)
819                 mobj = re.search(r'(?m)&mediaURL=(http.*?\.flv)', webpage)
820                 if mobj is None:
821                         self._downloader.trouble(u'ERROR: unable to extract media URL')
822                         return
823                 mediaURL = urllib.unquote(mobj.group(1))
824
825                 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
826                 if mobj is None:
827                         self._downloader.trouble(u'ERROR: unable to extract gdaKey')
828                         return
829                 gdaKey = mobj.group(1)
830
831                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
832
833                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
834                 if mobj is None:
835                         self._downloader.trouble(u'ERROR: unable to extract title')
836                         return
837                 video_title = mobj.group(1).decode('utf-8')
838
839                 mobj = re.search(r'(?ms)<li id="ChnlUsr">.*?Submitter:.*?<a .*?>(.*?)<', webpage)
840                 if mobj is None:
841                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
842                         return
843                 video_uploader = mobj.group(1)
844
845                 try:
846                         # Process video information
847                         self._downloader.process_info({
848                                 'id':           video_id.decode('utf-8'),
849                                 'url':          video_url.decode('utf-8'),
850                                 'uploader':     video_uploader.decode('utf-8'),
851                                 'title':        video_title,
852                                 'stitle':       simple_title,
853                                 'ext':          video_extension.decode('utf-8'),
854                         })
855                 except UnavailableFormatError:
856                         self._downloader.trouble(u'ERROR: format not available for video')
857
858
859 class YoutubeSearchIE(InfoExtractor):
860         """Information Extractor for YouTube search queries."""
861         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
862         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
863         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
864         _MORE_PAGES_INDICATOR = r'>Next</a>'
865         _youtube_ie = None
866         _max_youtube_results = 1000
867
868         def __init__(self, youtube_ie, downloader=None):
869                 InfoExtractor.__init__(self, downloader)
870                 self._youtube_ie = youtube_ie
871         
872         @staticmethod
873         def suitable(url):
874                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
875
876         def report_download_page(self, query, pagenum):
877                 """Report attempt to download search results page with given number."""
878                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
879
880         def _real_initialize(self):
881                 self._youtube_ie.initialize()
882         
883         def _real_extract(self, query):
884                 mobj = re.match(self._VALID_QUERY, query)
885                 if mobj is None:
886                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
887                         return
888
889                 prefix, query = query.split(':')
890                 prefix = prefix[8:]
891                 if prefix == '':
892                         self._download_n_results(query, 1)
893                         return
894                 elif prefix == 'all':
895                         self._download_n_results(query, self._max_youtube_results)
896                         return
897                 else:
898                         try:
899                                 n = long(prefix)
900                                 if n <= 0:
901                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
902                                         return
903                                 elif n > self._max_youtube_results:
904                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
905                                         n = self._max_youtube_results
906                                 self._download_n_results(query, n)
907                                 return
908                         except ValueError: # parsing prefix as integer fails
909                                 self._download_n_results(query, 1)
910                                 return
911
912         def _download_n_results(self, query, n):
913                 """Downloads a specified number of results for a query"""
914
915                 video_ids = []
916                 already_seen = set()
917                 pagenum = 1
918
919                 while True:
920                         self.report_download_page(query, pagenum)
921                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
922                         request = urllib2.Request(result_url, None, std_headers)
923                         try:
924                                 page = urllib2.urlopen(request).read()
925                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
926                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
927                                 return
928
929                         # Extract video identifiers
930                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
931                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
932                                 if video_id not in already_seen:
933                                         video_ids.append(video_id)
934                                         already_seen.add(video_id)
935                                         if len(video_ids) == n:
936                                                 # Specified n videos reached
937                                                 for id in video_ids:
938                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
939                                                 return
940
941                         if self._MORE_PAGES_INDICATOR not in page:
942                                 for id in video_ids:
943                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
944                                 return
945
946                         pagenum = pagenum + 1
947
948 class YoutubePlaylistIE(InfoExtractor):
949         """Information Extractor for YouTube playlists."""
950
951         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/view_play_list\?p=(.+)'
952         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
953         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
954         _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&amp;page=%s'
955         _youtube_ie = None
956
957         def __init__(self, youtube_ie, downloader=None):
958                 InfoExtractor.__init__(self, downloader)
959                 self._youtube_ie = youtube_ie
960         
961         @staticmethod
962         def suitable(url):
963                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
964
965         def report_download_page(self, playlist_id, pagenum):
966                 """Report attempt to download playlist page with given number."""
967                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
968
969         def _real_initialize(self):
970                 self._youtube_ie.initialize()
971         
972         def _real_extract(self, url):
973                 # Extract playlist id
974                 mobj = re.match(self._VALID_URL, url)
975                 if mobj is None:
976                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
977                         return
978
979                 # Download playlist pages
980                 playlist_id = mobj.group(1)
981                 video_ids = []
982                 pagenum = 1
983
984                 while True:
985                         self.report_download_page(playlist_id, pagenum)
986                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
987                         try:
988                                 page = urllib2.urlopen(request).read()
989                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
990                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
991                                 return
992
993                         # Extract video identifiers
994                         ids_in_page = []
995                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
996                                 if mobj.group(1) not in ids_in_page:
997                                         ids_in_page.append(mobj.group(1))
998                         video_ids.extend(ids_in_page)
999
1000                         if (self._MORE_PAGES_INDICATOR % (playlist_id, pagenum + 1)) not in page:
1001                                 break
1002                         pagenum = pagenum + 1
1003
1004                 for id in video_ids:
1005                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1006                 return
1007
1008 class PostProcessor(object):
1009         """Post Processor class.
1010
1011         PostProcessor objects can be added to downloaders with their
1012         add_post_processor() method. When the downloader has finished a
1013         successful download, it will take its internal chain of PostProcessors
1014         and start calling the run() method on each one of them, first with
1015         an initial argument and then with the returned value of the previous
1016         PostProcessor.
1017
1018         The chain will be stopped if one of them ever returns None or the end
1019         of the chain is reached.
1020
1021         PostProcessor objects follow a "mutual registration" process similar
1022         to InfoExtractor objects.
1023         """
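            # Illustrative sketch of a trivial postprocessor (hypothetical, not part
            # of this program): it only reports the downloaded file and returns the
            # information dictionary unchanged, so the chain continues.
            #
            #   class PrintFilePP(PostProcessor):
            #           def run(self, information):
            #                   self._downloader.to_stdout(u'[postprocess] %s' % information['filepath'])
            #                   return information
            #
            #   fd.add_post_processor(PrintFilePP())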
1024
1025         _downloader = None
1026
1027         def __init__(self, downloader=None):
1028                 self._downloader = downloader
1029
1030         def set_downloader(self, downloader):
1031                 """Sets the downloader for this PP."""
1032                 self._downloader = downloader
1033         
1034         def run(self, information):
1035                 """Run the PostProcessor.
1036
1037                 The "information" argument is a dictionary like the ones
1038                 composed by InfoExtractors. The only difference is that this
1039                 one has an extra field called "filepath" that points to the
1040                 downloaded file.
1041
1042                 When this method returns None, the postprocessing chain is
1043                 stopped. However, this method may return an information
1044                 dictionary that will be passed to the next postprocessing
1045                 object in the chain. It can be the one it received after
1046                 changing some fields.
1047
1048                 In addition, this method may raise a PostProcessingError
1049                 exception that will be taken into account by the downloader
1050                 it was called from.
1051                 """
1052                 return information # by default, do nothing
1053         
1054 ### MAIN PROGRAM ###
1055 if __name__ == '__main__':
1056         try:
1057                 # Modules needed only when running the main program
1058                 import getpass
1059                 import optparse
1060
1061                 # General configuration
1062                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
1063                 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
1064                 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
1065
1066                 # Parse command line
1067                 parser = optparse.OptionParser(
1068                         usage='Usage: %prog [options] url...',
1069                         version='2009.05.30',
1070                         conflict_handler='resolve',
1071                 )
1072
1073                 parser.add_option('-h', '--help',
1074                                 action='help', help='print this help text and exit')
1075                 parser.add_option('-v', '--version',
1076                                 action='version', help='print program version and exit')
1077                 parser.add_option('-i', '--ignore-errors',
1078                                 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
1079                 parser.add_option('-r', '--rate-limit',
1080                                 dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
1081
1082                 authentication = optparse.OptionGroup(parser, 'Authentication Options')
1083                 authentication.add_option('-u', '--username',
1084                                 dest='username', metavar='UN', help='account username')
1085                 authentication.add_option('-p', '--password',
1086                                 dest='password', metavar='PW', help='account password')
1087                 authentication.add_option('-n', '--netrc',
1088                                 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
1089                 parser.add_option_group(authentication)
1090
1091                 video_format = optparse.OptionGroup(parser, 'Video Format Options')
1092                 video_format.add_option('-f', '--format',
1093                                 action='store', dest='format', metavar='FMT', help='video format code')
1094                 video_format.add_option('-b', '--best-quality',
1095                                 action='store_const', dest='format', help='download the best quality video possible', const='0')
1096                 video_format.add_option('-m', '--mobile-version',
1097                                 action='store_const', dest='format', help='alias for -f 17', const='17')
1098                 video_format.add_option('-d', '--high-def',
1099                                 action='store_const', dest='format', help='alias for -f 22', const='22')
1100                 parser.add_option_group(video_format)
1101
1102                 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
1103                 verbosity.add_option('-q', '--quiet',
1104                                 action='store_true', dest='quiet', help='activates quiet mode', default=False)
1105                 verbosity.add_option('-s', '--simulate',
1106                                 action='store_true', dest='simulate', help='do not download video', default=False)
1107                 verbosity.add_option('-g', '--get-url',
1108                                 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
1109                 verbosity.add_option('-e', '--get-title',
1110                                 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
1111                 parser.add_option_group(verbosity)
1112
1113                 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
1114                 filesystem.add_option('-t', '--title',
1115                                 action='store_true', dest='usetitle', help='use title in file name', default=False)
1116                 filesystem.add_option('-l', '--literal',
1117                                 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
1118                 filesystem.add_option('-o', '--output',
1119                                 dest='outtmpl', metavar='TPL', help='output filename template')
1120                 filesystem.add_option('-a', '--batch-file',
1121                                 dest='batchfile', metavar='F', help='file containing URLs to download')
1122                 filesystem.add_option('-w', '--no-overwrites',
1123                                 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
1124                 filesystem.add_option('-c', '--continue',
1125                                 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
1126                 parser.add_option_group(filesystem)
1127
1128                 (opts, args) = parser.parse_args()
1129
1130                 # Batch file verification
1131                 batchurls = []
1132                 if opts.batchfile is not None:
1133                         try:
1134                                 batchurls = open(opts.batchfile, 'r').readlines()
1135                                 batchurls = [x.strip() for x in batchurls]
1136                                 batchurls = [x for x in batchurls if len(x) > 0]
1137                         except IOError:
1138                                 sys.exit(u'ERROR: batch file could not be read')
1139                 all_urls = batchurls + args
1140
1141                 # Conflicting, missing and erroneous options
1142                 if len(all_urls) < 1:
1143                         parser.error(u'you must provide at least one URL')
1144                 if opts.usenetrc and (opts.username is not None or opts.password is not None):
1145                         parser.error(u'using .netrc conflicts with giving username/password')
1146                 if opts.password is not None and opts.username is None:
1147                         parser.error(u'account username missing')
1148                 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
1149                         parser.error(u'using output template conflicts with using title or literal title')
1150                 if opts.usetitle and opts.useliteral:
1151                         parser.error(u'using title conflicts with using literal title')
1152                 if opts.username is not None and opts.password is None:
1153                         opts.password = getpass.getpass(u'Type account password and press return:')
1154                 if opts.ratelimit is not None:
1155                         numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
1156                         if numeric_limit is None:
1157                                 parser.error(u'invalid rate limit specified')
1158                         opts.ratelimit = numeric_limit
1159
1160                 # Information extractors
1161                 youtube_ie = YoutubeIE()
1162                 metacafe_ie = MetacafeIE(youtube_ie)
1163                 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
1164                 youtube_search_ie = YoutubeSearchIE(youtube_ie)
1165
1166                 # File downloader
1167                 fd = FileDownloader({
1168                         'usenetrc': opts.usenetrc,
1169                         'username': opts.username,
1170                         'password': opts.password,
1171                         'quiet': (opts.quiet or opts.geturl or opts.gettitle),
1172                         'forceurl': opts.geturl,
1173                         'forcetitle': opts.gettitle,
1174                         'simulate': (opts.simulate or opts.geturl or opts.gettitle),
1175                         'format': opts.format,
1176                         'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(locale.getpreferredencoding()))
1177                                 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
1178                                 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
1179                                 or u'%(id)s.%(ext)s'),
1180                         'ignoreerrors': opts.ignoreerrors,
1181                         'ratelimit': opts.ratelimit,
1182                         'nooverwrites': opts.nooverwrites,
1183                         'continuedl': opts.continue_dl,
1184                         })
1185                 fd.add_info_extractor(youtube_search_ie)
1186                 fd.add_info_extractor(youtube_pl_ie)
1187                 fd.add_info_extractor(metacafe_ie)
1188                 fd.add_info_extractor(youtube_ie)
1189                 retcode = fd.download(all_urls)
1190                 sys.exit(retcode)
1191
1192         except DownloadError:
1193                 sys.exit(1)
1194         except SameFileError:
1195                 sys.exit(u'ERROR: fixed output name but more than one file to download')
1196         except KeyboardInterrupt:
1197                 sys.exit(u'\nERROR: Interrupted by user')