Fix minor problem with size formatting method
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # License: Public domain code
6 import htmlentitydefs
7 import httplib
8 import locale
9 import math
10 import netrc
11 import os
12 import os.path
13 import re
14 import socket
15 import string
16 import sys
17 import time
18 import urllib
19 import urllib2
20
21 std_headers = {
22         'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2',
23         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
24         'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
25         'Accept-Language': 'en-us,en;q=0.5',
26 }
27
28 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
29
30 class DownloadError(Exception):
31         """Download Error exception.
32         
33         This exception may be thrown by FileDownloader objects if they are not
34         configured to continue on errors. It will contain the appropriate
35         error message.
36         """
37         pass
38
39 class SameFileError(Exception):
40         """Same File exception.
41
42         This exception will be thrown by FileDownloader objects if they detect
43         multiple files would have to be downloaded to the same file on disk.
44         """
45         pass
46
47 class PostProcessingError(Exception):
48         """Post Processing exception.
49
50         This exception may be raised by PostProcessor's .run() method to
51         indicate an error in the postprocessing task.
52         """
53         pass
54
55 class UnavailableFormatError(Exception):
56         """Unavailable Format exception.
57
58         This exception will be thrown when a video is requested
59         in a format that is not available for that video.
60         """
61         pass
62
63 class ContentTooShortError(Exception):
64         """Content Too Short exception.
65
66         This exception may be raised by FileDownloader objects when a file they
67         download is smaller than what the server initially announced, indicating
68         the connection was probably interrupted.
69         """
70         # Both in bytes
71         downloaded = None
72         expected = None
73
74         def __init__(self, downloaded, expected):
75                 self.downloaded = downloaded
76                 self.expected = expected
77
78 class FileDownloader(object):
79         """File Downloader class.
80
81         File downloader objects are the ones responsible for downloading the
82         actual video file and writing it to disk if the user has requested
83         it, among some other tasks. In most cases there should be one per
84         program. Since, given a video URL, the downloader doesn't know how to
85         extract all the needed information (a task that InfoExtractors do), it
86         has to pass the URL to one of them.
87
88         For this, file downloader objects have a method that allows
89         InfoExtractors to be registered in a given order. When it is passed
90         a URL, the file downloader hands it to the first InfoExtractor it
91         finds that reports being able to handle it. The InfoExtractor extracts
92         all the information about the video or videos the URL refers to, and
93         asks the FileDownloader to process the video information, possibly
94         downloading the video.
95
96         File downloaders accept a lot of parameters. In order not to saturate
97         the object constructor with arguments, they receive a dictionary of
98         options instead. These options are available through the params
99         attribute for the InfoExtractors to use. The FileDownloader also
100         registers itself as the downloader in charge of the InfoExtractors
101         that are added to it, so this is a "mutual registration".
102
103         Available options:
104
105         username:       Username for authentication purposes.
106         password:       Password for authentication purposes.
107         usenetrc:       Use netrc for authentication instead.
108         quiet:          Do not print messages to stdout.
109         forceurl:       Force printing final URL.
110         forcetitle:     Force printing title.
111         simulate:       Do not download the video files.
112         format:         Video format code.
113         outtmpl:        Template for output names.
114         ignoreerrors:   Do not stop on download errors.
115         ratelimit:      Download speed limit, in bytes/sec.
116         nooverwrites:   Prevent overwriting files.
117         continuedl:     Try to continue downloads if possible.
118         """
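        # A minimal usage sketch, kept as a comment only (the __main__ section at
        # the bottom of this file builds the complete options dictionary; the
        # values shown here are illustrative):
        #
        #   fd = FileDownloader({'outtmpl': u'%(id)s.%(ext)s', 'quiet': False, ...})
        #   fd.add_info_extractor(YoutubeIE())
        #   fd.add_post_processor(PostProcessor())
        #   retcode = fd.download(['http://www.youtube.com/watch?v=...'])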
119
120         params = None
121         _ies = []
122         _pps = []
123         _download_retcode = None
124
125         def __init__(self, params):
126                 """Create a FileDownloader object with the given options."""
127                 self._ies = []
128                 self._pps = []
129                 self._download_retcode = 0
130                 self.params = params
131         
132         @staticmethod
133         def pmkdir(filename):
134                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
135                 components = filename.split(os.sep)
136                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
137                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
138                 for dir in aggregate:
139                         if not os.path.exists(dir):
140                                 os.mkdir(dir)
141         
142         @staticmethod
143         def format_bytes(bytes):
144                 if bytes is None:
145                         return 'N/A'
146                 if type(bytes) is str:
147                         bytes = float(bytes)
148                 if bytes == 0.0:
149                         exponent = 0
150                 else:
151                         exponent = long(math.log(bytes, 1024.0))
152                 suffix = 'bkMGTPEZY'[exponent]
153                 converted = float(bytes) / float(1024**exponent)
154                 return '%.2f%s' % (converted, suffix)
155
156         @staticmethod
157         def calc_percent(byte_counter, data_len):
158                 if data_len is None:
159                         return '---.-%'
160                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
161
162         @staticmethod
163         def calc_eta(start, now, total, current):
164                 if total is None:
165                         return '--:--'
166                 dif = now - start
167                 if current == 0 or dif < 0.001: # One millisecond
168                         return '--:--'
169                 rate = float(current) / dif
170                 eta = long((float(total) - float(current)) / rate)
171                 (eta_mins, eta_secs) = divmod(eta, 60)
172                 if eta_mins > 99:
173                         return '--:--'
174                 return '%02d:%02d' % (eta_mins, eta_secs)
175
176         @staticmethod
177         def calc_speed(start, now, bytes):
178                 dif = now - start
179                 if bytes == 0 or dif < 0.001: # One millisecond
180                         return '%10s' % '---b/s'
181                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
182
183         @staticmethod
184         def best_block_size(elapsed_time, bytes):
185                 new_min = max(bytes / 2.0, 1.0)
186                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
187                 if elapsed_time < 0.001:
188                         return long(new_max)
189                 rate = bytes / elapsed_time
190                 if rate > new_max:
191                         return long(new_max)
192                 if rate < new_min:
193                         return long(new_min)
194                 return long(rate)
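        # Worked example (comment only): reading 1024 bytes in 0.5 seconds gives a
        # rate of 2048 bytes/s, which already lies inside the [bytes/2, bytes*2]
        # window, so best_block_size(0.5, 1024) returns 2048.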
195
196         @staticmethod
197         def parse_bytes(bytestr):
198                 """Parse a string indicating a byte quantity into a long integer."""
199                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
200                 if matchobj is None:
201                         return None
202                 number = float(matchobj.group(1))
203                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
204                 return long(round(number * multiplier))
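        # Worked examples for the two byte helpers above (comments only):
        #   format_bytes(1536)   -> '1.50k'
        #   format_bytes(None)   -> 'N/A'
        #   parse_bytes('50k')   -> 51200    (this is what --rate-limit 50k becomes)
        #   parse_bytes('44.6m') -> 46766490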
205
206         @staticmethod
207         def verify_url(url):
208                 """Verify a URL is valid and data could be downloaded."""
209                 request = urllib2.Request(url, None, std_headers)
210                 data = urllib2.urlopen(request)
211                 data.read(1)
212                 data.close()
213
214         def add_info_extractor(self, ie):
215                 """Add an InfoExtractor object to the end of the list."""
216                 self._ies.append(ie)
217                 ie.set_downloader(self)
218         
219         def add_post_processor(self, pp):
220                 """Add a PostProcessor object to the end of the chain."""
221                 self._pps.append(pp)
222                 pp.set_downloader(self)
223         
224         def to_stdout(self, message, skip_eol=False):
225                 """Print message to stdout if not in quiet mode."""
226                 if not self.params.get('quiet', False):
227                         print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(locale.getpreferredencoding()),
228                         sys.stdout.flush()
229         
230         def to_stderr(self, message):
231                 """Print message to stderr."""
232                 print >>sys.stderr, message.encode(locale.getpreferredencoding())
233         
234         def fixed_template(self):
235                 """Checks if the output template is fixed."""
236                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
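        # e.g. the default outtmpl u'%(id)s.%(ext)s' is not fixed, while a literal
        # name such as u'video.flv' is, and can therefore only hold one download.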
237
238         def trouble(self, message=None):
239                 """Determine action to take when a download problem appears.
240
241                 Depending on whether the downloader has been configured to ignore
242                 download errors or not, this method may raise an exception or
243                 not when errors are found, after printing the message.
244                 """
245                 if message is not None:
246                         self.to_stderr(message)
247                 if not self.params.get('ignoreerrors', False):
248                         raise DownloadError(message)
249                 self._download_retcode = 1
250
251         def slow_down(self, start_time, byte_counter):
252                 """Sleep if the download speed is over the rate limit."""
253                 rate_limit = self.params.get('ratelimit', None)
254                 if rate_limit is None or byte_counter == 0:
255                         return
256                 now = time.time()
257                 elapsed = now - start_time
258                 if elapsed <= 0.0:
259                         return
260                 speed = float(byte_counter) / elapsed
261                 if speed > rate_limit:
262                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
263
264         def report_destination(self, filename):
265                 """Report destination filename."""
266                 self.to_stdout(u'[download] Destination: %s' % filename)
267         
268         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
269                 """Report download progress."""
270                 self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
271                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
272
273         def report_resuming_byte(self, resume_len):
274                 """Report attempt to resume at given byte."""
275                 self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)
276         
277         def report_file_already_downloaded(self, file_name):
278                 """Report file has already been fully downloaded."""
279                 self.to_stdout(u'[download] %s has already been downloaded' % file_name)
280         
281         def report_unable_to_resume(self):
282                 """Report it was impossible to resume download."""
283                 self.to_stdout(u'[download] Unable to resume')
284         
285         def report_finish(self):
286                 """Report download finished."""
287                 self.to_stdout(u'')
288
289         def process_info(self, info_dict):
290                 """Process a single dictionary returned by an InfoExtractor."""
291                 # Do nothing else if in simulate mode
292                 if self.params.get('simulate', False):
293                         try:
294                                 self.verify_url(info_dict['url'])
295                         except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
296                                 raise UnavailableFormatError
297
298                         # Forced printings
299                         if self.params.get('forcetitle', False):
300                                 print info_dict['title'].encode(locale.getpreferredencoding())
301                         if self.params.get('forceurl', False):
302                                 print info_dict['url'].encode(locale.getpreferredencoding())
303
304                         return
305                         
306                 try:
307                         template_dict = dict(info_dict)
308                         template_dict['epoch'] = unicode(long(time.time()))
309                         filename = self.params['outtmpl'] % template_dict
310                 except (ValueError, KeyError), err:
311                         self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
                            return
312                 if self.params['nooverwrites'] and os.path.exists(filename):
313                         self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
314                         return
315
316                 try:
317                         self.pmkdir(filename)
318                 except (OSError, IOError), err:
319                         self.trouble('ERROR: unable to create directories: %s' % str(err))
320                         return
321
322                 try:
323                         success = self._do_download(filename, info_dict['url'])
324                 except (OSError, IOError), err:
325                         raise UnavailableFormatError
326                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
327                         self.trouble('ERROR: unable to download video data: %s' % str(err))
328                         return
329                 except (ContentTooShortError, ), err:
330                         self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
331                         return
332
333                 if success:
334                         try:
335                                 self.post_process(filename, info_dict)
336                         except (PostProcessingError), err:
337                                 self.trouble('ERROR: postprocessing: %s' % str(err))
338                                 return
339
340         def download(self, url_list):
341                 """Download a given list of URLs."""
342                 if len(url_list) > 1 and self.fixed_template():
343                         raise SameFileError(self.params['outtmpl'])
344
345                 for url in url_list:
346                         suitable_found = False
347                         for ie in self._ies:
348                                 # Go to next InfoExtractor if not suitable
349                                 if not ie.suitable(url):
350                                         continue
351
352                                 # Suitable InfoExtractor found
353                                 suitable_found = True
354
355                                 # Extract information from URL and process it
356                                 ie.extract(url)
357
358                                 # Suitable InfoExtractor had been found; go to next URL
359                                 break
360
361                         if not suitable_found:
362                                 self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
363
364                 return self._download_retcode
365
366         def post_process(self, filename, ie_info):
367                 """Run the postprocessing chain on the given file."""
368                 info = dict(ie_info)
369                 info['filepath'] = filename
370                 for pp in self._pps:
371                         info = pp.run(info)
372                         if info is None:
373                                 break
374         
375         def _do_download(self, filename, url):
376                 stream = None
377                 open_mode = 'ab'
378
379                 basic_request = urllib2.Request(url, None, std_headers)
380                 request = urllib2.Request(url, None, std_headers)
381
382                 # Attempt to resume download with "continuedl" option
383                 if os.path.isfile(filename):
384                         resume_len = os.path.getsize(filename)
385                 else:
386                         resume_len = 0
387                 if self.params['continuedl'] and resume_len != 0:
388                         self.report_resuming_byte(resume_len)
389                         request.add_header('Range','bytes=%d-' % resume_len)
390
391                 # Establish connection
392                 try:
393                         data = urllib2.urlopen(request)
394                 except (urllib2.HTTPError, ), err:
395                         if err.code != 416: #  416 is 'Requested range not satisfiable'
396                                 raise
397                         data = urllib2.urlopen(basic_request)
398                         content_length = data.info()['Content-Length']
399                         if content_length is not None and long(content_length) == resume_len:
400                                 self.report_file_already_downloaded(filename)
401                                 return True
402                         else:
403                                 self.report_unable_to_resume()
404                                 open_mode = 'wb'
405
406                 data_len = data.info().get('Content-length', None)
407                 data_len_str = self.format_bytes(data_len)
408                 byte_counter = 0
409                 block_size = 1024
410                 start = time.time()
411                 while True:
412                         # Download and write
413                         before = time.time()
414                         data_block = data.read(block_size)
415                         after = time.time()
416                         data_block_len = len(data_block)
417                         if data_block_len == 0:
418                                 break
419                         byte_counter += data_block_len
420
421                         # Open file just in time
422                         if stream is None:
423                                 try:
424                                         stream = open(filename, open_mode)
425                                         self.report_destination(filename)
426                                 except (OSError, IOError), err:
427                                         self.trouble('ERROR: unable to open for writing: %s' % str(err))
428                                         return False
429                         stream.write(data_block)
430                         block_size = self.best_block_size(after - before, data_block_len)
431
432                         # Progress message
433                         percent_str = self.calc_percent(byte_counter, data_len)
434                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
435                         speed_str = self.calc_speed(start, time.time(), byte_counter)
436                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
437
438                         # Apply rate limit
439                         self.slow_down(start, byte_counter)
440
441                 self.report_finish()
442                 if data_len is not None and str(byte_counter) != data_len:
443                         raise ContentTooShortError(byte_counter, long(data_len))
444                 return True
445
446 class InfoExtractor(object):
447         """Information Extractor class.
448
449         Information extractors are the classes that, given a URL, extract
450         information from the video (or videos) the URL refers to. This
451         information includes the real video URL, the video title and simplified
452         title, author and others. The information is stored in a dictionary
453         which is then passed to the FileDownloader. The FileDownloader
454         processes this information possibly downloading the video to the file
455         processes this information, possibly downloading the video to the file
456         the following fields:
457
458         id:             Video identifier.
459         url:            Final video URL.
460         uploader:       Nickname of the video uploader.
461         title:          Literal title.
462         stitle:         Simplified title.
463         ext:            Video filename extension.
464
465         Subclasses of this one should re-define the _real_initialize() and
466         _real_extract() methods, as well as the suitable() static method.
467         Typically, they should also be instantiated and added to the main
468         downloader.
469         """
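        # A minimal subclass sketch, kept as a comment (ExampleIE is hypothetical
        # and not part of this script): an IE only has to say which URLs it
        # accepts and hand a complete info dictionary to its downloader.
        #
        #   class ExampleIE(InfoExtractor):
        #           @staticmethod
        #           def suitable(url):
        #                   return url.startswith('http://video.example.com/')
        #           def _real_extract(self, url):
        #                   self._downloader.process_info({
        #                           'id': u'0000', 'url': url, 'uploader': u'unknown',
        #                           'title': u'Example video', 'stitle': u'Example_video',
        #                           'ext': u'flv',
        #                   })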
470
471         _ready = False
472         _downloader = None
473
474         def __init__(self, downloader=None):
475                 """Constructor. Receives an optional downloader."""
476                 self._ready = False
477                 self.set_downloader(downloader)
478
479         @staticmethod
480         def suitable(url):
481                 """Receives a URL and returns True if suitable for this IE."""
482                 return False
483
484         def initialize(self):
485                 """Initializes an instance (authentication, etc)."""
486                 if not self._ready:
487                         self._real_initialize()
488                         self._ready = True
489
490         def extract(self, url):
491                 """Extracts URL information and returns it as a list of dicts."""
492                 self.initialize()
493                 return self._real_extract(url)
494
495         def set_downloader(self, downloader):
496                 """Sets the downloader for this IE."""
497                 self._downloader = downloader
498         
499         def _real_initialize(self):
500                 """Real initialization process. Redefine in subclasses."""
501                 pass
502
503         def _real_extract(self, url):
504                 """Real extraction process. Redefine in subclasses."""
505                 pass
506
507 class YoutubeIE(InfoExtractor):
508         """Information extractor for youtube.com."""
509
510         _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
511         _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
512         _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
513         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
514         _NETRC_MACHINE = 'youtube'
515         _available_formats = ['22', '35', '18', '5', '17', '13'] # listed in order of priority for -b flag
516         _video_extensions = {
517                 '13': '3gp',
518                 '17': 'mp4',
519                 '18': 'mp4',
520                 '22': 'mp4',
521         }
522
523         @staticmethod
524         def suitable(url):
525                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
526
527         @staticmethod
528         def htmlentity_transform(matchobj):
529                 """Transforms an HTML entity to a Unicode character."""
530                 entity = matchobj.group(1)
531
532                 # Known non-numeric HTML entity
533                 if entity in htmlentitydefs.name2codepoint:
534                         return unichr(htmlentitydefs.name2codepoint[entity])
535
536                 # Numeric character reference (decimal or hexadecimal)
537                 mobj = re.match(ur'(?u)#(\d+|x[0-9a-fA-F]+)', entity)
538                 if mobj is not None:
539                         numstr = mobj.group(1)
540                         if numstr.startswith(u'x'):
541                                 base = 16
542                                 numstr = u'0%s' % numstr
543                         else:
544                                 base = 10
545                         return unichr(long(numstr, base))
546
547                 # Unknown entity in name, return its literal representation
548                 return (u'&%s;' % entity)
549
550         def report_lang(self):
551                 """Report attempt to set language."""
552                 self._downloader.to_stdout(u'[youtube] Setting language')
553
554         def report_login(self):
555                 """Report attempt to log in."""
556                 self._downloader.to_stdout(u'[youtube] Logging in')
557         
558         def report_age_confirmation(self):
559                 """Report attempt to confirm age."""
560                 self._downloader.to_stdout(u'[youtube] Confirming age')
561         
562         def report_webpage_download(self, video_id):
563                 """Report attempt to download webpage."""
564                 self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)
565         
566         def report_information_extraction(self, video_id):
567                 """Report attempt to extract video information."""
568                 self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
569         
570         def report_video_url(self, video_id, video_real_url):
571                 """Report extracted video URL."""
572                 self._downloader.to_stdout(u'[youtube] %s: URL: %s' % (video_id, video_real_url))
573         
574         def report_unavailable_format(self, video_id, format):
575                 """Report unavailable video format."""
576                 self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))
577         
578         def _real_initialize(self):
579                 if self._downloader is None:
580                         return
581
582                 username = None
583                 password = None
584                 downloader_params = self._downloader.params
585
586                 # Attempt to use provided username and password or .netrc data
587                 if downloader_params.get('username', None) is not None:
588                         username = downloader_params['username']
589                         password = downloader_params['password']
590                 elif downloader_params.get('usenetrc', False):
591                         try:
592                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
593                                 if info is not None:
594                                         username = info[0]
595                                         password = info[2]
596                                 else:
597                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
598                         except (IOError, netrc.NetrcParseError), err:
599                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
600                                 return
601
602                 # Set language
603                 request = urllib2.Request(self._LANG_URL, None, std_headers)
604                 try:
605                         self.report_lang()
606                         urllib2.urlopen(request).read()
607                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
608                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
609                         return
610
611                 # No authentication to be performed
612                 if username is None:
613                         return
614
615                 # Log in
616                 login_form = {
617                                 'current_form': 'loginForm',
618                                 'next':         '/',
619                                 'action_login': 'Log In',
620                                 'username':     username,
621                                 'password':     password,
622                                 }
623                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
624                 try:
625                         self.report_login()
626                         login_results = urllib2.urlopen(request).read()
627                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
628                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
629                                 return
630                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
631                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
632                         return
633         
634                 # Confirm age
635                 age_form = {
636                                 'next_url':             '/',
637                                 'action_confirm':       'Confirm',
638                                 }
639                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
640                 try:
641                         self.report_age_confirmation()
642                         age_results = urllib2.urlopen(request).read()
643                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
644                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
645                         return
646
647         def _real_extract(self, url):
648                 # Extract video id from URL
649                 mobj = re.match(self._VALID_URL, url)
650                 if mobj is None:
651                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
652                         return
653                 video_id = mobj.group(2)
654
655                 # Downloader parameters
656                 best_quality = False
657                 format_param = None
658                 quality_index = 0
659                 if self._downloader is not None:
660                         params = self._downloader.params
661                         format_param = params.get('format', None)
662                         if format_param == '0':
663                                 format_param = self._available_formats[quality_index]
664                                 best_quality = True
665
666                 while True:
667                         # Extension
668                         video_extension = self._video_extensions.get(format_param, 'flv')
669
670                         # Normalize URL, including format
671                         normalized_url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id
672                         if format_param is not None:
673                                 normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
674                         request = urllib2.Request(normalized_url, None, std_headers)
675                         try:
676                                 self.report_webpage_download(video_id)
677                                 video_webpage = urllib2.urlopen(request).read()
678                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
679                                 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
680                                 return
681                         self.report_information_extraction(video_id)
682                         
683                         # "t" param
684                         mobj = re.search(r', "t": "([^"]+)"', video_webpage)
685                         if mobj is None:
686                                 self._downloader.trouble(u'ERROR: unable to extract "t" parameter')
687                                 return
688                         video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&el=detailpage&ps=' % (video_id, mobj.group(1))
689                         if format_param is not None:
690                                 video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
691                         self.report_video_url(video_id, video_real_url)
692
693                         # uploader
694                         mobj = re.search(r"var watchUsername = '([^']+)';", video_webpage)
695                         if mobj is None:
696                                 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
697                                 return
698                         video_uploader = mobj.group(1)
699
700                         # title
701                         mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
702                         if mobj is None:
703                                 self._downloader.trouble(u'ERROR: unable to extract video title')
704                                 return
705                         video_title = mobj.group(1).decode('utf-8')
706                         video_title = re.sub(ur'(?u)&(.+?);', self.htmlentity_transform, video_title)
707                         video_title = video_title.replace(os.sep, u'%')
708
709                         # simplified title
710                         simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
711                         simple_title = simple_title.strip(ur'_')
712
713                         try:
714                                 # Process video information
715                                 self._downloader.process_info({
716                                         'id':           video_id.decode('utf-8'),
717                                         'url':          video_real_url.decode('utf-8'),
718                                         'uploader':     video_uploader.decode('utf-8'),
719                                         'title':        video_title,
720                                         'stitle':       simple_title,
721                                         'ext':          video_extension.decode('utf-8'),
722                                 })
723
724                                 return
725
726                         except UnavailableFormatError, err:
727                                 if best_quality:
728                                         if quality_index == len(self._available_formats) - 1:
729                                                 # I don't ever expect this to happen
730                                                 self._downloader.trouble(u'ERROR: no known formats available for video')
731                                                 return
732                                         else:
733                                                 self.report_unavailable_format(video_id, format_param)
734                                                 quality_index += 1
735                                                 format_param = self._available_formats[quality_index]
736                                                 continue
737                                 else: 
738                                         self._downloader.trouble('ERROR: format not available for video')
739                                         return
740
741
742 class MetacafeIE(InfoExtractor):
743         """Information Extractor for metacafe.com."""
744
745         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
746         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
747         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
748         _youtube_ie = None
749
750         def __init__(self, youtube_ie, downloader=None):
751                 InfoExtractor.__init__(self, downloader)
752                 self._youtube_ie = youtube_ie
753
754         @staticmethod
755         def suitable(url):
756                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
757
758         def report_disclaimer(self):
759                 """Report disclaimer retrieval."""
760                 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
761
762         def report_age_confirmation(self):
763                 """Report attempt to confirm age."""
764                 self._downloader.to_stdout(u'[metacafe] Confirming age')
765         
766         def report_download_webpage(self, video_id):
767                 """Report webpage download."""
768                 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
769         
770         def report_extraction(self, video_id):
771                 """Report information extraction."""
772                 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
773
774         def _real_initialize(self):
775                 # Retrieve disclaimer
776                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
777                 try:
778                         self.report_disclaimer()
779                         disclaimer = urllib2.urlopen(request).read()
780                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
781                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
782                         return
783
784                 # Confirm age
785                 disclaimer_form = {
786                         'filters': '0',
787                         'submit': "Continue - I'm over 18",
788                         }
789                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
790                 try:
791                         self.report_age_confirmation()
792                         disclaimer = urllib2.urlopen(request).read()
793                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
794                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
795                         return
796         
797         def _real_extract(self, url):
798                 # Extract id and simplified title from URL
799                 mobj = re.match(self._VALID_URL, url)
800                 if mobj is None:
801                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
802                         return
803
804                 video_id = mobj.group(1)
805
806                 # Check if video comes from YouTube
807                 mobj2 = re.match(r'^yt-(.*)$', video_id)
808                 if mobj2 is not None:
809                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
810                         return
811
812                 simple_title = mobj.group(2).decode('utf-8')
813                 video_extension = 'flv'
814
815                 # Retrieve video webpage to extract further information
816                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
817                 try:
818                         self.report_download_webpage(video_id)
819                         webpage = urllib2.urlopen(request).read()
820                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
821                         self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % str(err))
822                         return
823
824                 # Extract URL, uploader and title from webpage
825                 self.report_extraction(video_id)
826                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
827                 if mobj is None:
828                         self._downloader.trouble(u'ERROR: unable to extract media URL')
829                         return
830                 mediaURL = urllib.unquote(mobj.group(1))
831
832                 #mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
833                 #if mobj is None:
834                 #       self._downloader.trouble(u'ERROR: unable to extract gdaKey')
835                 #       return
836                 #gdaKey = mobj.group(1)
837                 #
838                 #video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
839
840                 video_url = mediaURL
841
842                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
843                 if mobj is None:
844                         self._downloader.trouble(u'ERROR: unable to extract title')
845                         return
846                 video_title = mobj.group(1).decode('utf-8')
847
848                 mobj = re.search(r'(?ms)<li id="ChnlUsr">.*?Submitter:.*?<a .*?>(.*?)<', webpage)
849                 if mobj is None:
850                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
851                         return
852                 video_uploader = mobj.group(1)
853
854                 try:
855                         # Process video information
856                         self._downloader.process_info({
857                                 'id':           video_id.decode('utf-8'),
858                                 'url':          video_url.decode('utf-8'),
859                                 'uploader':     video_uploader.decode('utf-8'),
860                                 'title':        video_title,
861                                 'stitle':       simple_title,
862                                 'ext':          video_extension.decode('utf-8'),
863                         })
864                 except UnavailableFormatError:
865                         self._downloader.trouble(u'ERROR: format not available for video')
866
867
868 class YoutubeSearchIE(InfoExtractor):
869         """Information Extractor for YouTube search queries."""
870         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
871         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
872         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
873         _MORE_PAGES_INDICATOR = r'>Next</a>'
874         _youtube_ie = None
875         _max_youtube_results = 1000
876
877         def __init__(self, youtube_ie, downloader=None):
878                 InfoExtractor.__init__(self, downloader)
879                 self._youtube_ie = youtube_ie
880         
881         @staticmethod
882         def suitable(url):
883                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
884
885         def report_download_page(self, query, pagenum):
886                 """Report attempt to download search results page with given number."""
887                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
888
889         def _real_initialize(self):
890                 self._youtube_ie.initialize()
891         
892         def _real_extract(self, query):
893                 mobj = re.match(self._VALID_QUERY, query)
894                 if mobj is None:
895                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
896                         return
897
898                 prefix, query = query.split(':')
899                 prefix = prefix[8:]
900                 if prefix == '':
901                         self._download_n_results(query, 1)
902                         return
903                 elif prefix == 'all':
904                         self._download_n_results(query, self._max_youtube_results)
905                         return
906                 else:
907                         try:
908                                 n = long(prefix)
909                                 if n <= 0:
910                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
911                                         return
912                                 elif n > self._max_youtube_results:
913                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
914                                         n = self._max_youtube_results
915                                 self._download_n_results(query, n)
916                                 return
917                         except ValueError: # parsing prefix as integer fails
918                                 self._download_n_results(query, 1)
919                                 return
920
921         def _download_n_results(self, query, n):
922                 """Downloads a specified number of results for a query."""
923
924                 video_ids = []
925                 already_seen = set()
926                 pagenum = 1
927
928                 while True:
929                         self.report_download_page(query, pagenum)
930                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
931                         request = urllib2.Request(result_url, None, std_headers)
932                         try:
933                                 page = urllib2.urlopen(request).read()
934                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
935                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
936                                 return
937
938                         # Extract video identifiers
939                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
940                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
941                                 if video_id not in already_seen:
942                                         video_ids.append(video_id)
943                                         already_seen.add(video_id)
944                                         if len(video_ids) == n:
945                                                 # Specified n videos reached
946                                                 for id in video_ids:
947                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
948                                                 return
949
950                         if self._MORE_PAGES_INDICATOR not in page:
951                                 for id in video_ids:
952                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
953                                 return
954
955                         pagenum = pagenum + 1
956
957 class YoutubePlaylistIE(InfoExtractor):
958         """Information Extractor for YouTube playlists."""
959
960         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube\.com/view_play_list\?p=(.+)'
961         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
962         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
963         _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&page=%s'
964         _youtube_ie = None
965
966         def __init__(self, youtube_ie, downloader=None):
967                 InfoExtractor.__init__(self, downloader)
968                 self._youtube_ie = youtube_ie
969         
970         @staticmethod
971         def suitable(url):
972                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
973
974         def report_download_page(self, playlist_id, pagenum):
975                 """Report attempt to download playlist page with given number."""
976                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
977
978         def _real_initialize(self):
979                 self._youtube_ie.initialize()
980         
981         def _real_extract(self, url):
982                 # Extract playlist id
983                 mobj = re.match(self._VALID_URL, url)
984                 if mobj is None:
985                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
986                         return
987
988                 # Download playlist pages
989                 playlist_id = mobj.group(1)
990                 video_ids = []
991                 pagenum = 1
992
993                 while True:
994                         self.report_download_page(playlist_id, pagenum)
995                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
996                         try:
997                                 page = urllib2.urlopen(request).read()
998                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
999                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1000                                 return
1001
1002                         # Extract video identifiers
1003                         ids_in_page = []
1004                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1005                                 if mobj.group(1) not in ids_in_page:
1006                                         ids_in_page.append(mobj.group(1))
1007                         video_ids.extend(ids_in_page)
1008
1009                         if (self._MORE_PAGES_INDICATOR % (playlist_id.upper(), pagenum + 1)) not in page:
1010                                 break
1011                         pagenum = pagenum + 1
1012
1013                 for id in video_ids:
1014                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1015                 return
1016
1017 class PostProcessor(object):
1018         """Post Processor class.
1019
1020         PostProcessor objects can be added to downloaders with their
1021         add_post_processor() method. When the downloader has finished a
1022         successful download, it will take its internal chain of PostProcessors
1023         and start calling the run() method on each one of them, first with
1024         an initial argument and then with the returned value of the previous
1025         PostProcessor.
1026
1027         The chain will be stopped if one of them ever returns None or the end
1028         of the chain is reached.
1029
1030         PostProcessor objects follow a "mutual registration" process similar
1031         to InfoExtractor objects.
1032         """
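        # A minimal subclass sketch, kept as a comment (ReportPP is hypothetical):
        # a pass-through postprocessor that only reports the downloaded file and
        # keeps the chain going by returning the information dictionary.
        #
        #   class ReportPP(PostProcessor):
        #           def run(self, information):
        #                   self._downloader.to_stdout(u'[report] %s' % information['filepath'])
        #                   return information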
1033
1034         _downloader = None
1035
1036         def __init__(self, downloader=None):
1037                 self._downloader = downloader
1038
1039         def set_downloader(self, downloader):
1040                 """Sets the downloader for this PP."""
1041                 self._downloader = downloader
1042         
1043         def run(self, information):
1044                 """Run the PostProcessor.
1045
1046                 The "information" argument is a dictionary like the ones
1047                 composed by InfoExtractors. The only difference is that this
1048                 one has an extra field called "filepath" that points to the
1049                 downloaded file.
1050
1051                 When this method returns None, the postprocessing chain is
1052                 stopped. However, this method may return an information
1053                 dictionary that will be passed to the next postprocessing
1054                 object in the chain. It can be the one it received after
1055                 changing some fields.
1056
1057                 In addition, this method may raise a PostProcessingError
1058                 exception that will be taken into account by the downloader
1059                 it was called from.
1060                 """
1061                 return information # by default, do nothing
1062         
1063 ### MAIN PROGRAM ###
1064 if __name__ == '__main__':
1065         try:
1066                 # Modules needed only when running the main program
1067                 import getpass
1068                 import optparse
1069
1070                 # General configuration
1071                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
1072                 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
1073                 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
1074
1075                 # Parse command line
1076                 parser = optparse.OptionParser(
1077                         usage='Usage: %prog [options] url...',
1078                         version='2009.06.29',
1079                         conflict_handler='resolve',
1080                 )
1081
1082                 parser.add_option('-h', '--help',
1083                                 action='help', help='print this help text and exit')
1084                 parser.add_option('-v', '--version',
1085                                 action='version', help='print program version and exit')
1086                 parser.add_option('-i', '--ignore-errors',
1087                                 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
1088                 parser.add_option('-r', '--rate-limit',
1089                                 dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
1090
1091                 authentication = optparse.OptionGroup(parser, 'Authentication Options')
1092                 authentication.add_option('-u', '--username',
1093                                 dest='username', metavar='UN', help='account username')
1094                 authentication.add_option('-p', '--password',
1095                                 dest='password', metavar='PW', help='account password')
1096                 authentication.add_option('-n', '--netrc',
1097                                 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
1098                 parser.add_option_group(authentication)
1099
1100                 video_format = optparse.OptionGroup(parser, 'Video Format Options')
1101                 video_format.add_option('-f', '--format',
1102                                 action='store', dest='format', metavar='FMT', help='video format code')
1103                 video_format.add_option('-b', '--best-quality',
1104                                 action='store_const', dest='format', help='download the best quality video possible', const='0')
1105                 video_format.add_option('-m', '--mobile-version',
1106                                 action='store_const', dest='format', help='alias for -f 17', const='17')
1107                 video_format.add_option('-d', '--high-def',
1108                                 action='store_const', dest='format', help='alias for -f 22', const='22')
1109                 parser.add_option_group(video_format)
1110
1111                 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
1112                 verbosity.add_option('-q', '--quiet',
1113                                 action='store_true', dest='quiet', help='activates quiet mode', default=False)
1114                 verbosity.add_option('-s', '--simulate',
1115                                 action='store_true', dest='simulate', help='do not download video', default=False)
1116                 verbosity.add_option('-g', '--get-url',
1117                                 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
1118                 verbosity.add_option('-e', '--get-title',
1119                                 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
1120                 parser.add_option_group(verbosity)
1121
1122                 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
1123                 filesystem.add_option('-t', '--title',
1124                                 action='store_true', dest='usetitle', help='use title in file name', default=False)
1125                 filesystem.add_option('-l', '--literal',
1126                                 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
1127                 filesystem.add_option('-o', '--output',
1128                                 dest='outtmpl', metavar='TPL', help='output filename template')
1129                 filesystem.add_option('-a', '--batch-file',
1130                                 dest='batchfile', metavar='F', help='file containing URLs to download')
1131                 filesystem.add_option('-w', '--no-overwrites',
1132                                 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
1133                 filesystem.add_option('-c', '--continue',
1134                                 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
1135                 parser.add_option_group(filesystem)
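                # Typical invocations, for illustration only (any URL accepted by the
                # registered InfoExtractors works; batch.txt is a hypothetical file):
                #   youtube-dl -t 'http://www.youtube.com/watch?v=...'
                #   youtube-dl -b -c -o '%(stitle)s-%(id)s.%(ext)s' -a batch.txt
                #   youtube-dl -g 'ytsearch3:some query'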
1136
1137                 (opts, args) = parser.parse_args()
1138
1139                 # Batch file verification
1140                 batchurls = []
1141                 if opts.batchfile is not None:
1142                         try:
1143                                 batchurls = open(opts.batchfile, 'r').readlines()
1144                                 batchurls = [x.strip() for x in batchurls]
1145                                 batchurls = [x for x in batchurls if len(x) > 0]
1146                         except IOError:
1147                                 sys.exit(u'ERROR: batch file could not be read')
1148                 all_urls = batchurls + args
1149
1150                 # Conflicting, missing and erroneous options
1151                 if len(all_urls) < 1:
1152                         parser.error(u'you must provide at least one URL')
1153                 if opts.usenetrc and (opts.username is not None or opts.password is not None):
1154                         parser.error(u'using .netrc conflicts with giving username/password')
1155                 if opts.password is not None and opts.username is None:
1156                         parser.error(u'account username missing')
1157                 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
1158                         parser.error(u'using output template conflicts with using title or literal title')
1159                 if opts.usetitle and opts.useliteral:
1160                         parser.error(u'using title conflicts with using literal title')
1161                 if opts.username is not None and opts.password is None:
1162                         opts.password = getpass.getpass(u'Type account password and press return:')
1163                 if opts.ratelimit is not None:
1164                         numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
1165                         if numeric_limit is None:
1166                                 parser.error(u'invalid rate limit specified')
1167                         opts.ratelimit = numeric_limit
1168
1169                 # Information extractors
1170                 youtube_ie = YoutubeIE()
1171                 metacafe_ie = MetacafeIE(youtube_ie)
1172                 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
1173                 youtube_search_ie = YoutubeSearchIE(youtube_ie)
1174
1175                 # File downloader
1176                 fd = FileDownloader({
1177                         'usenetrc': opts.usenetrc,
1178                         'username': opts.username,
1179                         'password': opts.password,
1180                         'quiet': (opts.quiet or opts.geturl or opts.gettitle),
1181                         'forceurl': opts.geturl,
1182                         'forcetitle': opts.gettitle,
1183                         'simulate': (opts.simulate or opts.geturl or opts.gettitle),
1184                         'format': opts.format,
1185                         'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(locale.getpreferredencoding()))
1186                                 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
1187                                 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
1188                                 or u'%(id)s.%(ext)s'),
1189                         'ignoreerrors': opts.ignoreerrors,
1190                         'ratelimit': opts.ratelimit,
1191                         'nooverwrites': opts.nooverwrites,
1192                         'continuedl': opts.continue_dl,
1193                         })
1194                 fd.add_info_extractor(youtube_search_ie)
1195                 fd.add_info_extractor(youtube_pl_ie)
1196                 fd.add_info_extractor(metacafe_ie)
1197                 fd.add_info_extractor(youtube_ie)
1198                 retcode = fd.download(all_urls)
1199                 sys.exit(retcode)
1200
1201         except DownloadError:
1202                 sys.exit(1)
1203         except SameFileError:
1204                 sys.exit(u'ERROR: fixed output name but more than one file to download')
1205         except KeyboardInterrupt:
1206                 sys.exit(u'\nERROR: Interrupted by user')