Add a --max-quality flag to limit the highest quality (fixes issue #145)
[youtube-dl.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
7 import htmlentitydefs
8 import httplib
9 import locale
10 import math
11 import netrc
12 import os
13 import os.path
14 import re
15 import socket
16 import string
17 import subprocess
18 import sys
19 import time
20 import urllib
21 import urllib2
22
# parse_qs was moved from the cgi module to the urlparse module recently.
try:
	from urlparse import parse_qs
except ImportError:
	from cgi import parse_qs

# Default HTTP headers sent with every request; a desktop-browser
# User-Agent avoids being served mobile or degraded pages.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
	'Accept-Language': 'en-us,en;q=0.5',
}

# Characters considered "safe" when building simplified titles (letters and digits)
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
37
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	Falls back to UTF-8 when the reported encoding is unusable.
	"""
	try:
		pref = locale.getpreferredencoding()
		# Verify the codec actually exists and can encode text; some
		# platforms report bogus or unsupported encodings.
		u'TEST'.encode(pref)
	except Exception:
		# Narrowed from a bare "except:" so KeyboardInterrupt and
		# SystemExit are no longer swallowed. The previous version also
		# wrapped this logic in a needless one-shot generator
		# (yield_preferredencoding().next()); a plain return is equivalent.
		pref = 'UTF-8'
	return pref
53
54 def htmlentity_transform(matchobj):
55         """Transforms an HTML entity to a Unicode character.
56         
57         This function receives a match object and is intended to be used with
58         the re.sub() function.
59         """
60         entity = matchobj.group(1)
61
62         # Known non-numeric HTML entity
63         if entity in htmlentitydefs.name2codepoint:
64                 return unichr(htmlentitydefs.name2codepoint[entity])
65
66         # Unicode character
67         mobj = re.match(ur'(?u)#(x?\d+)', entity)
68         if mobj is not None:
69                 numstr = mobj.group(1)
70                 if numstr.startswith(u'x'):
71                         base = 16
72                         numstr = u'0%s' % numstr
73                 else:
74                         base = 10
75                 return unichr(long(numstr, base))
76
77         # Unknown entity in name, return its literal representation
78         return (u'&%s;' % entity)
79
80 def sanitize_title(utitle):
81         """Sanitizes a video title so it could be used as part of a filename."""
82         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
83         return utitle.replace(unicode(os.sep), u'%')
84
def sanitize_open(filename, open_mode):
	"""Try to open the given filename, and slightly tweak it if this fails.

	Attempts to open the given filename. If this fails, it tries to change
	the filename slightly, step by step, until it's either able to open it
	or it fails and raises a final exception, like the standard open()
	function.

	It returns the tuple (stream, definitive_file_name).
	"""
	# A lone dash means "write to standard output".
	if filename == u'-':
		return (sys.stdout, filename)
	try:
		return (open(filename, open_mode), filename)
	except (IOError, OSError):
		# In case of error, try to remove win32 forbidden chars
		cleaned = re.sub(u'[/<>:"\\|\\?\\*]', u'#', filename)
		# An exception here should be caught in the caller
		return (open(cleaned, open_mode), cleaned)
107
108
class DownloadError(Exception):
	"""Download Error exception.

	Raised by FileDownloader objects when a download problem occurs and
	they have not been configured to ignore errors. The exception message
	carries the relevant error description.
	"""
	pass
117
class SameFileError(Exception):
	"""Same File exception.

	Raised by FileDownloader objects upon detecting that several files
	would end up being written to the very same path on disk.
	"""
	pass
125
class PostProcessingError(Exception):
	"""Post Processing exception.

	A PostProcessor's .run() method may raise this exception to signal
	that something went wrong during the postprocessing task.
	"""
	pass
133
class UnavailableFormatError(Exception):
	"""Unavailable Format exception.

	Raised when the user requests a video in a format that the site does
	not offer for that particular video.
	"""
	pass
141
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	Raised by FileDownloader objects when the amount of data received is
	smaller than what the server announced first, which usually means the
	connection was interrupted.
	"""
	# Both counters are expressed in bytes
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		self.downloaded, self.expected = downloaded, expected
156
157 class FileDownloader(object):
158         """File Downloader class.
159
160         File downloader objects are the ones responsible of downloading the
161         actual video file and writing it to disk if the user has requested
162         it, among some other tasks. In most cases there should be one per
163         program. As, given a video URL, the downloader doesn't know how to
164         extract all the needed information, task that InfoExtractors do, it
165         has to pass the URL to one of them.
166
167         For this, file downloader objects have a method that allows
168         InfoExtractors to be registered in a given order. When it is passed
169         a URL, the file downloader handles it to the first InfoExtractor it
170         finds that reports being able to handle it. The InfoExtractor extracts
171         all the information about the video or videos the URL refers to, and
172         asks the FileDownloader to process the video information, possibly
173         downloading the video.
174
175         File downloaders accept a lot of parameters. In order not to saturate
176         the object constructor with arguments, it receives a dictionary of
177         options instead. These options are available through the params
178         attribute for the InfoExtractors to use. The FileDownloader also
179         registers itself as the downloader in charge for the InfoExtractors
180         that are added to it, so this is a "mutual registration".
181
182         Available options:
183
184         username:       Username for authentication purposes.
185         password:       Password for authentication purposes.
186         usenetrc:       Use netrc for authentication instead.
187         quiet:          Do not print messages to stdout.
188         forceurl:       Force printing final URL.
189         forcetitle:     Force printing title.
190         simulate:       Do not download the video files.
191         format:         Video format code.
192         format_limit:   Highest quality format to try.
193         outtmpl:        Template for output names.
194         ignoreerrors:   Do not stop on download errors.
195         ratelimit:      Download speed limit, in bytes/sec.
196         nooverwrites:   Prevent overwriting files.
197         retries:        Number of times to retry for HTTP error 503
198         continuedl:     Try to continue downloads if possible.
199         noprogress:     Do not print the progress bar.
200         """
201
202         params = None
203         _ies = []
204         _pps = []
205         _download_retcode = None
206         _num_downloads = None
207
208         def __init__(self, params):
209                 """Create a FileDownloader object with the given options."""
210                 self._ies = []
211                 self._pps = []
212                 self._download_retcode = 0
213                 self._num_downloads = 0
214                 self.params = params
215         
216         @staticmethod
217         def pmkdir(filename):
218                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
219                 components = filename.split(os.sep)
220                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
221                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
222                 for dir in aggregate:
223                         if not os.path.exists(dir):
224                                 os.mkdir(dir)
225         
226         @staticmethod
227         def format_bytes(bytes):
228                 if bytes is None:
229                         return 'N/A'
230                 if type(bytes) is str:
231                         bytes = float(bytes)
232                 if bytes == 0.0:
233                         exponent = 0
234                 else:
235                         exponent = long(math.log(bytes, 1024.0))
236                 suffix = 'bkMGTPEZY'[exponent]
237                 converted = float(bytes) / float(1024**exponent)
238                 return '%.2f%s' % (converted, suffix)
239
240         @staticmethod
241         def calc_percent(byte_counter, data_len):
242                 if data_len is None:
243                         return '---.-%'
244                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
245
246         @staticmethod
247         def calc_eta(start, now, total, current):
248                 if total is None:
249                         return '--:--'
250                 dif = now - start
251                 if current == 0 or dif < 0.001: # One millisecond
252                         return '--:--'
253                 rate = float(current) / dif
254                 eta = long((float(total) - float(current)) / rate)
255                 (eta_mins, eta_secs) = divmod(eta, 60)
256                 if eta_mins > 99:
257                         return '--:--'
258                 return '%02d:%02d' % (eta_mins, eta_secs)
259
260         @staticmethod
261         def calc_speed(start, now, bytes):
262                 dif = now - start
263                 if bytes == 0 or dif < 0.001: # One millisecond
264                         return '%10s' % '---b/s'
265                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
266
267         @staticmethod
268         def best_block_size(elapsed_time, bytes):
269                 new_min = max(bytes / 2.0, 1.0)
270                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
271                 if elapsed_time < 0.001:
272                         return long(new_max)
273                 rate = bytes / elapsed_time
274                 if rate > new_max:
275                         return long(new_max)
276                 if rate < new_min:
277                         return long(new_min)
278                 return long(rate)
279
280         @staticmethod
281         def parse_bytes(bytestr):
282                 """Parse a string indicating a byte quantity into a long integer."""
283                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
284                 if matchobj is None:
285                         return None
286                 number = float(matchobj.group(1))
287                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
288                 return long(round(number * multiplier))
289
290         @staticmethod
291         def verify_url(url):
292                 """Verify a URL is valid and data could be downloaded. Return real data URL."""
293                 request = urllib2.Request(url, None, std_headers)
294                 data = urllib2.urlopen(request)
295                 data.read(1)
296                 url = data.geturl()
297                 data.close()
298                 return url
299
300         def add_info_extractor(self, ie):
301                 """Add an InfoExtractor object to the end of the list."""
302                 self._ies.append(ie)
303                 ie.set_downloader(self)
304         
305         def add_post_processor(self, pp):
306                 """Add a PostProcessor object to the end of the chain."""
307                 self._pps.append(pp)
308                 pp.set_downloader(self)
309         
310         def to_stdout(self, message, skip_eol=False, ignore_encoding_errors=False):
311                 """Print message to stdout if not in quiet mode."""
312                 try:
313                         if not self.params.get('quiet', False):
314                                 print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
315                         sys.stdout.flush()
316                 except (UnicodeEncodeError), err:
317                         if not ignore_encoding_errors:
318                                 raise
319         
320         def to_stderr(self, message):
321                 """Print message to stderr."""
322                 print >>sys.stderr, message.encode(preferredencoding())
323         
324         def fixed_template(self):
325                 """Checks if the output template is fixed."""
326                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
327
328         def trouble(self, message=None):
329                 """Determine action to take when a download problem appears.
330
331                 Depending on if the downloader has been configured to ignore
332                 download errors or not, this method may throw an exception or
333                 not when errors are found, after printing the message.
334                 """
335                 if message is not None:
336                         self.to_stderr(message)
337                 if not self.params.get('ignoreerrors', False):
338                         raise DownloadError(message)
339                 self._download_retcode = 1
340
341         def slow_down(self, start_time, byte_counter):
342                 """Sleep if the download speed is over the rate limit."""
343                 rate_limit = self.params.get('ratelimit', None)
344                 if rate_limit is None or byte_counter == 0:
345                         return
346                 now = time.time()
347                 elapsed = now - start_time
348                 if elapsed <= 0.0:
349                         return
350                 speed = float(byte_counter) / elapsed
351                 if speed > rate_limit:
352                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
353
354         def report_destination(self, filename):
355                 """Report destination filename."""
356                 self.to_stdout(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
357         
358         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
359                 """Report download progress."""
360                 if self.params.get('noprogress', False):
361                         return
362                 self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
363                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
364
365         def report_resuming_byte(self, resume_len):
366                 """Report attemtp to resume at given byte."""
367                 self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)
368         
369         def report_retry(self, count, retries):
370                 """Report retry in case of HTTP error 503"""
371                 self.to_stdout(u'[download] Got HTTP error 503. Retrying (attempt %d of %d)...' % (count, retries))
372         
373         def report_file_already_downloaded(self, file_name):
374                 """Report file has already been fully downloaded."""
375                 try:
376                         self.to_stdout(u'[download] %s has already been downloaded' % file_name)
377                 except (UnicodeEncodeError), err:
378                         self.to_stdout(u'[download] The file has already been downloaded')
379         
380         def report_unable_to_resume(self):
381                 """Report it was impossible to resume download."""
382                 self.to_stdout(u'[download] Unable to resume')
383         
384         def report_finish(self):
385                 """Report download finished."""
386                 if self.params.get('noprogress', False):
387                         self.to_stdout(u'[download] Download completed')
388                 else:
389                         self.to_stdout(u'')
390
391         def process_info(self, info_dict):
392                 """Process a single dictionary returned by an InfoExtractor."""
393                 # Do nothing else if in simulate mode
394                 if self.params.get('simulate', False):
395                         # Verify URL if it's an HTTP one
396                         if info_dict['url'].startswith('http'):
397                                 try:
398                                         self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
399                                 except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
400                                         raise UnavailableFormatError
401
402                         # Forced printings
403                         if self.params.get('forcetitle', False):
404                                 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
405                         if self.params.get('forceurl', False):
406                                 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
407                         if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
408                                 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
409                         if self.params.get('forcedescription', False) and 'description' in info_dict:
410                                 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
411
412                         return
413                         
414                 try:
415                         template_dict = dict(info_dict)
416                         template_dict['epoch'] = unicode(long(time.time()))
417                         template_dict['ord'] = unicode('%05d' % self._num_downloads)
418                         filename = self.params['outtmpl'] % template_dict
419                 except (ValueError, KeyError), err:
420                         self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
421                 if self.params.get('nooverwrites', False) and os.path.exists(filename):
422                         self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
423                         return
424
425                 try:
426                         self.pmkdir(filename)
427                 except (OSError, IOError), err:
428                         self.trouble('ERROR: unable to create directories: %s' % str(err))
429                         return
430
431                 try:
432                         success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
433                 except (OSError, IOError), err:
434                         raise UnavailableFormatError
435                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
436                         self.trouble('ERROR: unable to download video data: %s' % str(err))
437                         return
438                 except (ContentTooShortError, ), err:
439                         self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
440                         return
441
442                 if success:
443                         try:
444                                 self.post_process(filename, info_dict)
445                         except (PostProcessingError), err:
446                                 self.trouble('ERROR: postprocessing: %s' % str(err))
447                                 return
448
449         def download(self, url_list):
450                 """Download a given list of URLs."""
451                 if len(url_list) > 1 and self.fixed_template():
452                         raise SameFileError(self.params['outtmpl'])
453
454                 for url in url_list:
455                         suitable_found = False
456                         for ie in self._ies:
457                                 # Go to next InfoExtractor if not suitable
458                                 if not ie.suitable(url):
459                                         continue
460
461                                 # Suitable InfoExtractor found
462                                 suitable_found = True
463
464                                 # Extract information from URL and process it
465                                 ie.extract(url)
466
467                                 # Suitable InfoExtractor had been found; go to next URL
468                                 break
469
470                         if not suitable_found:
471                                 self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
472
473                 return self._download_retcode
474
475         def post_process(self, filename, ie_info):
476                 """Run the postprocessing chain on the given file."""
477                 info = dict(ie_info)
478                 info['filepath'] = filename
479                 for pp in self._pps:
480                         info = pp.run(info)
481                         if info is None:
482                                 break
483         
484         def _download_with_rtmpdump(self, filename, url, player_url):
485                 self.report_destination(filename)
486
487                 # Check for rtmpdump first
488                 try:
489                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
490                 except (OSError, IOError):
491                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
492                         return False
493
494                 # Download using rtmpdump. rtmpdump returns exit code 2 when
495                 # the connection was interrumpted and resuming appears to be
496                 # possible. This is part of rtmpdump's normal usage, AFAIK.
497                 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', filename]
498                 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
499                 while retval == 2 or retval == 1:
500                         prevsize = os.path.getsize(filename)
501                         self.to_stdout(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
502                         time.sleep(5.0) # This seems to be needed
503                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
504                         cursize = os.path.getsize(filename)
505                         if prevsize == cursize and retval == 1:
506                                 break
507                 if retval == 0:
508                         self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
509                         return True
510                 else:
511                         self.trouble('\nERROR: rtmpdump exited with code %d' % retval)
512                         return False
513
514         def _do_download(self, filename, url, player_url):
515                 # Attempt to download using rtmpdump
516                 if url.startswith('rtmp'):
517                         return self._download_with_rtmpdump(filename, url, player_url)
518
519                 stream = None
520                 open_mode = 'wb'
521                 basic_request = urllib2.Request(url, None, std_headers)
522                 request = urllib2.Request(url, None, std_headers)
523
524                 # Establish possible resume length
525                 if os.path.isfile(filename):
526                         resume_len = os.path.getsize(filename)
527                 else:
528                         resume_len = 0
529
530                 # Request parameters in case of being able to resume
531                 if self.params.get('continuedl', False) and resume_len != 0:
532                         self.report_resuming_byte(resume_len)
533                         request.add_header('Range','bytes=%d-' % resume_len)
534                         open_mode = 'ab'
535
536                 count = 0
537                 retries = self.params.get('retries', 0)
538                 while True:
539                         # Establish connection
540                         try:
541                                 data = urllib2.urlopen(request)
542                                 break
543                         except (urllib2.HTTPError, ), err:
544                                 if err.code == 503:
545                                         # Retry in case of HTTP error 503
546                                         count += 1
547                                         if count <= retries:
548                                                 self.report_retry(count, retries)
549                                                 continue
550                                 if err.code != 416: #  416 is 'Requested range not satisfiable'
551                                         raise
552                                 # Unable to resume
553                                 data = urllib2.urlopen(basic_request)
554                                 content_length = data.info()['Content-Length']
555
556                                 if content_length is not None and long(content_length) == resume_len:
557                                         # Because the file had already been fully downloaded
558                                         self.report_file_already_downloaded(filename)
559                                         return True
560                                 else:
561                                         # Because the server didn't let us
562                                         self.report_unable_to_resume()
563                                         open_mode = 'wb'
564
565                 data_len = data.info().get('Content-length', None)
566                 data_len_str = self.format_bytes(data_len)
567                 byte_counter = 0
568                 block_size = 1024
569                 start = time.time()
570                 while True:
571                         # Download and write
572                         before = time.time()
573                         data_block = data.read(block_size)
574                         after = time.time()
575                         data_block_len = len(data_block)
576                         if data_block_len == 0:
577                                 break
578                         byte_counter += data_block_len
579
580                         # Open file just in time
581                         if stream is None:
582                                 try:
583                                         (stream, filename) = sanitize_open(filename, open_mode)
584                                         self.report_destination(filename)
585                                         self._num_downloads += 1
586                                 except (OSError, IOError), err:
587                                         self.trouble('ERROR: unable to open for writing: %s' % str(err))
588                                         return False
589                         try:
590                                 stream.write(data_block)
591                         except (IOError, OSError), err:
592                                 self.trouble('\nERROR: unable to write data: %s' % str(err))
593                         block_size = self.best_block_size(after - before, data_block_len)
594
595                         # Progress message
596                         percent_str = self.calc_percent(byte_counter, data_len)
597                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
598                         speed_str = self.calc_speed(start, time.time(), byte_counter)
599                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
600
601                         # Apply rate limit
602                         self.slow_down(start, byte_counter)
603
604                 self.report_finish()
605                 if data_len is not None and str(byte_counter) != data_len:
606                         raise ContentTooShortError(byte_counter, long(data_len))
607                 return True
608
class InfoExtractor(object):
	"""Information Extractor class.

	Information extractors are the classes that, given a URL, extract
	information from the video (or videos) the URL refers to. This
	information includes the real video URL, the video title and simplified
	title, author and others. The information is stored in a dictionary
	which is then passed to the FileDownloader. The FileDownloader
	processes this information possibly downloading the video to the file
	system, among other possible outcomes. The dictionaries must include
	the following fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	The following fields are optional. Their primary purpose is to allow
	youtube-dl to serve as the backend for a video search function, such
	as the one in youtube2mp3.  They are only used when their respective
	forced printing functions are called:

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Subclasses of this one should re-define the _real_initialize() and
	_real_extract() methods, as well as the suitable() static method.
	Probably, they should also be instantiated and added to the main
	downloader.
	"""

	_ready = False		# True once _real_initialize() has run
	_downloader = None	# FileDownloader in charge of this IE

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	@staticmethod
	def suitable(url):
		"""Receives a URL and returns True if suitable for this IE."""
		return False

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def initialize(self):
		"""Initializes an instance (authentication, etc)."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
679
680 class YoutubeIE(InfoExtractor):
681         """Information extractor for youtube.com."""
682
683         _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?[\?#](?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
684         _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
685         _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
686         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
687         _NETRC_MACHINE = 'youtube'
688         # Listed in order of priority for the -b option
689         _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13', None]
690         _video_extensions = {
691                 '13': '3gp',
692                 '17': 'mp4',
693                 '18': 'mp4',
694                 '22': 'mp4',
695                 '37': 'mp4',
696                 '38': 'video',
697                 '43': 'webm',
698                 '45': 'webm',
699         }
700
701         @staticmethod
702         def suitable(url):
703                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
704
705         def report_lang(self):
706                 """Report attempt to set language."""
707                 self._downloader.to_stdout(u'[youtube] Setting language')
708
709         def report_login(self):
710                 """Report attempt to log in."""
711                 self._downloader.to_stdout(u'[youtube] Logging in')
712         
713         def report_age_confirmation(self):
714                 """Report attempt to confirm age."""
715                 self._downloader.to_stdout(u'[youtube] Confirming age')
716         
717         def report_video_webpage_download(self, video_id):
718                 """Report attempt to download video webpage."""
719                 self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)
720         
721         def report_video_info_webpage_download(self, video_id):
722                 """Report attempt to download video info webpage."""
723                 self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)
724         
725         def report_information_extraction(self, video_id):
726                 """Report attempt to extract video information."""
727                 self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
728         
729         def report_unavailable_format(self, video_id, format):
730                 """Report extracted video URL."""
731                 self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))
732         
733         def report_rtmp_download(self):
734                 """Indicate the download will use the RTMP protocol."""
735                 self._downloader.to_stdout(u'[youtube] RTMP download detected')
736         
737         def _real_initialize(self):
738                 if self._downloader is None:
739                         return
740
741                 username = None
742                 password = None
743                 downloader_params = self._downloader.params
744
745                 # Attempt to use provided username and password or .netrc data
746                 if downloader_params.get('username', None) is not None:
747                         username = downloader_params['username']
748                         password = downloader_params['password']
749                 elif downloader_params.get('usenetrc', False):
750                         try:
751                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
752                                 if info is not None:
753                                         username = info[0]
754                                         password = info[2]
755                                 else:
756                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
757                         except (IOError, netrc.NetrcParseError), err:
758                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
759                                 return
760
761                 # Set language
762                 request = urllib2.Request(self._LANG_URL, None, std_headers)
763                 try:
764                         self.report_lang()
765                         urllib2.urlopen(request).read()
766                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
767                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
768                         return
769
770                 # No authentication to be performed
771                 if username is None:
772                         return
773
774                 # Log in
775                 login_form = {
776                                 'current_form': 'loginForm',
777                                 'next':         '/',
778                                 'action_login': 'Log In',
779                                 'username':     username,
780                                 'password':     password,
781                                 }
782                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
783                 try:
784                         self.report_login()
785                         login_results = urllib2.urlopen(request).read()
786                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
787                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
788                                 return
789                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
790                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
791                         return
792         
793                 # Confirm age
794                 age_form = {
795                                 'next_url':             '/',
796                                 'action_confirm':       'Confirm',
797                                 }
798                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
799                 try:
800                         self.report_age_confirmation()
801                         age_results = urllib2.urlopen(request).read()
802                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
803                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
804                         return
805
806         def _real_extract(self, url):
807                 # Extract video id from URL
808                 mobj = re.match(self._VALID_URL, url)
809                 if mobj is None:
810                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
811                         return
812                 video_id = mobj.group(2)
813
814                 # Downloader parameters
815                 best_quality = False
816                 all_formats = False
817                 format_param = None
818                 quality_index = 0
819                 if self._downloader is not None:
820                         params = self._downloader.params
821                         format_param = params.get('format', None)
822                         if format_param == '0':
823                                 format_limit = params.get('format_limit', None)
824                                 if format_limit is not None:
825                                         try:
826                                                 # Start at a different format if the user has limited the maximum quality
827                                                 quality_index = self._available_formats.index(format_limit)
828                                         except ValueError:
829                                                 pass
830                                 format_param = self._available_formats[quality_index]
831                                 best_quality = True
832                         elif format_param == '-1':
833                                 format_param = self._available_formats[quality_index]
834                                 all_formats = True
835
836                 while True:
837                         # Extension
838                         video_extension = self._video_extensions.get(format_param, 'flv')
839
840                         # Get video webpage
841                         self.report_video_webpage_download(video_id)
842                         request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id, None, std_headers)
843                         try:
844                                 video_webpage = urllib2.urlopen(request).read()
845                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
846                                 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
847                                 return
848
849                         # Attempt to extract SWF player URL
850                         mobj = re.search(r'swfConfig.*"(http://.*?watch-.*?\.swf)"', video_webpage)
851                         if mobj is not None:
852                                 player_url = mobj.group(1)
853                         else:
854                                 player_url = None
855
856                         # Get video info
857                         self.report_video_info_webpage_download(video_id)
858                         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
859                                 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
860                                                    % (video_id, el_type))
861                                 request = urllib2.Request(video_info_url, None, std_headers)
862                                 try:
863                                         video_info_webpage = urllib2.urlopen(request).read()
864                                         video_info = parse_qs(video_info_webpage)
865                                         if 'token' in video_info:
866                                                 break
867                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
868                                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
869                                         return
870                         self.report_information_extraction(video_id)
871
872                         # "t" param
873                         if 'token' not in video_info:
874                                 # Attempt to see if YouTube has issued an error message
875                                 if 'reason' not in video_info:
876                                         self._downloader.trouble(u'ERROR: unable to extract "t" parameter for unknown reason')
877                                         stream = open('reportme-ydl-%s.dat' % time.time(), 'wb')
878                                         stream.write(video_info_webpage)
879                                         stream.close()
880                                 else:
881                                         reason = urllib.unquote_plus(video_info['reason'][0])
882                                         self._downloader.trouble(u'ERROR: YouTube said: %s' % reason.decode('utf-8'))
883                                 return
884                         token = urllib.unquote_plus(video_info['token'][0])
885                         video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=detailpage&ps=default&gl=US&hl=en' % (video_id, token)
886                         if format_param is not None:
887                                 video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
888
889                         # Check possible RTMP download
890                         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
891                                 self.report_rtmp_download()
892                                 video_real_url = video_info['conn'][0]
893
894                         # uploader
895                         if 'author' not in video_info:
896                                 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
897                                 return
898                         video_uploader = urllib.unquote_plus(video_info['author'][0])
899
900                         # title
901                         if 'title' not in video_info:
902                                 self._downloader.trouble(u'ERROR: unable to extract video title')
903                                 return
904                         video_title = urllib.unquote_plus(video_info['title'][0])
905                         video_title = video_title.decode('utf-8')
906                         video_title = sanitize_title(video_title)
907
908                         # simplified title
909                         simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
910                         simple_title = simple_title.strip(ur'_')
911
912                         # thumbnail image
913                         if 'thumbnail_url' not in video_info:
914                                 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
915                                 video_thumbnail = ''
916                         else:   # don't panic if we can't find it
917                                 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
918
919                         # description
920                         video_description = 'No description available.'
921                         if self._downloader.params.get('forcedescription', False):
922                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
923                                 if mobj is not None:
924                                         video_description = mobj.group(1)
925
926                         try:
927                                 # Process video information
928                                 self._downloader.process_info({
929                                         'id':           video_id.decode('utf-8'),
930                                         'url':          video_real_url.decode('utf-8'),
931                                         'uploader':     video_uploader.decode('utf-8'),
932                                         'title':        video_title,
933                                         'stitle':       simple_title,
934                                         'ext':          video_extension.decode('utf-8'),
935                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
936                                         'thumbnail':    video_thumbnail.decode('utf-8'),
937                                         'description':  video_description.decode('utf-8'),
938                                         'player_url':   player_url,
939                                 })
940
941                                 if all_formats:
942                                         quality_index += 1
943                                         if quality_index == len(self._available_formats):
944                                                 # None left to get
945                                                 return
946                                         else:
947                                                 format_param = self._available_formats[quality_index]
948                                                 continue
949                                 return
950
951                         except UnavailableFormatError, err:
952                                 if best_quality or all_formats:
953                                         quality_index += 1
954                                         if quality_index == len(self._available_formats):
955                                                 # I don't ever expect this to happen
956                                                 if not all_formats:
957                                                         self._downloader.trouble(u'ERROR: no known formats available for video')
958                                                 return
959                                         else:
960                                                 self.report_unavailable_format(video_id, format_param)
961                                                 format_param = self._available_formats[quality_index]
962                                                 continue
963                                 else: 
964                                         self._downloader.trouble('ERROR: format not available for video')
965                                         return
966
967
968 class MetacafeIE(InfoExtractor):
969         """Information Extractor for metacafe.com."""
970
971         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
972         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
973         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
974         _youtube_ie = None
975
976         def __init__(self, youtube_ie, downloader=None):
977                 InfoExtractor.__init__(self, downloader)
978                 self._youtube_ie = youtube_ie
979
980         @staticmethod
981         def suitable(url):
982                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
983
984         def report_disclaimer(self):
985                 """Report disclaimer retrieval."""
986                 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
987
988         def report_age_confirmation(self):
989                 """Report attempt to confirm age."""
990                 self._downloader.to_stdout(u'[metacafe] Confirming age')
991         
992         def report_download_webpage(self, video_id):
993                 """Report webpage download."""
994                 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
995         
996         def report_extraction(self, video_id):
997                 """Report information extraction."""
998                 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
999
1000         def _real_initialize(self):
1001                 # Retrieve disclaimer
1002                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
1003                 try:
1004                         self.report_disclaimer()
1005                         disclaimer = urllib2.urlopen(request).read()
1006                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1007                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1008                         return
1009
1010                 # Confirm age
1011                 disclaimer_form = {
1012                         'filters': '0',
1013                         'submit': "Continue - I'm over 18",
1014                         }
1015                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
1016                 try:
1017                         self.report_age_confirmation()
1018                         disclaimer = urllib2.urlopen(request).read()
1019                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1020                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1021                         return
1022         
1023         def _real_extract(self, url):
1024                 # Extract id and simplified title from URL
1025                 mobj = re.match(self._VALID_URL, url)
1026                 if mobj is None:
1027                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1028                         return
1029
1030                 video_id = mobj.group(1)
1031
1032                 # Check if video comes from YouTube
1033                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1034                 if mobj2 is not None:
1035                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1036                         return
1037
1038                 simple_title = mobj.group(2).decode('utf-8')
1039                 video_extension = 'flv'
1040
1041                 # Retrieve video webpage to extract further information
1042                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1043                 try:
1044                         self.report_download_webpage(video_id)
1045                         webpage = urllib2.urlopen(request).read()
1046                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1047                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1048                         return
1049
1050                 # Extract URL, uploader and title from webpage
1051                 self.report_extraction(video_id)
1052                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1053                 if mobj is None:
1054                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1055                         return
1056                 mediaURL = urllib.unquote(mobj.group(1))
1057
1058                 #mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1059                 #if mobj is None:
1060                 #       self._downloader.trouble(u'ERROR: unable to extract gdaKey')
1061                 #       return
1062                 #gdaKey = mobj.group(1)
1063                 #
1064                 #video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1065
1066                 video_url = mediaURL
1067
1068                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1069                 if mobj is None:
1070                         self._downloader.trouble(u'ERROR: unable to extract title')
1071                         return
1072                 video_title = mobj.group(1).decode('utf-8')
1073                 video_title = sanitize_title(video_title)
1074
1075                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1076                 if mobj is None:
1077                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1078                         return
1079                 video_uploader = mobj.group(1)
1080
1081                 try:
1082                         # Process video information
1083                         self._downloader.process_info({
1084                                 'id':           video_id.decode('utf-8'),
1085                                 'url':          video_url.decode('utf-8'),
1086                                 'uploader':     video_uploader.decode('utf-8'),
1087                                 'title':        video_title,
1088                                 'stitle':       simple_title,
1089                                 'ext':          video_extension.decode('utf-8'),
1090                                 'format':       u'NA',
1091                                 'player_url':   None,
1092                         })
1093                 except UnavailableFormatError:
1094                         self._downloader.trouble(u'ERROR: format not available for video')
1095
1096
1097 class DailymotionIE(InfoExtractor):
1098         """Information Extractor for Dailymotion"""
1099
1100         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1101
1102         def __init__(self, downloader=None):
1103                 InfoExtractor.__init__(self, downloader)
1104
1105         @staticmethod
1106         def suitable(url):
1107                 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1108
1109         def report_download_webpage(self, video_id):
1110                 """Report webpage download."""
1111                 self._downloader.to_stdout(u'[dailymotion] %s: Downloading webpage' % video_id)
1112         
1113         def report_extraction(self, video_id):
1114                 """Report information extraction."""
1115                 self._downloader.to_stdout(u'[dailymotion] %s: Extracting information' % video_id)
1116
1117         def _real_initialize(self):
1118                 return
1119
1120         def _real_extract(self, url):
1121                 # Extract id and simplified title from URL
1122                 mobj = re.match(self._VALID_URL, url)
1123                 if mobj is None:
1124                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1125                         return
1126
1127                 video_id = mobj.group(1)
1128
1129                 simple_title = mobj.group(2).decode('utf-8')
1130                 video_extension = 'flv'
1131
1132                 # Retrieve video webpage to extract further information
1133                 request = urllib2.Request(url)
1134                 try:
1135                         self.report_download_webpage(video_id)
1136                         webpage = urllib2.urlopen(request).read()
1137                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1138                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1139                         return
1140
1141                 # Extract URL, uploader and title from webpage
1142                 self.report_extraction(video_id)
1143                 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1144                 if mobj is None:
1145                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1146                         return
1147                 mediaURL = urllib.unquote(mobj.group(1))
1148
1149                 # if needed add http://www.dailymotion.com/ if relative URL
1150
1151                 video_url = mediaURL
1152
1153                 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1154                 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1155                 if mobj is None:
1156                         self._downloader.trouble(u'ERROR: unable to extract title')
1157                         return
1158                 video_title = mobj.group(1).decode('utf-8')
1159                 video_title = sanitize_title(video_title)
1160
1161                 mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a></div>', webpage)
1162                 if mobj is None:
1163                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1164                         return
1165                 video_uploader = mobj.group(1)
1166
1167                 try:
1168                         # Process video information
1169                         self._downloader.process_info({
1170                                 'id':           video_id.decode('utf-8'),
1171                                 'url':          video_url.decode('utf-8'),
1172                                 'uploader':     video_uploader.decode('utf-8'),
1173                                 'title':        video_title,
1174                                 'stitle':       simple_title,
1175                                 'ext':          video_extension.decode('utf-8'),
1176                                 'format':       u'NA',
1177                                 'player_url':   None,
1178                         })
1179                 except UnavailableFormatError:
1180                         self._downloader.trouble(u'ERROR: format not available for video')
1181
1182 class GoogleIE(InfoExtractor):
1183         """Information extractor for video.google.com."""
1184
1185         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1186
1187         def __init__(self, downloader=None):
1188                 InfoExtractor.__init__(self, downloader)
1189
1190         @staticmethod
1191         def suitable(url):
1192                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1193
1194         def report_download_webpage(self, video_id):
1195                 """Report webpage download."""
1196                 self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)
1197
1198         def report_extraction(self, video_id):
1199                 """Report information extraction."""
1200                 self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)
1201
1202         def _real_initialize(self):
1203                 return
1204
1205         def _real_extract(self, url):
1206                 # Extract id from URL
1207                 mobj = re.match(self._VALID_URL, url)
1208                 if mobj is None:
1209                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1210                         return
1211
1212                 video_id = mobj.group(1)
1213
1214                 video_extension = 'mp4'
1215
1216                 # Retrieve video webpage to extract further information
1217                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1218                 try:
1219                         self.report_download_webpage(video_id)
1220                         webpage = urllib2.urlopen(request).read()
1221                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1222                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1223                         return
1224
1225                 # Extract URL, uploader, and title from webpage
1226                 self.report_extraction(video_id)
1227                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1228                 if mobj is None:
1229                         video_extension = 'flv'
1230                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1231                 if mobj is None:
1232                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1233                         return
1234                 mediaURL = urllib.unquote(mobj.group(1))
1235                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1236                 mediaURL = mediaURL.replace('\\x26', '\x26')
1237
1238                 video_url = mediaURL
1239
1240                 mobj = re.search(r'<title>(.*)</title>', webpage)
1241                 if mobj is None:
1242                         self._downloader.trouble(u'ERROR: unable to extract title')
1243                         return
1244                 video_title = mobj.group(1).decode('utf-8')
1245                 video_title = sanitize_title(video_title)
1246                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1247
1248                 # Extract video description
1249                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1250                 if mobj is None:
1251                         self._downloader.trouble(u'ERROR: unable to extract video description')
1252                         return
1253                 video_description = mobj.group(1).decode('utf-8')
1254                 if not video_description:
1255                         video_description = 'No description available.'
1256
1257                 # Extract video thumbnail
1258                 if self._downloader.params.get('forcethumbnail', False):
1259                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1260                         try:
1261                                 webpage = urllib2.urlopen(request).read()
1262                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1263                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1264                                 return
1265                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1266                         if mobj is None:
1267                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1268                                 return
1269                         video_thumbnail = mobj.group(1)
1270                 else:   # we need something to pass to process_info
1271                         video_thumbnail = ''
1272
1273
1274                 try:
1275                         # Process video information
1276                         self._downloader.process_info({
1277                                 'id':           video_id.decode('utf-8'),
1278                                 'url':          video_url.decode('utf-8'),
1279                                 'uploader':     u'NA',
1280                                 'title':        video_title,
1281                                 'stitle':       simple_title,
1282                                 'ext':          video_extension.decode('utf-8'),
1283                                 'format':       u'NA',
1284                                 'player_url':   None,
1285                         })
1286                 except UnavailableFormatError:
1287                         self._downloader.trouble(u'ERROR: format not available for video')
1288
1289
1290 class PhotobucketIE(InfoExtractor):
1291         """Information extractor for photobucket.com."""
1292
1293         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1294
1295         def __init__(self, downloader=None):
1296                 InfoExtractor.__init__(self, downloader)
1297
1298         @staticmethod
1299         def suitable(url):
1300                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1301
1302         def report_download_webpage(self, video_id):
1303                 """Report webpage download."""
1304                 self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)
1305
1306         def report_extraction(self, video_id):
1307                 """Report information extraction."""
1308                 self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)
1309
1310         def _real_initialize(self):
1311                 return
1312
1313         def _real_extract(self, url):
1314                 # Extract id from URL
1315                 mobj = re.match(self._VALID_URL, url)
1316                 if mobj is None:
1317                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1318                         return
1319
1320                 video_id = mobj.group(1)
1321
1322                 video_extension = 'flv'
1323
1324                 # Retrieve video webpage to extract further information
1325                 request = urllib2.Request(url)
1326                 try:
1327                         self.report_download_webpage(video_id)
1328                         webpage = urllib2.urlopen(request).read()
1329                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1330                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1331                         return
1332
1333                 # Extract URL, uploader, and title from webpage
1334                 self.report_extraction(video_id)
1335                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1336                 if mobj is None:
1337                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1338                         return
1339                 mediaURL = urllib.unquote(mobj.group(1))
1340
1341                 video_url = mediaURL
1342
1343                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1344                 if mobj is None:
1345                         self._downloader.trouble(u'ERROR: unable to extract title')
1346                         return
1347                 video_title = mobj.group(1).decode('utf-8')
1348                 video_title = sanitize_title(video_title)
1349                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1350
1351                 video_uploader = mobj.group(2).decode('utf-8')
1352
1353                 try:
1354                         # Process video information
1355                         self._downloader.process_info({
1356                                 'id':           video_id.decode('utf-8'),
1357                                 'url':          video_url.decode('utf-8'),
1358                                 'uploader':     video_uploader,
1359                                 'title':        video_title,
1360                                 'stitle':       simple_title,
1361                                 'ext':          video_extension.decode('utf-8'),
1362                                 'format':       u'NA',
1363                                 'player_url':   None,
1364                         })
1365                 except UnavailableFormatError:
1366                         self._downloader.trouble(u'ERROR: format not available for video')
1367
1368
1369 class YahooIE(InfoExtractor):
1370         """Information extractor for video.yahoo.com."""
1371
1372         # _VALID_URL matches all Yahoo! Video URLs
1373         # _VPAGE_URL matches only the extractable '/watch/' URLs
1374         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1375         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1376
1377         def __init__(self, downloader=None):
1378                 InfoExtractor.__init__(self, downloader)
1379
1380         @staticmethod
1381         def suitable(url):
1382                 return (re.match(YahooIE._VALID_URL, url) is not None)
1383
1384         def report_download_webpage(self, video_id):
1385                 """Report webpage download."""
1386                 self._downloader.to_stdout(u'[video.yahoo] %s: Downloading webpage' % video_id)
1387
1388         def report_extraction(self, video_id):
1389                 """Report information extraction."""
1390                 self._downloader.to_stdout(u'[video.yahoo] %s: Extracting information' % video_id)
1391
1392         def _real_initialize(self):
1393                 return
1394
1395         def _real_extract(self, url):
1396                 # Extract ID from URL
1397                 mobj = re.match(self._VALID_URL, url)
1398                 if mobj is None:
1399                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1400                         return
1401
1402                 video_id = mobj.group(2)
1403                 video_extension = 'flv'
1404
1405                 # Rewrite valid but non-extractable URLs as
1406                 # extractable English language /watch/ URLs
1407                 if re.match(self._VPAGE_URL, url) is None:
1408                         request = urllib2.Request(url)
1409                         try:
1410                                 webpage = urllib2.urlopen(request).read()
1411                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1412                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1413                                 return
1414
1415                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1416                         if mobj is None:
1417                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1418                                 return
1419                         yahoo_id = mobj.group(1)
1420
1421                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1422                         if mobj is None:
1423                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1424                                 return
1425                         yahoo_vid = mobj.group(1)
1426
1427                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1428                         return self._real_extract(url)
1429
1430                 # Retrieve video webpage to extract further information
1431                 request = urllib2.Request(url)
1432                 try:
1433                         self.report_download_webpage(video_id)
1434                         webpage = urllib2.urlopen(request).read()
1435                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1436                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1437                         return
1438
1439                 # Extract uploader and title from webpage
1440                 self.report_extraction(video_id)
1441                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1442                 if mobj is None:
1443                         self._downloader.trouble(u'ERROR: unable to extract video title')
1444                         return
1445                 video_title = mobj.group(1).decode('utf-8')
1446                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1447
1448                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1449                 if mobj is None:
1450                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1451                         return
1452                 video_uploader = mobj.group(1).decode('utf-8')
1453
1454                 # Extract video thumbnail
1455                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1456                 if mobj is None:
1457                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1458                         return
1459                 video_thumbnail = mobj.group(1).decode('utf-8')
1460
1461                 # Extract video description
1462                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1463                 if mobj is None:
1464                         self._downloader.trouble(u'ERROR: unable to extract video description')
1465                         return
1466                 video_description = mobj.group(1).decode('utf-8')
1467                 if not video_description: video_description = 'No description available.'
1468
1469                 # Extract video height and width
1470                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1471                 if mobj is None:
1472                         self._downloader.trouble(u'ERROR: unable to extract video height')
1473                         return
1474                 yv_video_height = mobj.group(1)
1475
1476                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1477                 if mobj is None:
1478                         self._downloader.trouble(u'ERROR: unable to extract video width')
1479                         return
1480                 yv_video_width = mobj.group(1)
1481
1482                 # Retrieve video playlist to extract media URL
1483                 # I'm not completely sure what all these options are, but we
1484                 # seem to need most of them, otherwise the server sends a 401.
1485                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1486                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1487                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1488                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1489                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1490                 try:
1491                         self.report_download_webpage(video_id)
1492                         webpage = urllib2.urlopen(request).read()
1493                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1494                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1495                         return
1496
1497                 # Extract media URL from playlist XML
1498                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1499                 if mobj is None:
1500                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1501                         return
1502                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1503                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1504
1505                 try:
1506                         # Process video information
1507                         self._downloader.process_info({
1508                                 'id':           video_id.decode('utf-8'),
1509                                 'url':          video_url,
1510                                 'uploader':     video_uploader,
1511                                 'title':        video_title,
1512                                 'stitle':       simple_title,
1513                                 'ext':          video_extension.decode('utf-8'),
1514                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1515                                 'description':  video_description,
1516                                 'thumbnail':    video_thumbnail,
1517                                 'description':  video_description,
1518                                 'player_url':   None,
1519                         })
1520                 except UnavailableFormatError:
1521                         self._downloader.trouble(u'ERROR: format not available for video')
1522
1523
1524 class GenericIE(InfoExtractor):
1525         """Generic last-resort information extractor."""
1526
1527         def __init__(self, downloader=None):
1528                 InfoExtractor.__init__(self, downloader)
1529
1530         @staticmethod
1531         def suitable(url):
1532                 return True
1533
1534         def report_download_webpage(self, video_id):
1535                 """Report webpage download."""
1536                 self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
1537                 self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)
1538
1539         def report_extraction(self, video_id):
1540                 """Report information extraction."""
1541                 self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)
1542
1543         def _real_initialize(self):
1544                 return
1545
1546         def _real_extract(self, url):
1547                 video_id = url.split('/')[-1]
1548                 request = urllib2.Request(url)
1549                 try:
1550                         self.report_download_webpage(video_id)
1551                         webpage = urllib2.urlopen(request).read()
1552                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1553                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1554                         return
1555                 except ValueError, err:
1556                         # since this is the last-resort InfoExtractor, if
1557                         # this error is thrown, it'll be thrown here
1558                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1559                         return
1560
1561                 # Start with something easy: JW Player in SWFObject
1562                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1563                 if mobj is None:
1564                         # Broaden the search a little bit
1565                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1566                 if mobj is None:
1567                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1568                         return
1569
1570                 # It's possible that one of the regexes
1571                 # matched, but returned an empty group:
1572                 if mobj.group(1) is None:
1573                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1574                         return
1575
1576                 video_url = urllib.unquote(mobj.group(1))
1577                 video_id  = os.path.basename(video_url)
1578
1579                 # here's a fun little line of code for you:
1580                 video_extension = os.path.splitext(video_id)[1][1:]
1581                 video_id        = os.path.splitext(video_id)[0]
1582
1583                 # it's tempting to parse this further, but you would
1584                 # have to take into account all the variations like
1585                 #   Video Title - Site Name
1586                 #   Site Name | Video Title
1587                 #   Video Title - Tagline | Site Name
1588                 # and so on and so forth; it's just not practical
1589                 mobj = re.search(r'<title>(.*)</title>', webpage)
1590                 if mobj is None:
1591                         self._downloader.trouble(u'ERROR: unable to extract title')
1592                         return
1593                 video_title = mobj.group(1).decode('utf-8')
1594                 video_title = sanitize_title(video_title)
1595                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1596
1597                 # video uploader is domain name
1598                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1599                 if mobj is None:
1600                         self._downloader.trouble(u'ERROR: unable to extract title')
1601                         return
1602                 video_uploader = mobj.group(1).decode('utf-8')
1603
1604                 try:
1605                         # Process video information
1606                         self._downloader.process_info({
1607                                 'id':           video_id.decode('utf-8'),
1608                                 'url':          video_url.decode('utf-8'),
1609                                 'uploader':     video_uploader,
1610                                 'title':        video_title,
1611                                 'stitle':       simple_title,
1612                                 'ext':          video_extension.decode('utf-8'),
1613                                 'format':       u'NA',
1614                                 'player_url':   None,
1615                         })
1616                 except UnavailableFormatError:
1617                         self._downloader.trouble(u'ERROR: format not available for video')
1618
1619
1620 class YoutubeSearchIE(InfoExtractor):
1621         """Information Extractor for YouTube search queries."""
1622         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1623         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1624         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1625         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1626         _youtube_ie = None
1627         _max_youtube_results = 1000
1628
1629         def __init__(self, youtube_ie, downloader=None):
1630                 InfoExtractor.__init__(self, downloader)
1631                 self._youtube_ie = youtube_ie
1632         
1633         @staticmethod
1634         def suitable(url):
1635                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1636
1637         def report_download_page(self, query, pagenum):
1638                 """Report attempt to download playlist page with given number."""
1639                 query = query.decode(preferredencoding())
1640                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1641
1642         def _real_initialize(self):
1643                 self._youtube_ie.initialize()
1644         
1645         def _real_extract(self, query):
1646                 mobj = re.match(self._VALID_QUERY, query)
1647                 if mobj is None:
1648                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1649                         return
1650
1651                 prefix, query = query.split(':')
1652                 prefix = prefix[8:]
1653                 query  = query.encode('utf-8')
1654                 if prefix == '':
1655                         self._download_n_results(query, 1)
1656                         return
1657                 elif prefix == 'all':
1658                         self._download_n_results(query, self._max_youtube_results)
1659                         return
1660                 else:
1661                         try:
1662                                 n = long(prefix)
1663                                 if n <= 0:
1664                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1665                                         return
1666                                 elif n > self._max_youtube_results:
1667                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
1668                                         n = self._max_youtube_results
1669                                 self._download_n_results(query, n)
1670                                 return
1671                         except ValueError: # parsing prefix as integer fails
1672                                 self._download_n_results(query, 1)
1673                                 return
1674
1675         def _download_n_results(self, query, n):
1676                 """Downloads a specified number of results for a query"""
1677
1678                 video_ids = []
1679                 already_seen = set()
1680                 pagenum = 1
1681
1682                 while True:
1683                         self.report_download_page(query, pagenum)
1684                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1685                         request = urllib2.Request(result_url, None, std_headers)
1686                         try:
1687                                 page = urllib2.urlopen(request).read()
1688                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1689                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1690                                 return
1691
1692                         # Extract video identifiers
1693                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1694                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1695                                 if video_id not in already_seen:
1696                                         video_ids.append(video_id)
1697                                         already_seen.add(video_id)
1698                                         if len(video_ids) == n:
1699                                                 # Specified n videos reached
1700                                                 for id in video_ids:
1701                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1702                                                 return
1703
1704                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1705                                 for id in video_ids:
1706                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1707                                 return
1708
1709                         pagenum = pagenum + 1
1710
1711 class GoogleSearchIE(InfoExtractor):
1712         """Information Extractor for Google Video search queries."""
1713         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1714         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1715         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1716         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1717         _google_ie = None
1718         _max_google_results = 1000
1719
1720         def __init__(self, google_ie, downloader=None):
1721                 InfoExtractor.__init__(self, downloader)
1722                 self._google_ie = google_ie
1723         
1724         @staticmethod
1725         def suitable(url):
1726                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1727
1728         def report_download_page(self, query, pagenum):
1729                 """Report attempt to download playlist page with given number."""
1730                 query = query.decode(preferredencoding())
1731                 self._downloader.to_stdout(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1732
1733         def _real_initialize(self):
1734                 self._google_ie.initialize()
1735         
1736         def _real_extract(self, query):
1737                 mobj = re.match(self._VALID_QUERY, query)
1738                 if mobj is None:
1739                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1740                         return
1741
1742                 prefix, query = query.split(':')
1743                 prefix = prefix[8:]
1744                 query  = query.encode('utf-8')
1745                 if prefix == '':
1746                         self._download_n_results(query, 1)
1747                         return
1748                 elif prefix == 'all':
1749                         self._download_n_results(query, self._max_google_results)
1750                         return
1751                 else:
1752                         try:
1753                                 n = long(prefix)
1754                                 if n <= 0:
1755                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1756                                         return
1757                                 elif n > self._max_google_results:
1758                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
1759                                         n = self._max_google_results
1760                                 self._download_n_results(query, n)
1761                                 return
1762                         except ValueError: # parsing prefix as integer fails
1763                                 self._download_n_results(query, 1)
1764                                 return
1765
1766         def _download_n_results(self, query, n):
1767                 """Downloads a specified number of results for a query"""
1768
1769                 video_ids = []
1770                 already_seen = set()
1771                 pagenum = 1
1772
1773                 while True:
1774                         self.report_download_page(query, pagenum)
1775                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1776                         request = urllib2.Request(result_url, None, std_headers)
1777                         try:
1778                                 page = urllib2.urlopen(request).read()
1779                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1780                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1781                                 return
1782
1783                         # Extract video identifiers
1784                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1785                                 video_id = mobj.group(1)
1786                                 if video_id not in already_seen:
1787                                         video_ids.append(video_id)
1788                                         already_seen.add(video_id)
1789                                         if len(video_ids) == n:
1790                                                 # Specified n videos reached
1791                                                 for id in video_ids:
1792                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1793                                                 return
1794
1795                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1796                                 for id in video_ids:
1797                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1798                                 return
1799
1800                         pagenum = pagenum + 1
1801
1802 class YahooSearchIE(InfoExtractor):
1803         """Information Extractor for Yahoo! Video search queries."""
1804         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1805         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1806         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1807         _MORE_PAGES_INDICATOR = r'\s*Next'
1808         _yahoo_ie = None
1809         _max_yahoo_results = 1000
1810
1811         def __init__(self, yahoo_ie, downloader=None):
1812                 InfoExtractor.__init__(self, downloader)
1813                 self._yahoo_ie = yahoo_ie
1814         
1815         @staticmethod
1816         def suitable(url):
1817                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1818
1819         def report_download_page(self, query, pagenum):
1820                 """Report attempt to download playlist page with given number."""
1821                 query = query.decode(preferredencoding())
1822                 self._downloader.to_stdout(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1823
1824         def _real_initialize(self):
1825                 self._yahoo_ie.initialize()
1826         
1827         def _real_extract(self, query):
1828                 mobj = re.match(self._VALID_QUERY, query)
1829                 if mobj is None:
1830                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1831                         return
1832
1833                 prefix, query = query.split(':')
1834                 prefix = prefix[8:]
1835                 query  = query.encode('utf-8')
1836                 if prefix == '':
1837                         self._download_n_results(query, 1)
1838                         return
1839                 elif prefix == 'all':
1840                         self._download_n_results(query, self._max_yahoo_results)
1841                         return
1842                 else:
1843                         try:
1844                                 n = long(prefix)
1845                                 if n <= 0:
1846                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1847                                         return
1848                                 elif n > self._max_yahoo_results:
1849                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
1850                                         n = self._max_yahoo_results
1851                                 self._download_n_results(query, n)
1852                                 return
1853                         except ValueError: # parsing prefix as integer fails
1854                                 self._download_n_results(query, 1)
1855                                 return
1856
1857         def _download_n_results(self, query, n):
1858                 """Downloads a specified number of results for a query"""
1859
1860                 video_ids = []
1861                 already_seen = set()
1862                 pagenum = 1
1863
1864                 while True:
1865                         self.report_download_page(query, pagenum)
1866                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1867                         request = urllib2.Request(result_url, None, std_headers)
1868                         try:
1869                                 page = urllib2.urlopen(request).read()
1870                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1871                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1872                                 return
1873
1874                         # Extract video identifiers
1875                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1876                                 video_id = mobj.group(1)
1877                                 if video_id not in already_seen:
1878                                         video_ids.append(video_id)
1879                                         already_seen.add(video_id)
1880                                         if len(video_ids) == n:
1881                                                 # Specified n videos reached
1882                                                 for id in video_ids:
1883                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1884                                                 return
1885
1886                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1887                                 for id in video_ids:
1888                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1889                                 return
1890
1891                         pagenum = pagenum + 1
1892
1893 class YoutubePlaylistIE(InfoExtractor):
1894         """Information Extractor for YouTube playlists."""
1895
1896         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1897         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1898         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1899         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1900         _youtube_ie = None
1901
1902         def __init__(self, youtube_ie, downloader=None):
1903                 InfoExtractor.__init__(self, downloader)
1904                 self._youtube_ie = youtube_ie
1905         
1906         @staticmethod
1907         def suitable(url):
1908                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1909
1910         def report_download_page(self, playlist_id, pagenum):
1911                 """Report attempt to download playlist page with given number."""
1912                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1913
1914         def _real_initialize(self):
1915                 self._youtube_ie.initialize()
1916         
1917         def _real_extract(self, url):
1918                 # Extract playlist id
1919                 mobj = re.match(self._VALID_URL, url)
1920                 if mobj is None:
1921                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1922                         return
1923
1924                 # Download playlist pages
1925                 playlist_id = mobj.group(1)
1926                 video_ids = []
1927                 pagenum = 1
1928
1929                 while True:
1930                         self.report_download_page(playlist_id, pagenum)
1931                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1932                         try:
1933                                 page = urllib2.urlopen(request).read()
1934                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1935                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1936                                 return
1937
1938                         # Extract video identifiers
1939                         ids_in_page = []
1940                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1941                                 if mobj.group(1) not in ids_in_page:
1942                                         ids_in_page.append(mobj.group(1))
1943                         video_ids.extend(ids_in_page)
1944
1945                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1946                                 break
1947                         pagenum = pagenum + 1
1948
1949                 for id in video_ids:
1950                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1951                 return
1952
1953 class YoutubeUserIE(InfoExtractor):
1954         """Information Extractor for YouTube users."""
1955
1956         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
1957         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1958         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
1959         _youtube_ie = None
1960
1961         def __init__(self, youtube_ie, downloader=None):
1962                 InfoExtractor.__init__(self, downloader)
1963                 self._youtube_ie = youtube_ie
1964         
1965         @staticmethod
1966         def suitable(url):
1967                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1968
1969         def report_download_page(self, username):
1970                 """Report attempt to download user page."""
1971                 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
1972
1973         def _real_initialize(self):
1974                 self._youtube_ie.initialize()
1975         
1976         def _real_extract(self, url):
1977                 # Extract username
1978                 mobj = re.match(self._VALID_URL, url)
1979                 if mobj is None:
1980                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1981                         return
1982
1983                 # Download user page
1984                 username = mobj.group(1)
1985                 video_ids = []
1986                 pagenum = 1
1987
1988                 self.report_download_page(username)
1989                 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
1990                 try:
1991                         page = urllib2.urlopen(request).read()
1992                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1993                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1994                         return
1995
1996                 # Extract video identifiers
1997                 ids_in_page = []
1998
1999                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2000                         if mobj.group(1) not in ids_in_page:
2001                                 ids_in_page.append(mobj.group(1))
2002                 video_ids.extend(ids_in_page)
2003
2004                 for id in video_ids:
2005                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2006                 return
2007
class PostProcessor(object):
	"""Base class for postprocessing steps.

	Instances are registered on a downloader through its
	add_post_processor() method. After every successful download the
	downloader calls run() on each registered object in order, feeding
	the dictionary returned by one processor into the next one.
	Processing stops as soon as a processor returns None or the end of
	the chain is reached.

	Like InfoExtractor objects, PostProcessors keep a reference back to
	the downloader that owns them ("mutual registration").
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach this postprocessor to the given downloader."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file.

		"information" is an InfoExtractor-style dictionary carrying an
		extra "filepath" key that names the downloaded file. Returning
		None aborts the remaining chain; returning a (possibly
		modified) dictionary passes it along to the next processor in
		the chain. A PostProcessingError may also be raised and will be
		handled by the downloader this object was called from.
		"""
		return information # the base class is a pass-through no-op
2053         
### MAIN PROGRAM ###
if __name__ == '__main__':
	try:
		# Modules needed only when running the main program
		import getpass
		import optparse

		# Function to update the program file with the latest version from bitbucket.org
		def update_self(downloader, filename):
			# Note: downloader only used for options
			if not os.access (filename, os.W_OK):
				sys.exit('ERROR: no write permissions on %s' % filename)

			downloader.to_stdout('Updating to latest stable version...')
			latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
			latest_version = urllib.urlopen(latest_url).read().strip()
			prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
			newcontent = urllib.urlopen(prog_url).read()
			# NOTE(review): the script is overwritten in place with no
			# integrity check on the downloaded content
			stream = open(filename, 'w')
			stream.write(newcontent)
			stream.close()
			downloader.to_stdout('Updated to version %s' % latest_version)

		# General configuration
		# NOTE(review): the second install_opener() call replaces the first,
		# so the ProxyHandler-based opener appears to be discarded --
		# confirm whether both handlers were meant to share one opener
		urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
		urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
		socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

		# Parse command line
		parser = optparse.OptionParser(
			usage='Usage: %prog [options] url...',
			version='2010.06.06',
			conflict_handler='resolve',
		)

		parser.add_option('-h', '--help',
				action='help', help='print this help text and exit')
		parser.add_option('-v', '--version',
				action='version', help='print program version and exit')
		parser.add_option('-U', '--update',
				action='store_true', dest='update_self', help='update this program to latest stable version')
		parser.add_option('-i', '--ignore-errors',
				action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
		parser.add_option('-r', '--rate-limit',
				dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
		parser.add_option('-R', '--retries',
				dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)

		authentication = optparse.OptionGroup(parser, 'Authentication Options')
		authentication.add_option('-u', '--username',
				dest='username', metavar='USERNAME', help='account username')
		authentication.add_option('-p', '--password',
				dest='password', metavar='PASSWORD', help='account password')
		authentication.add_option('-n', '--netrc',
				action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
		parser.add_option_group(authentication)

		# Note that -b/-m/-d/--all-formats all share dest='format', so the
		# last one given on the command line wins.
		video_format = optparse.OptionGroup(parser, 'Video Format Options')
		video_format.add_option('-f', '--format',
				action='store', dest='format', metavar='FORMAT', help='video format code')
		video_format.add_option('-b', '--best-quality',
				action='store_const', dest='format', help='download the best quality video possible', const='0')
		video_format.add_option('-m', '--mobile-version',
				action='store_const', dest='format', help='alias for -f 17', const='17')
		video_format.add_option('-d', '--high-def',
				action='store_const', dest='format', help='alias for -f 22', const='22')
		video_format.add_option('--all-formats',
				action='store_const', dest='format', help='download all available video formats', const='-1')
		# --max-quality caps the quality chosen by -b/--best-quality; the
		# value is handed to the FileDownloader as 'format_limit' below
		video_format.add_option('--max-quality',
				action='store', dest='format_limit', metavar='FORMAT', help='highest quality format limit for -b')
		parser.add_option_group(video_format)

		verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
		verbosity.add_option('-q', '--quiet',
				action='store_true', dest='quiet', help='activates quiet mode', default=False)
		verbosity.add_option('-s', '--simulate',
				action='store_true', dest='simulate', help='do not download video', default=False)
		verbosity.add_option('-g', '--get-url',
				action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
		verbosity.add_option('-e', '--get-title',
				action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
		verbosity.add_option('--get-thumbnail',
				action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
		verbosity.add_option('--get-description',
				action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
		verbosity.add_option('--no-progress',
				action='store_true', dest='noprogress', help='do not print progress bar', default=False)
		parser.add_option_group(verbosity)

		filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
		filesystem.add_option('-t', '--title',
				action='store_true', dest='usetitle', help='use title in file name', default=False)
		filesystem.add_option('-l', '--literal',
				action='store_true', dest='useliteral', help='use literal title in file name', default=False)
		filesystem.add_option('-o', '--output',
				dest='outtmpl', metavar='TEMPLATE', help='output filename template')
		filesystem.add_option('-a', '--batch-file',
				dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
		filesystem.add_option('-w', '--no-overwrites',
				action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
		filesystem.add_option('-c', '--continue',
				action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
		parser.add_option_group(filesystem)

		(opts, args) = parser.parse_args()

		# Batch file verification: read one URL per line, ignoring blanks
		batchurls = []
		if opts.batchfile is not None:
			try:
				if opts.batchfile == '-':
					batchfd = sys.stdin
				else:
					batchfd = open(opts.batchfile, 'r')
				batchurls = batchfd.readlines()
				batchurls = [x.strip() for x in batchurls]
				batchurls = [x for x in batchurls if len(x) > 0]
			except IOError:
				sys.exit(u'ERROR: batch file could not be read')
		all_urls = batchurls + args

		# Conflicting, missing and erroneous options
		if opts.usenetrc and (opts.username is not None or opts.password is not None):
			parser.error(u'using .netrc conflicts with giving username/password')
		if opts.password is not None and opts.username is None:
			parser.error(u'account username missing')
		if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
			parser.error(u'using output template conflicts with using title or literal title')
		if opts.usetitle and opts.useliteral:
			parser.error(u'using title conflicts with using literal title')
		if opts.username is not None and opts.password is None:
			opts.password = getpass.getpass(u'Type account password and press return:')
		if opts.ratelimit is not None:
			# Normalize "50k"/"44.6m"-style limits to a byte count
			numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
			if numeric_limit is None:
				parser.error(u'invalid rate limit specified')
			opts.ratelimit = numeric_limit
		if opts.retries is not None:
			try:
				opts.retries = long(opts.retries)
			except (TypeError, ValueError), err:
				parser.error(u'invalid retry count specified')

		# Information extractors
		youtube_ie = YoutubeIE()
		metacafe_ie = MetacafeIE(youtube_ie)
		dailymotion_ie = DailymotionIE()
		youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
		youtube_user_ie = YoutubeUserIE(youtube_ie)
		youtube_search_ie = YoutubeSearchIE(youtube_ie)
		google_ie = GoogleIE()
		google_search_ie = GoogleSearchIE(google_ie)
		photobucket_ie = PhotobucketIE()
		yahoo_ie = YahooIE()
		yahoo_search_ie = YahooSearchIE(yahoo_ie)
		generic_ie = GenericIE()

		# File downloader
		fd = FileDownloader({
			'usenetrc': opts.usenetrc,
			'username': opts.username,
			'password': opts.password,
			# any "get-*" option implies quiet + simulate
			'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
			'forceurl': opts.geturl,
			'forcetitle': opts.gettitle,
			'forcethumbnail': opts.getthumbnail,
			'forcedescription': opts.getdescription,
			'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
			'format': opts.format,
			'format_limit': opts.format_limit, # from the new --max-quality flag
			# Pick the first template that applies; the --all-formats (-1)
			# variants embed %(format)s so files do not overwrite each other
			'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
				or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
				or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
				or u'%(id)s.%(ext)s'),
			'ignoreerrors': opts.ignoreerrors,
			'ratelimit': opts.ratelimit,
			'nooverwrites': opts.nooverwrites,
			'retries': opts.retries,
			'continuedl': opts.continue_dl,
			'noprogress': opts.noprogress,
			})
		fd.add_info_extractor(youtube_search_ie)
		fd.add_info_extractor(youtube_pl_ie)
		fd.add_info_extractor(youtube_user_ie)
		fd.add_info_extractor(metacafe_ie)
		fd.add_info_extractor(dailymotion_ie)
		fd.add_info_extractor(youtube_ie)
		fd.add_info_extractor(google_ie)
		fd.add_info_extractor(google_search_ie)
		fd.add_info_extractor(photobucket_ie)
		fd.add_info_extractor(yahoo_ie)
		fd.add_info_extractor(yahoo_search_ie)

		# This must come last since it's the
		# fallback if none of the others work
		fd.add_info_extractor(generic_ie)

		# Update version
		if opts.update_self:
			update_self(fd, sys.argv[0])

		# Maybe do nothing
		if len(all_urls) < 1:
			if not opts.update_self:
				parser.error(u'you must provide at least one URL')
			else:
				sys.exit()
		retcode = fd.download(all_urls)
		sys.exit(retcode)

	except DownloadError:
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')