Add --dump-user-agent option (patch provided by Benjamin Johnson)
[youtube-dl.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # Author: Vasyl' Vavrychuk
7 # License: Public domain code
8 import cookielib
9 import datetime
10 import htmlentitydefs
11 import httplib
12 import locale
13 import math
14 import netrc
15 import os
16 import os.path
17 import re
18 import socket
19 import string
20 import subprocess
21 import sys
22 import time
23 import urllib
24 import urllib2
25
# parse_qs was moved from the cgi module to the urlparse module recently.
try:
	from urlparse import parse_qs
except ImportError:
	# Older Python versions: fall back to the original cgi location.
	from cgi import parse_qs
31
# HTTP headers sent with every request. A real desktop Firefox
# User-Agent is spoofed so video sites serve the same pages they would
# serve to a browser.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.12) Gecko/20101028 Firefox/3.6.12',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	'Accept-Encoding': 'gzip, deflate',
	'Accept-Language': 'en-us,en;q=0.5',
}

# Unicode string of the characters considered safe for "simple" titles
# (ASCII letters and digits only).
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks: if the locale
	reports an encoding that cannot actually encode text (or raises
	while being queried), fall back to UTF-8.
	"""
	# The original implementation wrapped this in a one-shot generator
	# and used a bare "except:"; a plain try/except with a narrowed
	# exception class is equivalent and clearer.
	try:
		pref = locale.getpreferredencoding()
		# Sanity check: some platforms report unusable encodings.
		u'TEST'.encode(pref)
	except Exception:
		pref = 'UTF-8'
	return pref
57
58 def htmlentity_transform(matchobj):
59         """Transforms an HTML entity to a Unicode character.
60         
61         This function receives a match object and is intended to be used with
62         the re.sub() function.
63         """
64         entity = matchobj.group(1)
65
66         # Known non-numeric HTML entity
67         if entity in htmlentitydefs.name2codepoint:
68                 return unichr(htmlentitydefs.name2codepoint[entity])
69
70         # Unicode character
71         mobj = re.match(ur'(?u)#(x?\d+)', entity)
72         if mobj is not None:
73                 numstr = mobj.group(1)
74                 if numstr.startswith(u'x'):
75                         base = 16
76                         numstr = u'0%s' % numstr
77                 else:
78                         base = 10
79                 return unichr(long(numstr, base))
80
81         # Unknown entity in name, return its literal representation
82         return (u'&%s;' % entity)
83
84 def sanitize_title(utitle):
85         """Sanitizes a video title so it could be used as part of a filename."""
86         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
87         return utitle.replace(unicode(os.sep), u'%')
88
89 def sanitize_open(filename, open_mode):
90         """Try to open the given filename, and slightly tweak it if this fails.
91
92         Attempts to open the given filename. If this fails, it tries to change
93         the filename slightly, step by step, until it's either able to open it
94         or it fails and raises a final exception, like the standard open()
95         function.
96
97         It returns the tuple (stream, definitive_file_name).
98         """
99         try:
100                 if filename == u'-':
101                         if sys.platform == 'win32':
102                                 import msvcrt
103                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
104                         return (sys.stdout, filename)
105                 stream = open(filename, open_mode)
106                 return (stream, filename)
107         except (IOError, OSError), err:
108                 # In case of error, try to remove win32 forbidden chars
109                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
110
111                 # An exception here should be caught in the caller
112                 stream = open(filename, open_mode)
113                 return (stream, filename)
114
class DownloadError(Exception):
	"""Raised when downloading fails and errors are not being ignored.

	FileDownloader objects throw this exception, carrying the relevant
	error message, unless they were configured to continue on errors.
	"""
	pass
123
class SameFileError(Exception):
	"""Raised when several downloads would collide on one output file.

	FileDownloader objects throw this exception when they detect that
	multiple videos would end up written to the same file on disk.
	"""
	pass
131
class PostProcessingError(Exception):
	"""Raised by a PostProcessor's .run() method.

	Signals that the postprocessing task failed for the file that was
	being processed.
	"""
	pass
139
class UnavailableVideoError(Exception):
	"""Raised when a video is requested in a format it does not offer.

	Thrown during download when the selected format is not available
	for the given video.
	"""
	pass
147
class ContentTooShortError(Exception):
	"""Raised when a download delivers fewer bytes than announced.

	FileDownloader objects raise this when the data actually received
	is smaller than the size the server reported beforehand, which
	usually means the connection was interrupted.
	"""
	# Byte counts describing the mismatch (both in bytes).
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		self.expected = expected
		self.downloaded = downloaded
162
class FileDownloader(object):
	"""File Downloader class.

	File downloader objects are the ones responsible of downloading the
	actual video file and writing it to disk if the user has requested
	it, among some other tasks. In most cases there should be one per
	program. As, given a video URL, the downloader doesn't know how to
	extract all the needed information, task that InfoExtractors do, it
	has to pass the URL to one of them.

	For this, file downloader objects have a method that allows
	InfoExtractors to be registered in a given order. When it is passed
	a URL, the file downloader handles it to the first InfoExtractor it
	finds that reports being able to handle it. The InfoExtractor extracts
	all the information about the video or videos the URL refers to, and
	asks the FileDownloader to process the video information, possibly
	downloading the video.

	File downloaders accept a lot of parameters. In order not to saturate
	the object constructor with arguments, it receives a dictionary of
	options instead. These options are available through the params
	attribute for the InfoExtractors to use. The FileDownloader also
	registers itself as the downloader in charge for the InfoExtractors
	that are added to it, so this is a "mutual registration".

	Available options:

	username:         Username for authentication purposes.
	password:         Password for authentication purposes.
	usenetrc:         Use netrc for authentication instead.
	quiet:            Do not print messages to stdout.
	forceurl:         Force printing final URL.
	forcetitle:       Force printing title.
	forcethumbnail:   Force printing thumbnail URL.
	forcedescription: Force printing description.
	simulate:         Do not download the video files.
	format:           Video format code.
	format_limit:     Highest quality format to try.
	outtmpl:          Template for output names.
	ignoreerrors:     Do not stop on download errors.
	ratelimit:        Download speed limit, in bytes/sec.
	nooverwrites:     Prevent overwriting files.
	retries:          Number of times to retry for HTTP error 5xx
	continuedl:       Try to continue downloads if possible.
	noprogress:       Do not print the progress bar.
	playliststart:    Playlist item to start at.
	playlistend:      Playlist item to end at.
	logtostderr:      Log messages to stderr instead of stdout.
	"""

	# Class-level defaults; every instance overwrites these in __init__.
	params = None
	_ies = []
	_pps = []
	_download_retcode = None
	_num_downloads = None
	_screen_file = None

	def __init__(self, params):
		"""Create a FileDownloader object with the given options."""
		self._ies = []
		self._pps = []
		self._download_retcode = 0
		self._num_downloads = 0
		# False/True index 0/1: screen messages go to stdout unless
		# the 'logtostderr' option was requested.
		self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
		self.params = params

	@staticmethod
	def pmkdir(filename):
		"""Create directory components in filename. Similar to Unix "mkdir -p"."""
		components = filename.split(os.sep)
		# Build every ancestor path of the file (excluding the last
		# component, the filename itself) and create the missing ones.
		aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
		aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
		for dir in aggregate:
			if not os.path.exists(dir):
				os.mkdir(dir)

	@staticmethod
	def temp_name(filename):
		"""Returns a temporary filename for the given filename."""
		# stdout ("-") and special files (devices, FIFOs...) cannot be
		# renamed later, so they are written to directly.
		if filename == u'-' or (os.path.exists(filename) and not os.path.isfile(filename)):
			return filename
		return filename + u'.part'

	@staticmethod
	def format_bytes(bytes):
		# Render a byte count with a binary-magnitude suffix, e.g. '1.00k'.
		if bytes is None:
			return 'N/A'
		if type(bytes) is str:
			bytes = float(bytes)
		if bytes == 0.0:
			exponent = 0
		else:
			exponent = long(math.log(bytes, 1024.0))
		# NOTE(review): a value beyond the yotta range would raise
		# IndexError here; not a concern for realistic file sizes.
		suffix = 'bkMGTPEZY'[exponent]
		converted = float(bytes) / float(1024**exponent)
		return '%.2f%s' % (converted, suffix)

	@staticmethod
	def calc_percent(byte_counter, data_len):
		# Progress percentage, right-aligned; '---.-%' when the total
		# length is unknown.
		if data_len is None:
			return '---.-%'
		return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

	@staticmethod
	def calc_eta(start, now, total, current):
		# Estimated time remaining as MM:SS; '--:--' when it cannot be
		# computed or would exceed 99 minutes.
		if total is None:
			return '--:--'
		dif = now - start
		if current == 0 or dif < 0.001: # One millisecond
			return '--:--'
		rate = float(current) / dif
		eta = long((float(total) - float(current)) / rate)
		(eta_mins, eta_secs) = divmod(eta, 60)
		if eta_mins > 99:
			return '--:--'
		return '%02d:%02d' % (eta_mins, eta_secs)

	@staticmethod
	def calc_speed(start, now, bytes):
		# Average transfer speed since 'start', formatted for display.
		dif = now - start
		if bytes == 0 or dif < 0.001: # One millisecond
			return '%10s' % '---b/s'
		return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

	@staticmethod
	def best_block_size(elapsed_time, bytes):
		# Adapt the next read size to the observed rate, clamped between
		# half and double the previous block and capped at 4 MB.
		new_min = max(bytes / 2.0, 1.0)
		new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
		if elapsed_time < 0.001:
			return long(new_max)
		rate = bytes / elapsed_time
		if rate > new_max:
			return long(new_max)
		if rate < new_min:
			return long(new_min)
		return long(rate)

	@staticmethod
	def parse_bytes(bytestr):
		"""Parse a string indicating a byte quantity into a long integer."""
		matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
		if matchobj is None:
			return None
		number = float(matchobj.group(1))
		# An absent suffix yields '' and str.index('') == 0, i.e. a
		# multiplier of 1024**0 == 1 (plain bytes).
		multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
		return long(round(number * multiplier))

	def add_info_extractor(self, ie):
		"""Add an InfoExtractor object to the end of the list."""
		self._ies.append(ie)
		# Mutual registration: the extractor learns its downloader.
		ie.set_downloader(self)

	def add_post_processor(self, pp):
		"""Add a PostProcessor object to the end of the chain."""
		self._pps.append(pp)
		pp.set_downloader(self)

	def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode."""
		try:
			if not self.params.get('quiet', False):
				terminator = [u'\n', u''][skip_eol]
				print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
			# Flush unconditionally so progress lines appear promptly.
			self._screen_file.flush()
		except (UnicodeEncodeError), err:
			if not ignore_encoding_errors:
				raise

	def to_stderr(self, message):
		"""Print message to stderr."""
		print >>sys.stderr, message.encode(preferredencoding())

	def fixed_template(self):
		"""Checks if the output template is fixed."""
		# "Fixed" means it contains no '%(field)s' placeholders at all,
		# so every download would be written to the same file.
		return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

	def trouble(self, message=None):
		"""Determine action to take when a download problem appears.

		Depending on if the downloader has been configured to ignore
		download errors or not, this method may throw an exception or
		not when errors are found, after printing the message.
		"""
		if message is not None:
			self.to_stderr(message)
		if not self.params.get('ignoreerrors', False):
			raise DownloadError(message)
		# Only reached when errors are ignored: remember the failure
		# for the final process exit code.
		self._download_retcode = 1

	def slow_down(self, start_time, byte_counter):
		"""Sleep if the download speed is over the rate limit."""
		rate_limit = self.params.get('ratelimit', None)
		if rate_limit is None or byte_counter == 0:
			return
		now = time.time()
		elapsed = now - start_time
		if elapsed <= 0.0:
			return
		speed = float(byte_counter) / elapsed
		if speed > rate_limit:
			# Sleep just long enough for the average speed to drop
			# back down to the configured limit.
			time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

	def try_rename(self, old_filename, new_filename):
		# Move the temporary '.part' file to its final name; failures
		# are routed through trouble() (which may raise DownloadError).
		try:
			if old_filename == new_filename:
				return
			os.rename(old_filename, new_filename)
		except (IOError, OSError), err:
			self.trouble(u'ERROR: unable to rename file')

	def report_destination(self, filename):
		"""Report destination filename."""
		self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)

	def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
		"""Report download progress."""
		if self.params.get('noprogress', False):
			return
		# The leading '\r' rewrites the same terminal line each update.
		self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
				(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

	def report_resuming_byte(self, resume_len):
		"""Report attempt to resume at given byte."""
		self.to_screen(u'[download] Resuming download at byte %s' % resume_len)

	def report_retry(self, count, retries):
		"""Report retry in case of HTTP error 5xx"""
		self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))

	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		try:
			self.to_screen(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			# Fall back to a generic message when the filename cannot
			# be encoded for the terminal.
			self.to_screen(u'[download] The file has already been downloaded')

	def report_unable_to_resume(self):
		"""Report it was impossible to resume download."""
		self.to_screen(u'[download] Unable to resume')

	def report_finish(self):
		"""Report download finished."""
		if self.params.get('noprogress', False):
			self.to_screen(u'[download] Download completed')
		else:
			# Terminate the progress line report_progress kept rewriting.
			self.to_screen(u'')

	def increment_downloads(self):
		"""Increment the ordinal that assigns a number to each file."""
		self._num_downloads += 1

	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor."""
		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			# Forced printings
			if self.params.get('forcetitle', False):
				print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forceurl', False):
				print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
				print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcedescription', False) and 'description' in info_dict:
				print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')

			return

		# Expand the output template with the video info plus two
		# synthesized fields: 'epoch' and a zero-padded 'autonumber'.
		try:
			template_dict = dict(info_dict)
			template_dict['epoch'] = unicode(long(time.time()))
			template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
			filename = self.params['outtmpl'] % template_dict
		except (ValueError, KeyError), err:
			self.trouble(u'ERROR: invalid system charset or erroneous output template')
			return
		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists and will be skipped')
			return

		try:
			self.pmkdir(filename)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to create directories: %s' % str(err))
			return

		try:
			success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
		except (OSError, IOError), err:
			raise UnavailableVideoError
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self.trouble(u'ERROR: unable to download video data: %s' % str(err))
			return
		except (ContentTooShortError, ), err:
			self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
			return

		if success:
			try:
				self.post_process(filename, info_dict)
			except (PostProcessingError), err:
				self.trouble(u'ERROR: postprocessing: %s' % str(err))
				return

	def download(self, url_list):
		"""Download a given list of URLs."""
		# A fixed (placeholder-free) template cannot hold more than one
		# downloaded file.
		if len(url_list) > 1 and self.fixed_template():
			raise SameFileError(self.params['outtmpl'])

		for url in url_list:
			suitable_found = False
			for ie in self._ies:
				# Go to next InfoExtractor if not suitable
				if not ie.suitable(url):
					continue

				# Suitable InfoExtractor found
				suitable_found = True

				# Extract information from URL and process it
				ie.extract(url)

				# Suitable InfoExtractor had been found; go to next URL
				break

			if not suitable_found:
				self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

		return self._download_retcode

	def post_process(self, filename, ie_info):
		"""Run the postprocessing chain on the given file."""
		info = dict(ie_info)
		info['filepath'] = filename
		for pp in self._pps:
			info = pp.run(info)
			# A postprocessor may stop the chain by returning None.
			if info is None:
				break

	def _download_with_rtmpdump(self, filename, url, player_url):
		# Download an RTMP stream by shelling out to the external
		# "rtmpdump" program; returns True on success, False otherwise.
		self.report_destination(filename)
		tmpfilename = self.temp_name(filename)

		# Check for rtmpdump first
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrumpted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(tmpfilename)
			self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(tmpfilename)
			# Give up once a retry makes no progress at all.
			if prevsize == cursize and retval == 1:
				break
		if retval == 0:
			self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
			self.try_rename(tmpfilename, filename)
			return True
		else:
			self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
			return False

	def _do_download(self, filename, url, player_url):
		# Check file already present
		if self.params.get('continuedl', False) and os.path.isfile(filename):
			self.report_file_already_downloaded(filename)
			return True

		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url, player_url)

		tmpfilename = self.temp_name(filename)
		stream = None
		open_mode = 'wb'
		# basic_request stays Range-free so it can be reused when the
		# server rejects the resume (Range) request below.
		basic_request = urllib2.Request(url, None, std_headers)
		request = urllib2.Request(url, None, std_headers)

		# Establish possible resume length
		if os.path.isfile(tmpfilename):
			resume_len = os.path.getsize(tmpfilename)
		else:
			resume_len = 0

		# Request parameters in case of being able to resume
		if self.params.get('continuedl', False) and resume_len != 0:
			self.report_resuming_byte(resume_len)
			request.add_header('Range','bytes=%d-' % resume_len)
			open_mode = 'ab'

		count = 0
		retries = self.params.get('retries', 0)
		while count <= retries:
			# Establish connection
			try:
				data = urllib2.urlopen(request)
				break
			except (urllib2.HTTPError, ), err:
				if (err.code < 500 or err.code >= 600) and err.code != 416:
					# Unexpected HTTP error
					raise
				elif err.code == 416:
					# Unable to resume (requested range not satisfiable)
					try:
						# Open the connection again without the range header
						data = urllib2.urlopen(basic_request)
						content_length = data.info()['Content-Length']
					except (urllib2.HTTPError, ), err:
						if err.code < 500 or err.code >= 600:
							raise
					else:
						# Examine the reported length
						if (content_length is not None and
						    (resume_len - 100 < long(content_length) < resume_len + 100)):
							# The file had already been fully downloaded.
							# Explanation to the above condition: in issue #175 it was revealed that
							# YouTube sometimes adds or removes a few bytes from the end of the file,
							# changing the file size slightly and causing problems for some users. So
							# I decided to implement a suggested change and consider the file
							# completely downloaded if the file size differs less than 100 bytes from
							# the one in the hard drive.
							self.report_file_already_downloaded(filename)
							self.try_rename(tmpfilename, filename)
							return True
						else:
							# The length does not match, we start the download over
							self.report_unable_to_resume()
							open_mode = 'wb'
							break
			# Retry
			count += 1
			if count <= retries:
				self.report_retry(count, retries)

		if count > retries:
			self.trouble(u'ERROR: giving up after %s retries' % retries)
			return False

		data_len = data.info().get('Content-length', None)
		if data_len is not None:
			data_len = long(data_len) + resume_len
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0 + resume_len
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			if len(data_block) == 0:
				break
			byte_counter += len(data_block)

			# Open file just in time
			if stream is None:
				try:
					(stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
					self.report_destination(filename)
				except (OSError, IOError), err:
					self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
					return False
			try:
				stream.write(data_block)
			except (IOError, OSError), err:
				self.trouble(u'\nERROR: unable to write data: %s' % str(err))
				return False
			block_size = self.best_block_size(after - before, len(data_block))

			# Progress message
			percent_str = self.calc_percent(byte_counter, data_len)
			# NOTE(review): if the server sent no Content-length,
			# data_len is None and this subtraction raises TypeError
			# before calc_eta's None guard can run -- confirm upstream.
			eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
			speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
			self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter - resume_len)

		# NOTE(review): a zero-byte response leaves stream as None here,
		# so close() would raise AttributeError -- confirm upstream.
		stream.close()
		self.report_finish()
		if data_len is not None and byte_counter != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		self.try_rename(tmpfilename, filename)
		return True
655
class InfoExtractor(object):
        """Information Extractor class.

        An information extractor takes a URL and produces, for each video the
        URL refers to, a dictionary of metadata that is handed to the
        FileDownloader for processing (which may or may not involve actually
        downloading the video). Every dictionary must carry these fields:

        id:             Video identifier.
        url:            Final video URL.
        uploader:       Nickname of the video uploader.
        title:          Literal title.
        stitle:         Simplified title.
        ext:            Video filename extension.
        format:         Video format.
        player_url:     SWF Player URL (may be None).

        Optional fields, used only by the forced-printing helpers (e.g. when
        youtube-dl acts as a backend for a video search front end such as
        youtube2mp3):

        thumbnail:      Full URL to a video thumbnail image.
        description:    One-line video description.

        Concrete extractors override _real_initialize(), _real_extract() and
        the suitable() static method, and are registered with the main
        downloader.
        """

        # Class-level defaults; instances shadow these in __init__.
        _ready = False
        _downloader = None

        def __init__(self, downloader=None):
                """Constructor. Receives an optional downloader."""
                self._ready = False
                self.set_downloader(downloader)

        @staticmethod
        def suitable(url):
                """Receives a URL and returns True if suitable for this IE."""
                return False

        def initialize(self):
                """Initializes an instance (authentication, etc)."""
                # Idempotent: the real work runs at most once per instance.
                if self._ready:
                        return
                self._real_initialize()
                self._ready = True

        def extract(self, url):
                """Extracts URL information and returns it in list of dicts."""
                self.initialize()
                return self._real_extract(url)

        def set_downloader(self, downloader):
                """Sets the downloader for this IE."""
                self._downloader = downloader

        def _real_initialize(self):
                """Real initialization process. Redefine in subclasses."""
                pass

        def _real_extract(self, url):
                """Real extraction process. Redefine in subclasses."""
                pass
726
727 class YoutubeIE(InfoExtractor):
728         """Information extractor for youtube.com."""
729
730         _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))))?([0-9A-Za-z_-]+)(?(1).+)?$'
731         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
732         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
733         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
734         _NETRC_MACHINE = 'youtube'
735         # Listed in order of quality
736         _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
737         _video_extensions = {
738                 '13': '3gp',
739                 '17': 'mp4',
740                 '18': 'mp4',
741                 '22': 'mp4',
742                 '37': 'mp4',
743                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
744                 '43': 'webm',
745                 '45': 'webm',
746         }
747
748         @staticmethod
749         def suitable(url):
750                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
751
752         def report_lang(self):
753                 """Report attempt to set language."""
754                 self._downloader.to_screen(u'[youtube] Setting language')
755
756         def report_login(self):
757                 """Report attempt to log in."""
758                 self._downloader.to_screen(u'[youtube] Logging in')
759         
760         def report_age_confirmation(self):
761                 """Report attempt to confirm age."""
762                 self._downloader.to_screen(u'[youtube] Confirming age')
763         
764         def report_video_webpage_download(self, video_id):
765                 """Report attempt to download video webpage."""
766                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
767         
768         def report_video_info_webpage_download(self, video_id):
769                 """Report attempt to download video info webpage."""
770                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
771         
772         def report_information_extraction(self, video_id):
773                 """Report attempt to extract video information."""
774                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
775         
776         def report_unavailable_format(self, video_id, format):
777                 """Report extracted video URL."""
778                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
779         
780         def report_rtmp_download(self):
781                 """Indicate the download will use the RTMP protocol."""
782                 self._downloader.to_screen(u'[youtube] RTMP download detected')
783         
784         def _real_initialize(self):
785                 if self._downloader is None:
786                         return
787
788                 username = None
789                 password = None
790                 downloader_params = self._downloader.params
791
792                 # Attempt to use provided username and password or .netrc data
793                 if downloader_params.get('username', None) is not None:
794                         username = downloader_params['username']
795                         password = downloader_params['password']
796                 elif downloader_params.get('usenetrc', False):
797                         try:
798                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
799                                 if info is not None:
800                                         username = info[0]
801                                         password = info[2]
802                                 else:
803                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
804                         except (IOError, netrc.NetrcParseError), err:
805                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
806                                 return
807
808                 # Set language
809                 request = urllib2.Request(self._LANG_URL, None, std_headers)
810                 try:
811                         self.report_lang()
812                         urllib2.urlopen(request).read()
813                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
814                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
815                         return
816
817                 # No authentication to be performed
818                 if username is None:
819                         return
820
821                 # Log in
822                 login_form = {
823                                 'current_form': 'loginForm',
824                                 'next':         '/',
825                                 'action_login': 'Log In',
826                                 'username':     username,
827                                 'password':     password,
828                                 }
829                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
830                 try:
831                         self.report_login()
832                         login_results = urllib2.urlopen(request).read()
833                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
834                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
835                                 return
836                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
837                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
838                         return
839         
840                 # Confirm age
841                 age_form = {
842                                 'next_url':             '/',
843                                 'action_confirm':       'Confirm',
844                                 }
845                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
846                 try:
847                         self.report_age_confirmation()
848                         age_results = urllib2.urlopen(request).read()
849                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
850                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
851                         return
852
853         def _real_extract(self, url):
854                 # Extract video id from URL
855                 mobj = re.match(self._VALID_URL, url)
856                 if mobj is None:
857                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
858                         return
859                 video_id = mobj.group(2)
860
861                 # Get video webpage
862                 self.report_video_webpage_download(video_id)
863                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id, None, std_headers)
864                 try:
865                         video_webpage = urllib2.urlopen(request).read()
866                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
867                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
868                         return
869
870                 # Attempt to extract SWF player URL
871                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
872                 if mobj is not None:
873                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
874                 else:
875                         player_url = None
876
877                 # Get video info
878                 self.report_video_info_webpage_download(video_id)
879                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
880                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
881                                            % (video_id, el_type))
882                         request = urllib2.Request(video_info_url, None, std_headers)
883                         try:
884                                 video_info_webpage = urllib2.urlopen(request).read()
885                                 video_info = parse_qs(video_info_webpage)
886                                 if 'token' in video_info:
887                                         break
888                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
889                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
890                                 return
891                 if 'token' not in video_info:
892                         if 'reason' in video_info:
893                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
894                         else:
895                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
896                         return
897
898                 # Start extracting information
899                 self.report_information_extraction(video_id)
900
901                 # uploader
902                 if 'author' not in video_info:
903                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
904                         return
905                 video_uploader = urllib.unquote_plus(video_info['author'][0])
906
907                 # title
908                 if 'title' not in video_info:
909                         self._downloader.trouble(u'ERROR: unable to extract video title')
910                         return
911                 video_title = urllib.unquote_plus(video_info['title'][0])
912                 video_title = video_title.decode('utf-8')
913                 video_title = sanitize_title(video_title)
914
915                 # simplified title
916                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
917                 simple_title = simple_title.strip(ur'_')
918
919                 # thumbnail image
920                 if 'thumbnail_url' not in video_info:
921                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
922                         video_thumbnail = ''
923                 else:   # don't panic if we can't find it
924                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
925
926                 # upload date
927                 upload_date = u'NA'
928                 mobj = re.search(r'id="eow-date".*?>(.*?)</span>', video_webpage, re.DOTALL)
929                 if mobj is not None:
930                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
931                         format_expressions = ['%d %B %Y', '%B %d %Y']
932                         for expression in format_expressions:
933                                 try:
934                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
935                                 except:
936                                         pass
937
938                 # description
939                 video_description = 'No description available.'
940                 if self._downloader.params.get('forcedescription', False):
941                         mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
942                         if mobj is not None:
943                                 video_description = mobj.group(1)
944
945                 # token
946                 video_token = urllib.unquote_plus(video_info['token'][0])
947
948                 # Decide which formats to download
949                 req_format = self._downloader.params.get('format', None)
950
951                 if 'fmt_url_map' in video_info:
952                         url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
953                         format_limit = self._downloader.params.get('format_limit', None)
954                         if format_limit is not None and format_limit in self._available_formats:
955                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
956                         else:
957                                 format_list = self._available_formats
958                         existing_formats = [x for x in format_list if x in url_map]
959                         if len(existing_formats) == 0:
960                                 self._downloader.trouble(u'ERROR: no known formats available for video')
961                                 return
962                         if req_format is None:
963                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
964                         elif req_format == '-1':
965                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
966                         else:
967                                 # Specific format
968                                 if req_format not in url_map:
969                                         self._downloader.trouble(u'ERROR: requested format not available')
970                                         return
971                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
972
973                 elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
974                         self.report_rtmp_download()
975                         video_url_list = [(None, video_info['conn'][0])]
976
977                 else:
978                         self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
979                         return
980
981                 for format_param, video_real_url in video_url_list:
982                         # At this point we have a new video
983                         self._downloader.increment_downloads()
984
985                         # Extension
986                         video_extension = self._video_extensions.get(format_param, 'flv')
987
988                         # Find the video URL in fmt_url_map or conn paramters
989                         try:
990                                 # Process video information
991                                 self._downloader.process_info({
992                                         'id':           video_id.decode('utf-8'),
993                                         'url':          video_real_url.decode('utf-8'),
994                                         'uploader':     video_uploader.decode('utf-8'),
995                                         'upload_date':  upload_date,
996                                         'title':        video_title,
997                                         'stitle':       simple_title,
998                                         'ext':          video_extension.decode('utf-8'),
999                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1000                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1001                                         'description':  video_description.decode('utf-8'),
1002                                         'player_url':   player_url,
1003                                 })
1004                         except UnavailableVideoError, err:
1005                                 self._downloader.trouble(u'\nERROR: unable to download video')
1006
1007
1008 class MetacafeIE(InfoExtractor):
1009         """Information Extractor for metacafe.com."""
1010
1011         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1012         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1013         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1014         _youtube_ie = None
1015
1016         def __init__(self, youtube_ie, downloader=None):
1017                 InfoExtractor.__init__(self, downloader)
1018                 self._youtube_ie = youtube_ie
1019
1020         @staticmethod
1021         def suitable(url):
1022                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1023
1024         def report_disclaimer(self):
1025                 """Report disclaimer retrieval."""
1026                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1027
1028         def report_age_confirmation(self):
1029                 """Report attempt to confirm age."""
1030                 self._downloader.to_screen(u'[metacafe] Confirming age')
1031         
1032         def report_download_webpage(self, video_id):
1033                 """Report webpage download."""
1034                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1035         
1036         def report_extraction(self, video_id):
1037                 """Report information extraction."""
1038                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1039
1040         def _real_initialize(self):
1041                 # Retrieve disclaimer
1042                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
1043                 try:
1044                         self.report_disclaimer()
1045                         disclaimer = urllib2.urlopen(request).read()
1046                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1047                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1048                         return
1049
1050                 # Confirm age
1051                 disclaimer_form = {
1052                         'filters': '0',
1053                         'submit': "Continue - I'm over 18",
1054                         }
1055                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
1056                 try:
1057                         self.report_age_confirmation()
1058                         disclaimer = urllib2.urlopen(request).read()
1059                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1060                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1061                         return
1062         
1063         def _real_extract(self, url):
1064                 # Extract id and simplified title from URL
1065                 mobj = re.match(self._VALID_URL, url)
1066                 if mobj is None:
1067                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1068                         return
1069
1070                 video_id = mobj.group(1)
1071
1072                 # Check if video comes from YouTube
1073                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1074                 if mobj2 is not None:
1075                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1076                         return
1077
1078                 # At this point we have a new video
1079                 self._downloader.increment_downloads()
1080
1081                 simple_title = mobj.group(2).decode('utf-8')
1082
1083                 # Retrieve video webpage to extract further information
1084                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1085                 try:
1086                         self.report_download_webpage(video_id)
1087                         webpage = urllib2.urlopen(request).read()
1088                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1089                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1090                         return
1091
1092                 # Extract URL, uploader and title from webpage
1093                 self.report_extraction(video_id)
1094                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1095                 if mobj is not None:
1096                         mediaURL = urllib.unquote(mobj.group(1))
1097                         video_extension = mediaURL[-3:]
1098                         
1099                         # Extract gdaKey if available
1100                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1101                         if mobj is None:
1102                                 video_url = mediaURL
1103                         else:
1104                                 gdaKey = mobj.group(1)
1105                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1106                 else:
1107                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1108                         if mobj is None:
1109                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1110                                 return
1111                         vardict = parse_qs(mobj.group(1))
1112                         if 'mediaData' not in vardict:
1113                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1114                                 return
1115                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1116                         if mobj is None:
1117                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1118                                 return
1119                         mediaURL = mobj.group(1).replace('\\/', '/')
1120                         video_extension = mediaURL[-3:]
1121                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1122
1123                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1124                 if mobj is None:
1125                         self._downloader.trouble(u'ERROR: unable to extract title')
1126                         return
1127                 video_title = mobj.group(1).decode('utf-8')
1128                 video_title = sanitize_title(video_title)
1129
1130                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1131                 if mobj is None:
1132                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1133                         return
1134                 video_uploader = mobj.group(1)
1135
1136                 try:
1137                         # Process video information
1138                         self._downloader.process_info({
1139                                 'id':           video_id.decode('utf-8'),
1140                                 'url':          video_url.decode('utf-8'),
1141                                 'uploader':     video_uploader.decode('utf-8'),
1142                                 'upload_date':  u'NA',
1143                                 'title':        video_title,
1144                                 'stitle':       simple_title,
1145                                 'ext':          video_extension.decode('utf-8'),
1146                                 'format':       u'NA',
1147                                 'player_url':   None,
1148                         })
1149                 except UnavailableVideoError:
1150                         self._downloader.trouble(u'\nERROR: unable to download video')
1151
1152
1153 class DailymotionIE(InfoExtractor):
1154         """Information Extractor for Dailymotion"""
1155
1156         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1157
1158         def __init__(self, downloader=None):
1159                 InfoExtractor.__init__(self, downloader)
1160
1161         @staticmethod
1162         def suitable(url):
1163                 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1164
1165         def report_download_webpage(self, video_id):
1166                 """Report webpage download."""
1167                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1168         
1169         def report_extraction(self, video_id):
1170                 """Report information extraction."""
1171                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1172
1173         def _real_initialize(self):
1174                 return
1175
1176         def _real_extract(self, url):
1177                 # Extract id and simplified title from URL
1178                 mobj = re.match(self._VALID_URL, url)
1179                 if mobj is None:
1180                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1181                         return
1182
1183                 # At this point we have a new video
1184                 self._downloader.increment_downloads()
1185                 video_id = mobj.group(1)
1186
1187                 simple_title = mobj.group(2).decode('utf-8')
1188                 video_extension = 'flv'
1189
1190                 # Retrieve video webpage to extract further information
1191                 request = urllib2.Request(url)
1192                 try:
1193                         self.report_download_webpage(video_id)
1194                         webpage = urllib2.urlopen(request).read()
1195                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1196                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1197                         return
1198
1199                 # Extract URL, uploader and title from webpage
1200                 self.report_extraction(video_id)
1201                 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1202                 if mobj is None:
1203                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1204                         return
1205                 mediaURL = urllib.unquote(mobj.group(1))
1206
1207                 # if needed add http://www.dailymotion.com/ if relative URL
1208
1209                 video_url = mediaURL
1210
1211                 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1212                 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1213                 if mobj is None:
1214                         self._downloader.trouble(u'ERROR: unable to extract title')
1215                         return
1216                 video_title = mobj.group(1).decode('utf-8')
1217                 video_title = sanitize_title(video_title)
1218
1219                 mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a>', webpage)
1220                 if mobj is None:
1221                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1222                         return
1223                 video_uploader = mobj.group(1)
1224
1225                 try:
1226                         # Process video information
1227                         self._downloader.process_info({
1228                                 'id':           video_id.decode('utf-8'),
1229                                 'url':          video_url.decode('utf-8'),
1230                                 'uploader':     video_uploader.decode('utf-8'),
1231                                 'upload_date':  u'NA',
1232                                 'title':        video_title,
1233                                 'stitle':       simple_title,
1234                                 'ext':          video_extension.decode('utf-8'),
1235                                 'format':       u'NA',
1236                                 'player_url':   None,
1237                         })
1238                 except UnavailableVideoError:
1239                         self._downloader.trouble(u'\nERROR: unable to download video')
1240
1241 class GoogleIE(InfoExtractor):
1242         """Information extractor for video.google.com."""
1243
1244         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1245
1246         def __init__(self, downloader=None):
1247                 InfoExtractor.__init__(self, downloader)
1248
1249         @staticmethod
1250         def suitable(url):
1251                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1252
1253         def report_download_webpage(self, video_id):
1254                 """Report webpage download."""
1255                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1256
1257         def report_extraction(self, video_id):
1258                 """Report information extraction."""
1259                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1260
1261         def _real_initialize(self):
1262                 return
1263
1264         def _real_extract(self, url):
1265                 # Extract id from URL
1266                 mobj = re.match(self._VALID_URL, url)
1267                 if mobj is None:
1268                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1269                         return
1270
1271                 # At this point we have a new video
1272                 self._downloader.increment_downloads()
1273                 video_id = mobj.group(1)
1274
1275                 video_extension = 'mp4'
1276
1277                 # Retrieve video webpage to extract further information
1278                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1279                 try:
1280                         self.report_download_webpage(video_id)
1281                         webpage = urllib2.urlopen(request).read()
1282                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1283                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1284                         return
1285
1286                 # Extract URL, uploader, and title from webpage
1287                 self.report_extraction(video_id)
1288                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1289                 if mobj is None:
1290                         video_extension = 'flv'
1291                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1292                 if mobj is None:
1293                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1294                         return
1295                 mediaURL = urllib.unquote(mobj.group(1))
1296                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1297                 mediaURL = mediaURL.replace('\\x26', '\x26')
1298
1299                 video_url = mediaURL
1300
1301                 mobj = re.search(r'<title>(.*)</title>', webpage)
1302                 if mobj is None:
1303                         self._downloader.trouble(u'ERROR: unable to extract title')
1304                         return
1305                 video_title = mobj.group(1).decode('utf-8')
1306                 video_title = sanitize_title(video_title)
1307                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1308
1309                 # Extract video description
1310                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1311                 if mobj is None:
1312                         self._downloader.trouble(u'ERROR: unable to extract video description')
1313                         return
1314                 video_description = mobj.group(1).decode('utf-8')
1315                 if not video_description:
1316                         video_description = 'No description available.'
1317
1318                 # Extract video thumbnail
1319                 if self._downloader.params.get('forcethumbnail', False):
1320                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1321                         try:
1322                                 webpage = urllib2.urlopen(request).read()
1323                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1324                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1325                                 return
1326                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1327                         if mobj is None:
1328                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1329                                 return
1330                         video_thumbnail = mobj.group(1)
1331                 else:   # we need something to pass to process_info
1332                         video_thumbnail = ''
1333
1334
1335                 try:
1336                         # Process video information
1337                         self._downloader.process_info({
1338                                 'id':           video_id.decode('utf-8'),
1339                                 'url':          video_url.decode('utf-8'),
1340                                 'uploader':     u'NA',
1341                                 'upload_date':  u'NA',
1342                                 'title':        video_title,
1343                                 'stitle':       simple_title,
1344                                 'ext':          video_extension.decode('utf-8'),
1345                                 'format':       u'NA',
1346                                 'player_url':   None,
1347                         })
1348                 except UnavailableVideoError:
1349                         self._downloader.trouble(u'\nERROR: unable to download video')
1350
1351
1352 class PhotobucketIE(InfoExtractor):
1353         """Information extractor for photobucket.com."""
1354
1355         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1356
1357         def __init__(self, downloader=None):
1358                 InfoExtractor.__init__(self, downloader)
1359
1360         @staticmethod
1361         def suitable(url):
1362                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1363
1364         def report_download_webpage(self, video_id):
1365                 """Report webpage download."""
1366                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1367
1368         def report_extraction(self, video_id):
1369                 """Report information extraction."""
1370                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1371
1372         def _real_initialize(self):
1373                 return
1374
1375         def _real_extract(self, url):
1376                 # Extract id from URL
1377                 mobj = re.match(self._VALID_URL, url)
1378                 if mobj is None:
1379                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1380                         return
1381
1382                 # At this point we have a new video
1383                 self._downloader.increment_downloads()
1384                 video_id = mobj.group(1)
1385
1386                 video_extension = 'flv'
1387
1388                 # Retrieve video webpage to extract further information
1389                 request = urllib2.Request(url)
1390                 try:
1391                         self.report_download_webpage(video_id)
1392                         webpage = urllib2.urlopen(request).read()
1393                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1394                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1395                         return
1396
1397                 # Extract URL, uploader, and title from webpage
1398                 self.report_extraction(video_id)
1399                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1400                 if mobj is None:
1401                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1402                         return
1403                 mediaURL = urllib.unquote(mobj.group(1))
1404
1405                 video_url = mediaURL
1406
1407                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1408                 if mobj is None:
1409                         self._downloader.trouble(u'ERROR: unable to extract title')
1410                         return
1411                 video_title = mobj.group(1).decode('utf-8')
1412                 video_title = sanitize_title(video_title)
1413                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1414
1415                 video_uploader = mobj.group(2).decode('utf-8')
1416
1417                 try:
1418                         # Process video information
1419                         self._downloader.process_info({
1420                                 'id':           video_id.decode('utf-8'),
1421                                 'url':          video_url.decode('utf-8'),
1422                                 'uploader':     video_uploader,
1423                                 'upload_date':  u'NA',
1424                                 'title':        video_title,
1425                                 'stitle':       simple_title,
1426                                 'ext':          video_extension.decode('utf-8'),
1427                                 'format':       u'NA',
1428                                 'player_url':   None,
1429                         })
1430                 except UnavailableVideoError:
1431                         self._downloader.trouble(u'\nERROR: unable to download video')
1432
1433
1434 class YahooIE(InfoExtractor):
1435         """Information extractor for video.yahoo.com."""
1436
1437         # _VALID_URL matches all Yahoo! Video URLs
1438         # _VPAGE_URL matches only the extractable '/watch/' URLs
1439         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1440         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1441
1442         def __init__(self, downloader=None):
1443                 InfoExtractor.__init__(self, downloader)
1444
1445         @staticmethod
1446         def suitable(url):
1447                 return (re.match(YahooIE._VALID_URL, url) is not None)
1448
1449         def report_download_webpage(self, video_id):
1450                 """Report webpage download."""
1451                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1452
1453         def report_extraction(self, video_id):
1454                 """Report information extraction."""
1455                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1456
1457         def _real_initialize(self):
1458                 return
1459
1460         def _real_extract(self, url, new_video=True):
1461                 # Extract ID from URL
1462                 mobj = re.match(self._VALID_URL, url)
1463                 if mobj is None:
1464                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1465                         return
1466
1467                 # At this point we have a new video
1468                 self._downloader.increment_downloads()
1469                 video_id = mobj.group(2)
1470                 video_extension = 'flv'
1471
1472                 # Rewrite valid but non-extractable URLs as
1473                 # extractable English language /watch/ URLs
1474                 if re.match(self._VPAGE_URL, url) is None:
1475                         request = urllib2.Request(url)
1476                         try:
1477                                 webpage = urllib2.urlopen(request).read()
1478                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1479                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1480                                 return
1481
1482                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1483                         if mobj is None:
1484                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1485                                 return
1486                         yahoo_id = mobj.group(1)
1487
1488                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1489                         if mobj is None:
1490                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1491                                 return
1492                         yahoo_vid = mobj.group(1)
1493
1494                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1495                         return self._real_extract(url, new_video=False)
1496
1497                 # Retrieve video webpage to extract further information
1498                 request = urllib2.Request(url)
1499                 try:
1500                         self.report_download_webpage(video_id)
1501                         webpage = urllib2.urlopen(request).read()
1502                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1503                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1504                         return
1505
1506                 # Extract uploader and title from webpage
1507                 self.report_extraction(video_id)
1508                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1509                 if mobj is None:
1510                         self._downloader.trouble(u'ERROR: unable to extract video title')
1511                         return
1512                 video_title = mobj.group(1).decode('utf-8')
1513                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1514
1515                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1516                 if mobj is None:
1517                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1518                         return
1519                 video_uploader = mobj.group(1).decode('utf-8')
1520
1521                 # Extract video thumbnail
1522                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1523                 if mobj is None:
1524                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1525                         return
1526                 video_thumbnail = mobj.group(1).decode('utf-8')
1527
1528                 # Extract video description
1529                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1530                 if mobj is None:
1531                         self._downloader.trouble(u'ERROR: unable to extract video description')
1532                         return
1533                 video_description = mobj.group(1).decode('utf-8')
1534                 if not video_description: video_description = 'No description available.'
1535
1536                 # Extract video height and width
1537                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1538                 if mobj is None:
1539                         self._downloader.trouble(u'ERROR: unable to extract video height')
1540                         return
1541                 yv_video_height = mobj.group(1)
1542
1543                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1544                 if mobj is None:
1545                         self._downloader.trouble(u'ERROR: unable to extract video width')
1546                         return
1547                 yv_video_width = mobj.group(1)
1548
1549                 # Retrieve video playlist to extract media URL
1550                 # I'm not completely sure what all these options are, but we
1551                 # seem to need most of them, otherwise the server sends a 401.
1552                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1553                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1554                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1555                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1556                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1557                 try:
1558                         self.report_download_webpage(video_id)
1559                         webpage = urllib2.urlopen(request).read()
1560                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1561                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1562                         return
1563
1564                 # Extract media URL from playlist XML
1565                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1566                 if mobj is None:
1567                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1568                         return
1569                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1570                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1571
1572                 try:
1573                         # Process video information
1574                         self._downloader.process_info({
1575                                 'id':           video_id.decode('utf-8'),
1576                                 'url':          video_url,
1577                                 'uploader':     video_uploader,
1578                                 'upload_date':  u'NA',
1579                                 'title':        video_title,
1580                                 'stitle':       simple_title,
1581                                 'ext':          video_extension.decode('utf-8'),
1582                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1583                                 'description':  video_description,
1584                                 'thumbnail':    video_thumbnail,
1585                                 'description':  video_description,
1586                                 'player_url':   None,
1587                         })
1588                 except UnavailableVideoError:
1589                         self._downloader.trouble(u'\nERROR: unable to download video')
1590
1591
1592 class GenericIE(InfoExtractor):
1593         """Generic last-resort information extractor."""
1594
1595         def __init__(self, downloader=None):
1596                 InfoExtractor.__init__(self, downloader)
1597
1598         @staticmethod
1599         def suitable(url):
1600                 return True
1601
1602         def report_download_webpage(self, video_id):
1603                 """Report webpage download."""
1604                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1605                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1606
1607         def report_extraction(self, video_id):
1608                 """Report information extraction."""
1609                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1610
1611         def _real_initialize(self):
1612                 return
1613
1614         def _real_extract(self, url):
1615                 # At this point we have a new video
1616                 self._downloader.increment_downloads()
1617
1618                 video_id = url.split('/')[-1]
1619                 request = urllib2.Request(url)
1620                 try:
1621                         self.report_download_webpage(video_id)
1622                         webpage = urllib2.urlopen(request).read()
1623                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1624                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1625                         return
1626                 except ValueError, err:
1627                         # since this is the last-resort InfoExtractor, if
1628                         # this error is thrown, it'll be thrown here
1629                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1630                         return
1631
1632                 self.report_extraction(video_id)
1633                 # Start with something easy: JW Player in SWFObject
1634                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1635                 if mobj is None:
1636                         # Broaden the search a little bit
1637                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1638                 if mobj is None:
1639                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1640                         return
1641
1642                 # It's possible that one of the regexes
1643                 # matched, but returned an empty group:
1644                 if mobj.group(1) is None:
1645                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1646                         return
1647
1648                 video_url = urllib.unquote(mobj.group(1))
1649                 video_id  = os.path.basename(video_url)
1650
1651                 # here's a fun little line of code for you:
1652                 video_extension = os.path.splitext(video_id)[1][1:]
1653                 video_id        = os.path.splitext(video_id)[0]
1654
1655                 # it's tempting to parse this further, but you would
1656                 # have to take into account all the variations like
1657                 #   Video Title - Site Name
1658                 #   Site Name | Video Title
1659                 #   Video Title - Tagline | Site Name
1660                 # and so on and so forth; it's just not practical
1661                 mobj = re.search(r'<title>(.*)</title>', webpage)
1662                 if mobj is None:
1663                         self._downloader.trouble(u'ERROR: unable to extract title')
1664                         return
1665                 video_title = mobj.group(1).decode('utf-8')
1666                 video_title = sanitize_title(video_title)
1667                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1668
1669                 # video uploader is domain name
1670                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1671                 if mobj is None:
1672                         self._downloader.trouble(u'ERROR: unable to extract title')
1673                         return
1674                 video_uploader = mobj.group(1).decode('utf-8')
1675
1676                 try:
1677                         # Process video information
1678                         self._downloader.process_info({
1679                                 'id':           video_id.decode('utf-8'),
1680                                 'url':          video_url.decode('utf-8'),
1681                                 'uploader':     video_uploader,
1682                                 'upload_date':  u'NA',
1683                                 'title':        video_title,
1684                                 'stitle':       simple_title,
1685                                 'ext':          video_extension.decode('utf-8'),
1686                                 'format':       u'NA',
1687                                 'player_url':   None,
1688                         })
1689                 except UnavailableVideoError, err:
1690                         self._downloader.trouble(u'\nERROR: unable to download video')
1691
1692
1693 class YoutubeSearchIE(InfoExtractor):
1694         """Information Extractor for YouTube search queries."""
1695         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1696         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1697         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1698         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1699         _youtube_ie = None
1700         _max_youtube_results = 1000
1701
1702         def __init__(self, youtube_ie, downloader=None):
1703                 InfoExtractor.__init__(self, downloader)
1704                 self._youtube_ie = youtube_ie
1705         
1706         @staticmethod
1707         def suitable(url):
1708                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1709
1710         def report_download_page(self, query, pagenum):
1711                 """Report attempt to download playlist page with given number."""
1712                 query = query.decode(preferredencoding())
1713                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1714
1715         def _real_initialize(self):
1716                 self._youtube_ie.initialize()
1717         
1718         def _real_extract(self, query):
1719                 mobj = re.match(self._VALID_QUERY, query)
1720                 if mobj is None:
1721                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1722                         return
1723
1724                 prefix, query = query.split(':')
1725                 prefix = prefix[8:]
1726                 query  = query.encode('utf-8')
1727                 if prefix == '':
1728                         self._download_n_results(query, 1)
1729                         return
1730                 elif prefix == 'all':
1731                         self._download_n_results(query, self._max_youtube_results)
1732                         return
1733                 else:
1734                         try:
1735                                 n = long(prefix)
1736                                 if n <= 0:
1737                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1738                                         return
1739                                 elif n > self._max_youtube_results:
1740                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
1741                                         n = self._max_youtube_results
1742                                 self._download_n_results(query, n)
1743                                 return
1744                         except ValueError: # parsing prefix as integer fails
1745                                 self._download_n_results(query, 1)
1746                                 return
1747
1748         def _download_n_results(self, query, n):
1749                 """Downloads a specified number of results for a query"""
1750
1751                 video_ids = []
1752                 already_seen = set()
1753                 pagenum = 1
1754
1755                 while True:
1756                         self.report_download_page(query, pagenum)
1757                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1758                         request = urllib2.Request(result_url, None, std_headers)
1759                         try:
1760                                 page = urllib2.urlopen(request).read()
1761                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1762                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1763                                 return
1764
1765                         # Extract video identifiers
1766                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1767                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1768                                 if video_id not in already_seen:
1769                                         video_ids.append(video_id)
1770                                         already_seen.add(video_id)
1771                                         if len(video_ids) == n:
1772                                                 # Specified n videos reached
1773                                                 for id in video_ids:
1774                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1775                                                 return
1776
1777                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1778                                 for id in video_ids:
1779                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1780                                 return
1781
1782                         pagenum = pagenum + 1
1783
1784 class GoogleSearchIE(InfoExtractor):
1785         """Information Extractor for Google Video search queries."""
1786         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1787         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1788         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1789         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1790         _google_ie = None
1791         _max_google_results = 1000
1792
        def __init__(self, google_ie, downloader=None):
                """Keep a reference to the GoogleIE used to extract each result."""
                InfoExtractor.__init__(self, downloader)
                self._google_ie = google_ie
1796         
        @staticmethod
        def suitable(url):
                """Return True if this looks like a "gvsearch" query."""
                return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1800
        def report_download_page(self, query, pagenum):
                """Report attempt to download playlist page with given number."""
                # The query arrives as a byte string; decode it before display.
                query = query.decode(preferredencoding())
                self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1805
1806         def _real_initialize(self):
1807                 self._google_ie.initialize()
1808         
1809         def _real_extract(self, query):
1810                 mobj = re.match(self._VALID_QUERY, query)
1811                 if mobj is None:
1812                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1813                         return
1814
1815                 prefix, query = query.split(':')
1816                 prefix = prefix[8:]
1817                 query  = query.encode('utf-8')
1818                 if prefix == '':
1819                         self._download_n_results(query, 1)
1820                         return
1821                 elif prefix == 'all':
1822                         self._download_n_results(query, self._max_google_results)
1823                         return
1824                 else:
1825                         try:
1826                                 n = long(prefix)
1827                                 if n <= 0:
1828                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1829                                         return
1830                                 elif n > self._max_google_results:
1831                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
1832                                         n = self._max_google_results
1833                                 self._download_n_results(query, n)
1834                                 return
1835                         except ValueError: # parsing prefix as integer fails
1836                                 self._download_n_results(query, 1)
1837                                 return
1838
1839         def _download_n_results(self, query, n):
1840                 """Downloads a specified number of results for a query"""
1841
1842                 video_ids = []
1843                 already_seen = set()
1844                 pagenum = 1
1845
1846                 while True:
1847                         self.report_download_page(query, pagenum)
1848                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1849                         request = urllib2.Request(result_url, None, std_headers)
1850                         try:
1851                                 page = urllib2.urlopen(request).read()
1852                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1853                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1854                                 return
1855
1856                         # Extract video identifiers
1857                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1858                                 video_id = mobj.group(1)
1859                                 if video_id not in already_seen:
1860                                         video_ids.append(video_id)
1861                                         already_seen.add(video_id)
1862                                         if len(video_ids) == n:
1863                                                 # Specified n videos reached
1864                                                 for id in video_ids:
1865                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1866                                                 return
1867
1868                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1869                                 for id in video_ids:
1870                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1871                                 return
1872
1873                         pagenum = pagenum + 1
1874
1875 class YahooSearchIE(InfoExtractor):
1876         """Information Extractor for Yahoo! Video search queries."""
1877         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1878         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1879         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1880         _MORE_PAGES_INDICATOR = r'\s*Next'
1881         _yahoo_ie = None
1882         _max_yahoo_results = 1000
1883
1884         def __init__(self, yahoo_ie, downloader=None):
1885                 InfoExtractor.__init__(self, downloader)
1886                 self._yahoo_ie = yahoo_ie
1887         
1888         @staticmethod
1889         def suitable(url):
1890                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1891
1892         def report_download_page(self, query, pagenum):
1893                 """Report attempt to download playlist page with given number."""
1894                 query = query.decode(preferredencoding())
1895                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1896
1897         def _real_initialize(self):
1898                 self._yahoo_ie.initialize()
1899         
1900         def _real_extract(self, query):
1901                 mobj = re.match(self._VALID_QUERY, query)
1902                 if mobj is None:
1903                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1904                         return
1905
1906                 prefix, query = query.split(':')
1907                 prefix = prefix[8:]
1908                 query  = query.encode('utf-8')
1909                 if prefix == '':
1910                         self._download_n_results(query, 1)
1911                         return
1912                 elif prefix == 'all':
1913                         self._download_n_results(query, self._max_yahoo_results)
1914                         return
1915                 else:
1916                         try:
1917                                 n = long(prefix)
1918                                 if n <= 0:
1919                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1920                                         return
1921                                 elif n > self._max_yahoo_results:
1922                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
1923                                         n = self._max_yahoo_results
1924                                 self._download_n_results(query, n)
1925                                 return
1926                         except ValueError: # parsing prefix as integer fails
1927                                 self._download_n_results(query, 1)
1928                                 return
1929
1930         def _download_n_results(self, query, n):
1931                 """Downloads a specified number of results for a query"""
1932
1933                 video_ids = []
1934                 already_seen = set()
1935                 pagenum = 1
1936
1937                 while True:
1938                         self.report_download_page(query, pagenum)
1939                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1940                         request = urllib2.Request(result_url, None, std_headers)
1941                         try:
1942                                 page = urllib2.urlopen(request).read()
1943                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1944                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1945                                 return
1946
1947                         # Extract video identifiers
1948                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1949                                 video_id = mobj.group(1)
1950                                 if video_id not in already_seen:
1951                                         video_ids.append(video_id)
1952                                         already_seen.add(video_id)
1953                                         if len(video_ids) == n:
1954                                                 # Specified n videos reached
1955                                                 for id in video_ids:
1956                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1957                                                 return
1958
1959                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1960                                 for id in video_ids:
1961                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1962                                 return
1963
1964                         pagenum = pagenum + 1
1965
1966 class YoutubePlaylistIE(InfoExtractor):
1967         """Information Extractor for YouTube playlists."""
1968
1969         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1970         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1971         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1972         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1973         _youtube_ie = None
1974
1975         def __init__(self, youtube_ie, downloader=None):
1976                 InfoExtractor.__init__(self, downloader)
1977                 self._youtube_ie = youtube_ie
1978         
1979         @staticmethod
1980         def suitable(url):
1981                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1982
1983         def report_download_page(self, playlist_id, pagenum):
1984                 """Report attempt to download playlist page with given number."""
1985                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1986
1987         def _real_initialize(self):
1988                 self._youtube_ie.initialize()
1989         
1990         def _real_extract(self, url):
1991                 # Extract playlist id
1992                 mobj = re.match(self._VALID_URL, url)
1993                 if mobj is None:
1994                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1995                         return
1996
1997                 # Download playlist pages
1998                 playlist_id = mobj.group(1)
1999                 video_ids = []
2000                 pagenum = 1
2001
2002                 while True:
2003                         self.report_download_page(playlist_id, pagenum)
2004                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
2005                         try:
2006                                 page = urllib2.urlopen(request).read()
2007                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2008                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2009                                 return
2010
2011                         # Extract video identifiers
2012                         ids_in_page = []
2013                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2014                                 if mobj.group(1) not in ids_in_page:
2015                                         ids_in_page.append(mobj.group(1))
2016                         video_ids.extend(ids_in_page)
2017
2018                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2019                                 break
2020                         pagenum = pagenum + 1
2021
2022                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2023                 playlistend = self._downloader.params.get('playlistend', -1)
2024                 video_ids = video_ids[playliststart:playlistend]
2025
2026                 for id in video_ids:
2027                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2028                 return
2029
2030 class YoutubeUserIE(InfoExtractor):
2031         """Information Extractor for YouTube users."""
2032
2033         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
2034         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2035         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
2036         _youtube_ie = None
2037
2038         def __init__(self, youtube_ie, downloader=None):
2039                 InfoExtractor.__init__(self, downloader)
2040                 self._youtube_ie = youtube_ie
2041         
2042         @staticmethod
2043         def suitable(url):
2044                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2045
2046         def report_download_page(self, username):
2047                 """Report attempt to download user page."""
2048                 self._downloader.to_screen(u'[youtube] user %s: Downloading page ' % (username))
2049
2050         def _real_initialize(self):
2051                 self._youtube_ie.initialize()
2052         
2053         def _real_extract(self, url):
2054                 # Extract username
2055                 mobj = re.match(self._VALID_URL, url)
2056                 if mobj is None:
2057                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2058                         return
2059
2060                 # Download user page
2061                 username = mobj.group(1)
2062                 video_ids = []
2063                 pagenum = 1
2064
2065                 self.report_download_page(username)
2066                 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
2067                 try:
2068                         page = urllib2.urlopen(request).read()
2069                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2070                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2071                         return
2072
2073                 # Extract video identifiers
2074                 ids_in_page = []
2075
2076                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2077                         if mobj.group(1) not in ids_in_page:
2078                                 ids_in_page.append(mobj.group(1))
2079                 video_ids.extend(ids_in_page)
2080
2081                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2082                 playlistend = self._downloader.params.get('playlistend', -1)
2083                 video_ids = video_ids[playliststart:playlistend]
2084
2085                 for id in video_ids:
2086                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2087                 return
2088
2089 class DepositFilesIE(InfoExtractor):
2090         """Information extractor for depositfiles.com"""
2091
2092         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2093
2094         def __init__(self, downloader=None):
2095                 InfoExtractor.__init__(self, downloader)
2096
2097         @staticmethod
2098         def suitable(url):
2099                 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2100
2101         def report_download_webpage(self, file_id):
2102                 """Report webpage download."""
2103                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2104
2105         def report_extraction(self, file_id):
2106                 """Report information extraction."""
2107                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2108
2109         def _real_initialize(self):
2110                 return
2111
2112         def _real_extract(self, url):
2113                 # At this point we have a new file
2114                 self._downloader.increment_downloads()
2115
2116                 file_id = url.split('/')[-1]
2117                 # Rebuild url in english locale
2118                 url = 'http://depositfiles.com/en/files/' + file_id
2119
2120                 # Retrieve file webpage with 'Free download' button pressed
2121                 free_download_indication = { 'gateway_result' : '1' }
2122                 request = urllib2.Request(url, urllib.urlencode(free_download_indication), std_headers)
2123                 try:
2124                         self.report_download_webpage(file_id)
2125                         webpage = urllib2.urlopen(request).read()
2126                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2127                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2128                         return
2129
2130                 # Search for the real file URL
2131                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2132                 if (mobj is None) or (mobj.group(1) is None):
2133                         # Try to figure out reason of the error.
2134                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2135                         if (mobj is not None) and (mobj.group(1) is not None):
2136                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2137                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2138                         else:
2139                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2140                         return
2141
2142                 file_url = mobj.group(1)
2143                 file_extension = os.path.splitext(file_url)[1][1:]
2144
2145                 # Search for file title
2146                 mobj = re.search(r'<b title="(.*?)">', webpage)
2147                 if mobj is None:
2148                         self._downloader.trouble(u'ERROR: unable to extract title')
2149                         return
2150                 file_title = mobj.group(1).decode('utf-8')
2151
2152                 try:
2153                         # Process file information
2154                         self._downloader.process_info({
2155                                 'id':           file_id.decode('utf-8'),
2156                                 'url':          file_url.decode('utf-8'),
2157                                 'uploader':     u'NA',
2158                                 'upload_date':  u'NA',
2159                                 'title':        file_title,
2160                                 'stitle':       file_title,
2161                                 'ext':          file_extension.decode('utf-8'),
2162                                 'format':       u'NA',
2163                                 'player_url':   None,
2164                         })
2165                 except UnavailableVideoError, err:
2166                         self._downloader.trouble(u'ERROR: unable to download file')
2167
class PostProcessor(object):
	"""Base class for postprocessing steps.

	Instances are registered on a downloader via add_post_processor().
	After each successful download the downloader walks its chain of
	PostProcessors, feeding the first one an information dictionary and
	each subsequent one the value returned by its predecessor.

	A return value of None (or reaching the last processor) terminates
	the chain.

	Like InfoExtractor objects, PostProcessors take part in a "mutual
	registration" handshake with their downloader.
	"""

	_downloader = None  # FileDownloader this PP is attached to

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor.

		'information' is an InfoExtractor-style dictionary extended with
		a "filepath" key naming the downloaded file.

		Returning None stops the postprocessing chain; returning a
		(possibly modified) information dictionary passes it on to the
		next processor.  Implementations may also raise
		PostProcessingError, which the calling downloader handles.
		"""
		# The base class is a no-op: pass the information along unchanged.
		return information
2213         
2214 ### MAIN PROGRAM ###
2215 if __name__ == '__main__':
2216         try:
2217                 # Modules needed only when running the main program
2218                 import getpass
2219                 import optparse
2220
2221                 # Function to update the program file with the latest version from bitbucket.org
2222                 def update_self(downloader, filename):
2223                         # Note: downloader only used for options
2224                         if not os.access (filename, os.W_OK):
2225                                 sys.exit('ERROR: no write permissions on %s' % filename)
2226
2227                         downloader.to_screen('Updating to latest stable version...')
2228                         latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2229                         latest_version = urllib.urlopen(latest_url).read().strip()
2230                         prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2231                         newcontent = urllib.urlopen(prog_url).read()
2232                         stream = open(filename, 'w')
2233                         stream.write(newcontent)
2234                         stream.close()
2235                         downloader.to_screen('Updated to version %s' % latest_version)
2236
2237                 # Parse command line
2238                 parser = optparse.OptionParser(
2239                         usage='Usage: %prog [options] url...',
2240                         version='2010.12.09',
2241                         conflict_handler='resolve',
2242                 )
2243
2244                 parser.add_option('-h', '--help',
2245                                 action='help', help='print this help text and exit')
2246                 parser.add_option('-v', '--version',
2247                                 action='version', help='print program version and exit')
2248                 parser.add_option('-U', '--update',
2249                                 action='store_true', dest='update_self', help='update this program to latest stable version')
2250                 parser.add_option('-i', '--ignore-errors',
2251                                 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2252                 parser.add_option('-r', '--rate-limit',
2253                                 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2254                 parser.add_option('-R', '--retries',
2255                                 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2256                 parser.add_option('--playlist-start',
2257                                 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2258                 parser.add_option('--playlist-end',
2259                                 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
2260                 parser.add_option('--dump-user-agent',
2261                                 action='store_true', dest='dump_user_agent', help='display the current browser identification', default=False)
2262
2263                 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2264                 authentication.add_option('-u', '--username',
2265                                 dest='username', metavar='USERNAME', help='account username')
2266                 authentication.add_option('-p', '--password',
2267                                 dest='password', metavar='PASSWORD', help='account password')
2268                 authentication.add_option('-n', '--netrc',
2269                                 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2270                 parser.add_option_group(authentication)
2271
2272                 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2273                 video_format.add_option('-f', '--format',
2274                                 action='store', dest='format', metavar='FORMAT', help='video format code')
2275                 video_format.add_option('--all-formats',
2276                                 action='store_const', dest='format', help='download all available video formats', const='-1')
2277                 video_format.add_option('--max-quality',
2278                                 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2279                 parser.add_option_group(video_format)
2280
2281                 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2282                 verbosity.add_option('-q', '--quiet',
2283                                 action='store_true', dest='quiet', help='activates quiet mode', default=False)
2284                 verbosity.add_option('-s', '--simulate',
2285                                 action='store_true', dest='simulate', help='do not download video', default=False)
2286                 verbosity.add_option('-g', '--get-url',
2287                                 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2288                 verbosity.add_option('-e', '--get-title',
2289                                 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
2290                 verbosity.add_option('--get-thumbnail',
2291                                 action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
2292                 verbosity.add_option('--get-description',
2293                                 action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
2294                 verbosity.add_option('--no-progress',
2295                                 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
2296                 parser.add_option_group(verbosity)
2297
2298                 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
2299                 filesystem.add_option('-t', '--title',
2300                                 action='store_true', dest='usetitle', help='use title in file name', default=False)
2301                 filesystem.add_option('-l', '--literal',
2302                                 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
2303                 filesystem.add_option('-A', '--auto-number',
2304                                 action='store_true', dest='autonumber', help='number downloaded files starting from 00000', default=False)
2305                 filesystem.add_option('-o', '--output',
2306                                 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
2307                 filesystem.add_option('-a', '--batch-file',
2308                                 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
2309                 filesystem.add_option('-w', '--no-overwrites',
2310                                 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
2311                 filesystem.add_option('-c', '--continue',
2312                                 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
2313                 filesystem.add_option('--cookies',
2314                                 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
2315                 parser.add_option_group(filesystem)
2316
2317                 (opts, args) = parser.parse_args()
2318
2319                 # Open appropriate CookieJar
2320                 if opts.cookiefile is None:
2321                         jar = cookielib.CookieJar()
2322                 else:
2323                         try:
2324                                 jar = cookielib.MozillaCookieJar(opts.cookiefile)
2325                                 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
2326                                         jar.load()
2327                         except (IOError, OSError), err:
2328                                 sys.exit(u'ERROR: unable to open cookie file')
2329
2330                 # Dump user agent
2331                 if opts.dump_user_agent:
2332                         print std_headers['User-Agent']
2333                         sys.exit(0)
2334
2335                 # General configuration
2336                 cookie_processor = urllib2.HTTPCookieProcessor(jar)
2337                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
2338                 urllib2.install_opener(urllib2.build_opener(cookie_processor))
2339                 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
2340
2341                 # Batch file verification
2342                 batchurls = []
2343                 if opts.batchfile is not None:
2344                         try:
2345                                 if opts.batchfile == '-':
2346                                         batchfd = sys.stdin
2347                                 else:
2348                                         batchfd = open(opts.batchfile, 'r')
2349                                 batchurls = batchfd.readlines()
2350                                 batchurls = [x.strip() for x in batchurls]
2351                                 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
2352                         except IOError:
2353                                 sys.exit(u'ERROR: batch file could not be read')
2354                 all_urls = batchurls + args
2355
		# Conflicting, missing and erroneous options
		# Each parser.error() call prints the message and exits the process.
		if opts.usenetrc and (opts.username is not None or opts.password is not None):
			parser.error(u'using .netrc conflicts with giving username/password')
		if opts.password is not None and opts.username is None:
			parser.error(u'account username missing')
		if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
			parser.error(u'using output template conflicts with using title, literal title or auto number')
		if opts.usetitle and opts.useliteral:
			parser.error(u'using title conflicts with using literal title')
		if opts.username is not None and opts.password is None:
			# Prompt interactively so the password is not typed on the
			# command line.
			# NOTE(review): getpass does not appear among this file's visible
			# imports -- confirm it is imported elsewhere, otherwise this
			# line raises NameError when a username is given without -p.
			opts.password = getpass.getpass(u'Type account password and press return:')
		if opts.ratelimit is not None:
			# parse_bytes() returns None when the value cannot be parsed.
			numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
			if numeric_limit is None:
				parser.error(u'invalid rate limit specified')
			opts.ratelimit = numeric_limit
		if opts.retries is not None:
			try:
				opts.retries = long(opts.retries)
			except (TypeError, ValueError), err:
				parser.error(u'invalid retry count specified')
		try:
			# Playlist start must be a positive (1-based) integer.
			opts.playliststart = long(opts.playliststart)
			if opts.playliststart <= 0:
				raise ValueError
		except (TypeError, ValueError), err:
			parser.error(u'invalid playlist start number specified')
		try:
			# Playlist end is either -1 (no upper bound) or a positive
			# integer not smaller than the playlist start.
			opts.playlistend = long(opts.playlistend)
			if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
				raise ValueError
		except (TypeError, ValueError), err:
			parser.error(u'invalid playlist end number specified')
2389
		# Information extractors
		# The plain YoutubeIE instance is shared: the playlist, user and
		# search extractors receive it and delegate the individual video
		# downloads to it.  GoogleSearchIE and YahooSearchIE wrap their
		# respective site extractors the same way.
		youtube_ie = YoutubeIE()
		metacafe_ie = MetacafeIE(youtube_ie)
		dailymotion_ie = DailymotionIE()
		youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
		youtube_user_ie = YoutubeUserIE(youtube_ie)
		youtube_search_ie = YoutubeSearchIE(youtube_ie)
		google_ie = GoogleIE()
		google_search_ie = GoogleSearchIE(google_ie)
		photobucket_ie = PhotobucketIE()
		yahoo_ie = YahooIE()
		yahoo_search_ie = YahooSearchIE(yahoo_ie)
		deposit_files_ie = DepositFilesIE()
		generic_ie = GenericIE()
2404
2405                 # File downloader
2406                 fd = FileDownloader({
2407                         'usenetrc': opts.usenetrc,
2408                         'username': opts.username,
2409                         'password': opts.password,
2410                         'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2411                         'forceurl': opts.geturl,
2412                         'forcetitle': opts.gettitle,
2413                         'forcethumbnail': opts.getthumbnail,
2414                         'forcedescription': opts.getdescription,
2415                         'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2416                         'format': opts.format,
2417                         'format_limit': opts.format_limit,
2418                         'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
2419                                 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
2420                                 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
2421                                 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
2422                                 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
2423                                 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
2424                                 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
2425                                 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
2426                                 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
2427                                 or u'%(id)s.%(ext)s'),
2428                         'ignoreerrors': opts.ignoreerrors,
2429                         'ratelimit': opts.ratelimit,
2430                         'nooverwrites': opts.nooverwrites,
2431                         'retries': opts.retries,
2432                         'continuedl': opts.continue_dl,
2433                         'noprogress': opts.noprogress,
2434                         'playliststart': opts.playliststart,
2435                         'playlistend': opts.playlistend,
2436                         'logtostderr': opts.outtmpl == '-',
2437                         })
2438                 fd.add_info_extractor(youtube_search_ie)
2439                 fd.add_info_extractor(youtube_pl_ie)
2440                 fd.add_info_extractor(youtube_user_ie)
2441                 fd.add_info_extractor(metacafe_ie)
2442                 fd.add_info_extractor(dailymotion_ie)
2443                 fd.add_info_extractor(youtube_ie)
2444                 fd.add_info_extractor(google_ie)
2445                 fd.add_info_extractor(google_search_ie)
2446                 fd.add_info_extractor(photobucket_ie)
2447                 fd.add_info_extractor(yahoo_ie)
2448                 fd.add_info_extractor(yahoo_search_ie)
2449                 fd.add_info_extractor(deposit_files_ie)
2450
2451                 # This must come last since it's the
2452                 # fallback if none of the others work
2453                 fd.add_info_extractor(generic_ie)
2454
		# Update version
		if opts.update_self:
			update_self(fd, sys.argv[0])

		# Maybe do nothing
		if len(all_urls) < 1:
			if not opts.update_self:
				parser.error(u'you must provide at least one URL')
			else:
				# A bare --update-self run is legitimate without URLs.
				sys.exit()
		retcode = fd.download(all_urls)

		# Dump cookie jar if requested
		if opts.cookiefile is not None:
			try:
				jar.save()
			except (IOError, OSError), err:
				sys.exit(u'ERROR: unable to save cookie jar')

		# Exit with the downloader's aggregate return code.
		sys.exit(retcode)
2475
	except DownloadError:
		# The downloader already printed its own error message; just
		# signal failure to the shell.
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		# Ctrl-C: exit with a short message instead of a traceback.
		sys.exit(u'\nERROR: Interrupted by user')