Bump version number
[youtube-dl.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # Author: Vasyl' Vavrychuk
7 # License: Public domain code
8 import cookielib
9 import datetime
10 import htmlentitydefs
11 import httplib
12 import locale
13 import math
14 import netrc
15 import os
16 import os.path
17 import re
18 import socket
19 import string
20 import subprocess
21 import sys
22 import time
23 import urllib
24 import urllib2
25
# parse_qs was moved from the cgi module to the urlparse module recently.
# Prefer the new location and fall back to cgi on older Python versions.
try:
        from urlparse import parse_qs
except ImportError:
        from cgi import parse_qs
31
# HTTP headers sent with every request, imitating a desktop Firefox browser
# (some sites serve different or degraded content to unknown user agents)
std_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.12) Gecko/20101028 Firefox/3.6.12',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-us,en;q=0.5',
}
38
39 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
40
def preferredencoding():
        """Get preferred encoding.

        Returns the best encoding scheme for the system, based on
        locale.getpreferredencoding() and some further tweaks.
        """
        # The original implementation wrapped this logic in a single-use
        # generator and called .next() on it, which added nothing; a direct
        # try/except is equivalent. The bare 'except:' is also narrowed to
        # Exception so KeyboardInterrupt/SystemExit are not swallowed.
        try:
                pref = locale.getpreferredencoding()
                # Verify the reported encoding actually exists and can encode
                # text; some platforms report bogus values
                u'TEST'.encode(pref)
        except Exception:
                pref = 'UTF-8'
        return pref
56
57 def htmlentity_transform(matchobj):
58         """Transforms an HTML entity to a Unicode character.
59         
60         This function receives a match object and is intended to be used with
61         the re.sub() function.
62         """
63         entity = matchobj.group(1)
64
65         # Known non-numeric HTML entity
66         if entity in htmlentitydefs.name2codepoint:
67                 return unichr(htmlentitydefs.name2codepoint[entity])
68
69         # Unicode character
70         mobj = re.match(ur'(?u)#(x?\d+)', entity)
71         if mobj is not None:
72                 numstr = mobj.group(1)
73                 if numstr.startswith(u'x'):
74                         base = 16
75                         numstr = u'0%s' % numstr
76                 else:
77                         base = 10
78                 return unichr(long(numstr, base))
79
80         # Unknown entity in name, return its literal representation
81         return (u'&%s;' % entity)
82
83 def sanitize_title(utitle):
84         """Sanitizes a video title so it could be used as part of a filename."""
85         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
86         return utitle.replace(unicode(os.sep), u'%')
87
def sanitize_open(filename, open_mode):
        """Try to open the given filename, and slightly tweak it if this fails.

        Attempts to open the given filename. If this fails, it tries to change
        the filename slightly, step by step, until it's either able to open it
        or it fails and raises a final exception, like the standard open()
        function.

        It returns the tuple (stream, definitive_file_name).
        """
        try:
                if filename == u'-':
                        # '-' means standard output; on Windows the stream must
                        # be switched to binary mode first to avoid newline
                        # translation corrupting the video data
                        if sys.platform == 'win32':
                                import msvcrt
                                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
                        return (sys.stdout, filename)
                stream = open(filename, open_mode)
                return (stream, filename)
        except (IOError, OSError), err:
                # In case of error, try to remove win32 forbidden chars
                filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

                # An exception here should be caught in the caller
                stream = open(filename, open_mode)
                return (stream, filename)
113
class DownloadError(Exception):
        """Download Error exception.

        This exception may be thrown by FileDownloader objects if they are not
        configured to continue on errors. They will contain the appropriate
        error message.

        Raised by FileDownloader.trouble() unless the 'ignoreerrors' option
        is set.
        """
        pass
122
class SameFileError(Exception):
        """Same File exception.

        This exception will be thrown by FileDownloader objects if they detect
        multiple files would have to be downloaded to the same file on disk.

        Raised by FileDownloader.download() when several URLs are given with
        a fixed (placeholder-free) output template.
        """
        pass
130
class PostProcessingError(Exception):
        """Post Processing exception.

        This exception may be raised by PostProcessor's .run() method to
        indicate an error in the postprocessing task. It is caught by
        FileDownloader.process_info() and reported via trouble().
        """
        pass
138
class UnavailableVideoError(Exception):
        """Unavailable Format exception.

        This exception will be thrown when a video is requested
        in a format that is not available for that video.
        """
        pass
146
class ContentTooShortError(Exception):
        """Content Too Short exception.

        This exception may be raised by FileDownloader objects when a file they
        download is too small for what the server announced first, indicating
        the connection was probably interrupted.

        Raised at the end of FileDownloader._do_download() when the byte count
        does not match the Content-length header.
        """
        # Both in bytes
        downloaded = None       # bytes actually received
        expected = None         # bytes announced by the server

        def __init__(self, downloaded, expected):
                self.downloaded = downloaded
                self.expected = expected
161
class FileDownloader(object):
        """File Downloader class.

        File downloader objects are the ones responsible of downloading the
        actual video file and writing it to disk if the user has requested
        it, among some other tasks. In most cases there should be one per
        program. As, given a video URL, the downloader doesn't know how to
        extract all the needed information, task that InfoExtractors do, it
        has to pass the URL to one of them.

        For this, file downloader objects have a method that allows
        InfoExtractors to be registered in a given order. When it is passed
        a URL, the file downloader handles it to the first InfoExtractor it
        finds that reports being able to handle it. The InfoExtractor extracts
        all the information about the video or videos the URL refers to, and
        asks the FileDownloader to process the video information, possibly
        downloading the video.

        File downloaders accept a lot of parameters. In order not to saturate
        the object constructor with arguments, it receives a dictionary of
        options instead. These options are available through the params
        attribute for the InfoExtractors to use. The FileDownloader also
        registers itself as the downloader in charge for the InfoExtractors
        that are added to it, so this is a "mutual registration".

        Available options:

        username:         Username for authentication purposes.
        password:         Password for authentication purposes.
        usenetrc:         Use netrc for authentication instead.
        quiet:            Do not print messages to stdout.
        forceurl:         Force printing final URL.
        forcetitle:       Force printing title.
        forcethumbnail:   Force printing thumbnail URL.
        forcedescription: Force printing description.
        simulate:         Do not download the video files.
        format:           Video format code.
        format_limit:     Highest quality format to try.
        outtmpl:          Template for output names.
        ignoreerrors:     Do not stop on download errors.
        ratelimit:        Download speed limit, in bytes/sec.
        nooverwrites:     Prevent overwriting files.
        retries:          Number of times to retry for HTTP error 5xx
        continuedl:       Try to continue downloads if possible.
        noprogress:       Do not print the progress bar.
        playliststart:    Playlist item to start at.
        playlistend:      Playlist item to end at.
        logtostderr:      Log messages to stderr instead of stdout.
        """

        params = None                   # Option dictionary (see class docstring)
        _ies = []                       # Registered InfoExtractors, tried in order
        _pps = []                       # Registered PostProcessors, run as a chain
        _download_retcode = None        # Return code for download(): 0 ok, 1 after trouble()
        _num_downloads = None           # Ordinal of the current download ('autonumber' field)
        _screen_file = None             # Stream for normal messages (stdout or stderr)

        def __init__(self, params):
                """Create a FileDownloader object with the given options."""
                self._ies = []
                self._pps = []
                self._download_retcode = 0
                self._num_downloads = 0
                # A true 'logtostderr' selects the second list element (stderr)
                self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
                self.params = params

        @staticmethod
        def pmkdir(filename):
                """Create directory components in filename. Similar to Unix "mkdir -p"."""
                components = filename.split(os.sep)
                # Build the list of partial paths, e.g. 'a', 'a/b', 'a/b/c'
                # (the last component is the file itself and is excluded)
                aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
                aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
                for dir in aggregate:
                        if not os.path.exists(dir):
                                os.mkdir(dir)

        @staticmethod
        def temp_name(filename):
                """Returns a temporary filename for the given filename."""
                # Do not use a '.part' name when writing to stdout or to an
                # existing non-regular file (e.g. a named pipe or device node)
                if filename == u'-' or (os.path.exists(filename) and not os.path.isfile(filename)):
                        return filename
                return filename + u'.part'

        @staticmethod
        def format_bytes(bytes):
                """Format a byte count as a human-readable string, e.g. '1.00k'."""
                if bytes is None:
                        return 'N/A'
                if type(bytes) is str:
                        bytes = float(bytes)
                if bytes == 0.0:
                        exponent = 0
                else:
                        exponent = long(math.log(bytes, 1024.0))
                suffix = 'bkMGTPEZY'[exponent]
                converted = float(bytes) / float(1024**exponent)
                return '%.2f%s' % (converted, suffix)

        @staticmethod
        def calc_percent(byte_counter, data_len):
                """Return download progress as a fixed-width percent string."""
                if data_len is None:
                        return '---.-%'
                return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

        @staticmethod
        def calc_eta(start, now, total, current):
                """Estimate the remaining download time as 'MM:SS' ('--:--' if unknown)."""
                if total is None:
                        return '--:--'
                dif = now - start
                if current == 0 or dif < 0.001: # One millisecond
                        return '--:--'
                rate = float(current) / dif
                eta = long((float(total) - float(current)) / rate)
                (eta_mins, eta_secs) = divmod(eta, 60)
                if eta_mins > 99:
                        return '--:--'
                return '%02d:%02d' % (eta_mins, eta_secs)

        @staticmethod
        def calc_speed(start, now, bytes):
                """Return the average download speed so far, right-aligned, e.g. '  1.00k/s'."""
                dif = now - start
                if bytes == 0 or dif < 0.001: # One millisecond
                        return '%10s' % '---b/s'
                return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

        @staticmethod
        def best_block_size(elapsed_time, bytes):
                """Pick the next read block size, adapting to the measured rate.

                The result is clamped between half and double the previous
                block size, and never exceeds 4 MB.
                """
                new_min = max(bytes / 2.0, 1.0)
                new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
                if elapsed_time < 0.001:
                        return long(new_max)
                rate = bytes / elapsed_time
                if rate > new_max:
                        return long(new_max)
                if rate < new_min:
                        return long(new_min)
                return long(rate)

        @staticmethod
        def parse_bytes(bytestr):
                """Parse a string indicating a byte quantity into a long integer."""
                matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
                if matchobj is None:
                        return None
                number = float(matchobj.group(1))
                # An absent suffix gives the empty string, whose index is 0,
                # i.e. a multiplier of 1024**0 == 1
                multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
                return long(round(number * multiplier))

        def add_info_extractor(self, ie):
                """Add an InfoExtractor object to the end of the list."""
                self._ies.append(ie)
                ie.set_downloader(self)

        def add_post_processor(self, pp):
                """Add a PostProcessor object to the end of the chain."""
                self._pps.append(pp)
                pp.set_downloader(self)

        def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
                """Print message to stdout if not in quiet mode."""
                try:
                        if not self.params.get('quiet', False):
                                terminator = [u'\n', u''][skip_eol]
                                print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
                        self._screen_file.flush()
                except (UnicodeEncodeError), err:
                        if not ignore_encoding_errors:
                                raise

        def to_stderr(self, message):
                """Print message to stderr."""
                print >>sys.stderr, message.encode(preferredencoding())

        def fixed_template(self):
                """Checks if the output template is fixed.

                A fixed template contains no '%(field)s' placeholders, so every
                download would be written to the same file.
                """
                return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

        def trouble(self, message=None):
                """Determine action to take when a download problem appears.

                Depending on if the downloader has been configured to ignore
                download errors or not, this method may throw an exception or
                not when errors are found, after printing the message.
                """
                if message is not None:
                        self.to_stderr(message)
                if not self.params.get('ignoreerrors', False):
                        raise DownloadError(message)
                self._download_retcode = 1

        def slow_down(self, start_time, byte_counter):
                """Sleep if the download speed is over the rate limit."""
                rate_limit = self.params.get('ratelimit', None)
                if rate_limit is None or byte_counter == 0:
                        return
                now = time.time()
                elapsed = now - start_time
                if elapsed <= 0.0:
                        return
                speed = float(byte_counter) / elapsed
                if speed > rate_limit:
                        # Sleep just long enough for the running average speed
                        # to fall back to the configured limit
                        time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

        def try_rename(self, old_filename, new_filename):
                """Rename the temporary file to its final name, reporting failures via trouble()."""
                try:
                        if old_filename == new_filename:
                                return
                        os.rename(old_filename, new_filename)
                except (IOError, OSError), err:
                        self.trouble(u'ERROR: unable to rename file')

        def report_destination(self, filename):
                """Report destination filename."""
                self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)

        def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
                """Report download progress."""
                if self.params.get('noprogress', False):
                        return
                # '\r' with skip_eol rewrites the same screen line each time
                self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
                                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

        def report_resuming_byte(self, resume_len):
                """Report attempt to resume at given byte."""
                self.to_screen(u'[download] Resuming download at byte %s' % resume_len)

        def report_retry(self, count, retries):
                """Report retry in case of HTTP error 5xx"""
                self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))

        def report_file_already_downloaded(self, file_name):
                """Report file has already been fully downloaded."""
                try:
                        self.to_screen(u'[download] %s has already been downloaded' % file_name)
                except (UnicodeEncodeError), err:
                        # Fall back to a message without the (unencodable) name
                        self.to_screen(u'[download] The file has already been downloaded')

        def report_unable_to_resume(self):
                """Report it was impossible to resume download."""
                self.to_screen(u'[download] Unable to resume')

        def report_finish(self):
                """Report download finished."""
                if self.params.get('noprogress', False):
                        self.to_screen(u'[download] Download completed')
                else:
                        # Terminate the line left open by the progress messages
                        self.to_screen(u'')

        def increment_downloads(self):
                """Increment the ordinal that assigns a number to each file."""
                self._num_downloads += 1

        def process_info(self, info_dict):
                """Process a single dictionary returned by an InfoExtractor."""
                # Do nothing else if in simulate mode
                if self.params.get('simulate', False):
                        # Forced printings
                        if self.params.get('forcetitle', False):
                                print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
                        if self.params.get('forceurl', False):
                                print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
                        if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
                                print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
                        if self.params.get('forcedescription', False) and 'description' in info_dict:
                                print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')

                        return

                try:
                        # Extra fields made available to the output template
                        template_dict = dict(info_dict)
                        template_dict['epoch'] = unicode(long(time.time()))
                        template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
                        filename = self.params['outtmpl'] % template_dict
                except (ValueError, KeyError), err:
                        self.trouble(u'ERROR: invalid system charset or erroneous output template')
                        return
                if self.params.get('nooverwrites', False) and os.path.exists(filename):
                        self.to_stderr(u'WARNING: file exists and will be skipped')
                        return

                try:
                        self.pmkdir(filename)
                except (OSError, IOError), err:
                        self.trouble(u'ERROR: unable to create directories: %s' % str(err))
                        return

                try:
                        success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
                except (OSError, IOError), err:
                        raise UnavailableVideoError
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self.trouble(u'ERROR: unable to download video data: %s' % str(err))
                        return
                except (ContentTooShortError, ), err:
                        self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
                        return

                if success:
                        try:
                                self.post_process(filename, info_dict)
                        except (PostProcessingError), err:
                                self.trouble(u'ERROR: postprocessing: %s' % str(err))
                                return

        def download(self, url_list):
                """Download a given list of URLs."""
                # A fixed template would make every URL overwrite the same file
                if len(url_list) > 1 and self.fixed_template():
                        raise SameFileError(self.params['outtmpl'])

                for url in url_list:
                        suitable_found = False
                        for ie in self._ies:
                                # Go to next InfoExtractor if not suitable
                                if not ie.suitable(url):
                                        continue

                                # Suitable InfoExtractor found
                                suitable_found = True

                                # Extract information from URL and process it
                                ie.extract(url)

                                # Suitable InfoExtractor had been found; go to next URL
                                break

                        if not suitable_found:
                                self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

                return self._download_retcode

        def post_process(self, filename, ie_info):
                """Run the postprocessing chain on the given file."""
                info = dict(ie_info)
                info['filepath'] = filename
                for pp in self._pps:
                        info = pp.run(info)
                        # A postprocessor returning None stops the chain
                        if info is None:
                                break

        def _download_with_rtmpdump(self, filename, url, player_url):
                """Download an RTMP stream by spawning the external rtmpdump tool."""
                self.report_destination(filename)
                tmpfilename = self.temp_name(filename)

                # Check for rtmpdump first
                try:
                        subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
                except (OSError, IOError):
                        self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
                        return False

                # Download using rtmpdump. rtmpdump returns exit code 2 when
                # the connection was interrumpted and resuming appears to be
                # possible. This is part of rtmpdump's normal usage, AFAIK.
                basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
                retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
                while retval == 2 or retval == 1:
                        prevsize = os.path.getsize(tmpfilename)
                        self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
                        time.sleep(5.0) # This seems to be needed
                        retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
                        cursize = os.path.getsize(tmpfilename)
                        # No progress and exit code 1: resuming will not help
                        if prevsize == cursize and retval == 1:
                                break
                if retval == 0:
                        self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
                        self.try_rename(tmpfilename, filename)
                        return True
                else:
                        self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
                        return False

        def _do_download(self, filename, url, player_url):
                """Download url to filename over HTTP, or delegate to rtmpdump.

                Returns True on success, False after a reported failure; may
                also raise ContentTooShortError or re-raise network errors.
                """
                # Check file already present
                if self.params.get('continuedl', False) and os.path.isfile(filename):
                        self.report_file_already_downloaded(filename)
                        return True

                # Attempt to download using rtmpdump
                if url.startswith('rtmp'):
                        return self._download_with_rtmpdump(filename, url, player_url)

                tmpfilename = self.temp_name(filename)
                stream = None
                open_mode = 'wb'
                # basic_request stays Range-free for the 416 fallback below
                basic_request = urllib2.Request(url, None, std_headers)
                request = urllib2.Request(url, None, std_headers)

                # Establish possible resume length
                if os.path.isfile(tmpfilename):
                        resume_len = os.path.getsize(tmpfilename)
                else:
                        resume_len = 0

                # Request parameters in case of being able to resume
                if self.params.get('continuedl', False) and resume_len != 0:
                        self.report_resuming_byte(resume_len)
                        request.add_header('Range','bytes=%d-' % resume_len)
                        open_mode = 'ab'

                count = 0
                retries = self.params.get('retries', 0)
                while count <= retries:
                        # Establish connection
                        try:
                                data = urllib2.urlopen(request)
                                break
                        except (urllib2.HTTPError, ), err:
                                if (err.code < 500 or err.code >= 600) and err.code != 416:
                                        # Unexpected HTTP error
                                        raise
                                elif err.code == 416:
                                        # Unable to resume (requested range not satisfiable)
                                        try:
                                                # Open the connection again without the range header
                                                data = urllib2.urlopen(basic_request)
                                                content_length = data.info()['Content-Length']
                                        except (urllib2.HTTPError, ), err:
                                                if err.code < 500 or err.code >= 600:
                                                        raise
                                        else:
                                                # Examine the reported length
                                                if (content_length is not None and
                                                    (resume_len - 100 < long(content_length) < resume_len + 100)):
                                                        # The file had already been fully downloaded.
                                                        # Explanation to the above condition: in issue #175 it was revealed that
                                                        # YouTube sometimes adds or removes a few bytes from the end of the file,
                                                        # changing the file size slightly and causing problems for some users. So
                                                        # I decided to implement a suggested change and consider the file
                                                        # completely downloaded if the file size differs less than 100 bytes from
                                                        # the one in the hard drive.
                                                        self.report_file_already_downloaded(filename)
                                                        self.try_rename(tmpfilename, filename)
                                                        return True
                                                else:
                                                        # The length does not match, we start the download over
                                                        self.report_unable_to_resume()
                                                        open_mode = 'wb'
                                                        break
                        # Retry
                        count += 1
                        if count <= retries:
                                self.report_retry(count, retries)

                if count > retries:
                        self.trouble(u'ERROR: giving up after %s retries' % retries)
                        return False

                data_len = data.info().get('Content-length', None)
                data_len_str = self.format_bytes(data_len)
                byte_counter = 0
                block_size = 1024
                start = time.time()
                while True:
                        # Download and write
                        before = time.time()
                        data_block = data.read(block_size)
                        after = time.time()
                        data_block_len = len(data_block)
                        if data_block_len == 0:
                                break
                        byte_counter += data_block_len

                        # Open file just in time
                        if stream is None:
                                try:
                                        (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
                                        self.report_destination(filename)
                                except (OSError, IOError), err:
                                        self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
                                        return False
                        try:
                                stream.write(data_block)
                        except (IOError, OSError), err:
                                self.trouble(u'\nERROR: unable to write data: %s' % str(err))
                                return False
                        block_size = self.best_block_size(after - before, data_block_len)

                        # Progress message
                        percent_str = self.calc_percent(byte_counter, data_len)
                        eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
                        speed_str = self.calc_speed(start, time.time(), byte_counter)
                        self.report_progress(percent_str, data_len_str, speed_str, eta_str)

                        # Apply rate limit
                        self.slow_down(start, byte_counter)

                stream.close()
                self.report_finish()
                # NOTE(review): data_len is the raw Content-length header (a str),
                # so this is a textual comparison against str(byte_counter) —
                # presumably intentional; verify before changing
                if data_len is not None and str(byte_counter) != data_len:
                        raise ContentTooShortError(byte_counter, long(data_len))
                self.try_rename(tmpfilename, filename)
                return True
653
class InfoExtractor(object):
	"""Base class for all information extractors.

	An information extractor (IE) turns a URL into one or more
	dictionaries describing the video(s) behind it: the real media
	URL, the title, a simplified title, the uploader and so on. Each
	dictionary is handed to the FileDownloader, which may then fetch
	the video, among other outcomes. Mandatory fields in every
	dictionary:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	Optional fields, used only when their respective forced-printing
	functions are called (e.g. when youtube-dl serves as the backend
	of a video search front end such as youtube2mp3):

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Concrete extractors redefine _real_initialize(), _real_extract()
	and the suitable() static method, and are normally instantiated
	and registered with the main downloader.
	"""

	# Flipped to True by initialize() after one-time setup has run.
	_ready = False
	# FileDownloader this IE reports to; may remain None.
	_downloader = None

	def __init__(self, downloader=None):
		"""Create the extractor, optionally attaching a downloader."""
		self._ready = False
		self.set_downloader(downloader)

	@staticmethod
	def suitable(url):
		"""Return True if this IE can handle the given URL."""
		return False

	def initialize(self):
		"""Run one-time setup (authentication, etc.) at most once."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Initialize if needed, then extract info for the URL."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Attach the FileDownloader used for output and options."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Actual initialization work; subclasses redefine this."""
		pass

	def _real_extract(self, url):
		"""Actual extraction work; subclasses redefine this."""
		pass
724
725 class YoutubeIE(InfoExtractor):
726         """Information extractor for youtube.com."""
727
728         _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))))?([0-9A-Za-z_-]+)(?(1).+)?$'
729         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
730         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
731         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
732         _NETRC_MACHINE = 'youtube'
733         # Listed in order of quality
734         _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
735         _video_extensions = {
736                 '13': '3gp',
737                 '17': 'mp4',
738                 '18': 'mp4',
739                 '22': 'mp4',
740                 '37': 'mp4',
741                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
742                 '43': 'webm',
743                 '45': 'webm',
744         }
745
746         @staticmethod
747         def suitable(url):
748                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
749
750         def report_lang(self):
751                 """Report attempt to set language."""
752                 self._downloader.to_screen(u'[youtube] Setting language')
753
754         def report_login(self):
755                 """Report attempt to log in."""
756                 self._downloader.to_screen(u'[youtube] Logging in')
757         
758         def report_age_confirmation(self):
759                 """Report attempt to confirm age."""
760                 self._downloader.to_screen(u'[youtube] Confirming age')
761         
762         def report_video_webpage_download(self, video_id):
763                 """Report attempt to download video webpage."""
764                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
765         
766         def report_video_info_webpage_download(self, video_id):
767                 """Report attempt to download video info webpage."""
768                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
769         
770         def report_information_extraction(self, video_id):
771                 """Report attempt to extract video information."""
772                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
773         
774         def report_unavailable_format(self, video_id, format):
775                 """Report extracted video URL."""
776                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
777         
778         def report_rtmp_download(self):
779                 """Indicate the download will use the RTMP protocol."""
780                 self._downloader.to_screen(u'[youtube] RTMP download detected')
781         
782         def _real_initialize(self):
783                 if self._downloader is None:
784                         return
785
786                 username = None
787                 password = None
788                 downloader_params = self._downloader.params
789
790                 # Attempt to use provided username and password or .netrc data
791                 if downloader_params.get('username', None) is not None:
792                         username = downloader_params['username']
793                         password = downloader_params['password']
794                 elif downloader_params.get('usenetrc', False):
795                         try:
796                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
797                                 if info is not None:
798                                         username = info[0]
799                                         password = info[2]
800                                 else:
801                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
802                         except (IOError, netrc.NetrcParseError), err:
803                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
804                                 return
805
806                 # Set language
807                 request = urllib2.Request(self._LANG_URL, None, std_headers)
808                 try:
809                         self.report_lang()
810                         urllib2.urlopen(request).read()
811                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
812                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
813                         return
814
815                 # No authentication to be performed
816                 if username is None:
817                         return
818
819                 # Log in
820                 login_form = {
821                                 'current_form': 'loginForm',
822                                 'next':         '/',
823                                 'action_login': 'Log In',
824                                 'username':     username,
825                                 'password':     password,
826                                 }
827                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
828                 try:
829                         self.report_login()
830                         login_results = urllib2.urlopen(request).read()
831                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
832                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
833                                 return
834                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
835                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
836                         return
837         
838                 # Confirm age
839                 age_form = {
840                                 'next_url':             '/',
841                                 'action_confirm':       'Confirm',
842                                 }
843                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
844                 try:
845                         self.report_age_confirmation()
846                         age_results = urllib2.urlopen(request).read()
847                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
848                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
849                         return
850
851         def _real_extract(self, url):
852                 # Extract video id from URL
853                 mobj = re.match(self._VALID_URL, url)
854                 if mobj is None:
855                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
856                         return
857                 video_id = mobj.group(2)
858
859                 # Get video webpage
860                 self.report_video_webpage_download(video_id)
861                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id, None, std_headers)
862                 try:
863                         video_webpage = urllib2.urlopen(request).read()
864                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
865                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
866                         return
867
868                 # Attempt to extract SWF player URL
869                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
870                 if mobj is not None:
871                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
872                 else:
873                         player_url = None
874
875                 # Get video info
876                 self.report_video_info_webpage_download(video_id)
877                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
878                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
879                                            % (video_id, el_type))
880                         request = urllib2.Request(video_info_url, None, std_headers)
881                         try:
882                                 video_info_webpage = urllib2.urlopen(request).read()
883                                 video_info = parse_qs(video_info_webpage)
884                                 if 'token' in video_info:
885                                         break
886                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
887                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
888                                 return
889                 if 'token' not in video_info:
890                         if 'reason' in video_info:
891                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
892                         else:
893                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
894                         return
895
896                 # Start extracting information
897                 self.report_information_extraction(video_id)
898
899                 # uploader
900                 if 'author' not in video_info:
901                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
902                         return
903                 video_uploader = urllib.unquote_plus(video_info['author'][0])
904
905                 # title
906                 if 'title' not in video_info:
907                         self._downloader.trouble(u'ERROR: unable to extract video title')
908                         return
909                 video_title = urllib.unquote_plus(video_info['title'][0])
910                 video_title = video_title.decode('utf-8')
911                 video_title = sanitize_title(video_title)
912
913                 # simplified title
914                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
915                 simple_title = simple_title.strip(ur'_')
916
917                 # thumbnail image
918                 if 'thumbnail_url' not in video_info:
919                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
920                         video_thumbnail = ''
921                 else:   # don't panic if we can't find it
922                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
923
924                 # upload date
925                 upload_date = u'NA'
926                 mobj = re.search(r'id="eow-date".*?>(.*?)</span>', video_webpage, re.DOTALL)
927                 if mobj is not None:
928                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
929                         format_expressions = ['%d %B %Y', '%B %d %Y']
930                         for expression in format_expressions:
931                                 try:
932                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
933                                 except:
934                                         pass
935
936                 # description
937                 video_description = 'No description available.'
938                 if self._downloader.params.get('forcedescription', False):
939                         mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
940                         if mobj is not None:
941                                 video_description = mobj.group(1)
942
943                 # token
944                 video_token = urllib.unquote_plus(video_info['token'][0])
945
946                 # Decide which formats to download
947                 requested_format = self._downloader.params.get('format', None)
948                 get_video_template = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=&ps=&asv=&fmt=%%s' % (video_id, video_token)
949
950                 if 'fmt_url_map' in video_info:
951                         url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
952                         format_limit = self._downloader.params.get('format_limit', None)
953                         if format_limit is not None and format_limit in self._available_formats:
954                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
955                         else:
956                                 format_list = self._available_formats
957                         existing_formats = [x for x in format_list if x in url_map]
958                         if len(existing_formats) == 0:
959                                 self._downloader.trouble(u'ERROR: no known formats available for video')
960                                 return
961                         if requested_format is None:
962                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
963                         elif requested_format == '-1':
964                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
965                         else:
966                                 video_url_list = [(requested_format, get_video_template % requested_format)] # Specific format
967
968                 elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
969                         self.report_rtmp_download()
970                         video_url_list = [(None, video_info['conn'][0])]
971
972                 else:
973                         self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
974                         return
975
976                 for format_param, video_real_url in video_url_list:
977                         # At this point we have a new video
978                         self._downloader.increment_downloads()
979
980                         # Extension
981                         video_extension = self._video_extensions.get(format_param, 'flv')
982
983                         # Find the video URL in fmt_url_map or conn paramters
984                         try:
985                                 # Process video information
986                                 self._downloader.process_info({
987                                         'id':           video_id.decode('utf-8'),
988                                         'url':          video_real_url.decode('utf-8'),
989                                         'uploader':     video_uploader.decode('utf-8'),
990                                         'upload_date':  upload_date,
991                                         'title':        video_title,
992                                         'stitle':       simple_title,
993                                         'ext':          video_extension.decode('utf-8'),
994                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
995                                         'thumbnail':    video_thumbnail.decode('utf-8'),
996                                         'description':  video_description.decode('utf-8'),
997                                         'player_url':   player_url,
998                                 })
999                         except UnavailableVideoError, err:
1000                                 self._downloader.trouble(u'ERROR: unable to download video (format may not be available)')
1001
1002
1003 class MetacafeIE(InfoExtractor):
1004         """Information Extractor for metacafe.com."""
1005
1006         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1007         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1008         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1009         _youtube_ie = None
1010
1011         def __init__(self, youtube_ie, downloader=None):
1012                 InfoExtractor.__init__(self, downloader)
1013                 self._youtube_ie = youtube_ie
1014
1015         @staticmethod
1016         def suitable(url):
1017                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1018
1019         def report_disclaimer(self):
1020                 """Report disclaimer retrieval."""
1021                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1022
1023         def report_age_confirmation(self):
1024                 """Report attempt to confirm age."""
1025                 self._downloader.to_screen(u'[metacafe] Confirming age')
1026         
1027         def report_download_webpage(self, video_id):
1028                 """Report webpage download."""
1029                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1030         
1031         def report_extraction(self, video_id):
1032                 """Report information extraction."""
1033                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1034
1035         def _real_initialize(self):
1036                 # Retrieve disclaimer
1037                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
1038                 try:
1039                         self.report_disclaimer()
1040                         disclaimer = urllib2.urlopen(request).read()
1041                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1042                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1043                         return
1044
1045                 # Confirm age
1046                 disclaimer_form = {
1047                         'filters': '0',
1048                         'submit': "Continue - I'm over 18",
1049                         }
1050                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
1051                 try:
1052                         self.report_age_confirmation()
1053                         disclaimer = urllib2.urlopen(request).read()
1054                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1055                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1056                         return
1057         
1058         def _real_extract(self, url):
1059                 # Extract id and simplified title from URL
1060                 mobj = re.match(self._VALID_URL, url)
1061                 if mobj is None:
1062                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1063                         return
1064
1065                 video_id = mobj.group(1)
1066
1067                 # Check if video comes from YouTube
1068                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1069                 if mobj2 is not None:
1070                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1071                         return
1072
1073                 # At this point we have a new video
1074                 self._downloader.increment_downloads()
1075
1076                 simple_title = mobj.group(2).decode('utf-8')
1077
1078                 # Retrieve video webpage to extract further information
1079                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1080                 try:
1081                         self.report_download_webpage(video_id)
1082                         webpage = urllib2.urlopen(request).read()
1083                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1084                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1085                         return
1086
1087                 # Extract URL, uploader and title from webpage
1088                 self.report_extraction(video_id)
1089                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1090                 if mobj is not None:
1091                         mediaURL = urllib.unquote(mobj.group(1))
1092                         video_extension = mediaURL[-3:]
1093                         
1094                         # Extract gdaKey if available
1095                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1096                         if mobj is None:
1097                                 video_url = mediaURL
1098                         else:
1099                                 gdaKey = mobj.group(1)
1100                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1101                 else:
1102                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1103                         if mobj is None:
1104                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1105                                 return
1106                         vardict = parse_qs(mobj.group(1))
1107                         if 'mediaData' not in vardict:
1108                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1109                                 return
1110                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1111                         if mobj is None:
1112                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1113                                 return
1114                         mediaURL = mobj.group(1).replace('\\/', '/')
1115                         video_extension = mediaURL[-3:]
1116                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1117
1118                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1119                 if mobj is None:
1120                         self._downloader.trouble(u'ERROR: unable to extract title')
1121                         return
1122                 video_title = mobj.group(1).decode('utf-8')
1123                 video_title = sanitize_title(video_title)
1124
1125                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1126                 if mobj is None:
1127                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1128                         return
1129                 video_uploader = mobj.group(1)
1130
1131                 try:
1132                         # Process video information
1133                         self._downloader.process_info({
1134                                 'id':           video_id.decode('utf-8'),
1135                                 'url':          video_url.decode('utf-8'),
1136                                 'uploader':     video_uploader.decode('utf-8'),
1137                                 'upload_date':  u'NA',
1138                                 'title':        video_title,
1139                                 'stitle':       simple_title,
1140                                 'ext':          video_extension.decode('utf-8'),
1141                                 'format':       u'NA',
1142                                 'player_url':   None,
1143                         })
1144                 except UnavailableVideoError:
1145                         self._downloader.trouble(u'ERROR: unable to download video')
1146
1147
1148 class DailymotionIE(InfoExtractor):
1149         """Information Extractor for Dailymotion"""
1150
1151         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1152
1153         def __init__(self, downloader=None):
1154                 InfoExtractor.__init__(self, downloader)
1155
1156         @staticmethod
1157         def suitable(url):
1158                 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1159
1160         def report_download_webpage(self, video_id):
1161                 """Report webpage download."""
1162                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1163         
1164         def report_extraction(self, video_id):
1165                 """Report information extraction."""
1166                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1167
1168         def _real_initialize(self):
1169                 return
1170
1171         def _real_extract(self, url):
1172                 # Extract id and simplified title from URL
1173                 mobj = re.match(self._VALID_URL, url)
1174                 if mobj is None:
1175                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1176                         return
1177
1178                 # At this point we have a new video
1179                 self._downloader.increment_downloads()
1180                 video_id = mobj.group(1)
1181
1182                 simple_title = mobj.group(2).decode('utf-8')
1183                 video_extension = 'flv'
1184
1185                 # Retrieve video webpage to extract further information
1186                 request = urllib2.Request(url)
1187                 try:
1188                         self.report_download_webpage(video_id)
1189                         webpage = urllib2.urlopen(request).read()
1190                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1191                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1192                         return
1193
1194                 # Extract URL, uploader and title from webpage
1195                 self.report_extraction(video_id)
1196                 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1197                 if mobj is None:
1198                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1199                         return
1200                 mediaURL = urllib.unquote(mobj.group(1))
1201
1202                 # if needed add http://www.dailymotion.com/ if relative URL
1203
1204                 video_url = mediaURL
1205
1206                 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1207                 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1208                 if mobj is None:
1209                         self._downloader.trouble(u'ERROR: unable to extract title')
1210                         return
1211                 video_title = mobj.group(1).decode('utf-8')
1212                 video_title = sanitize_title(video_title)
1213
1214                 mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a>', webpage)
1215                 if mobj is None:
1216                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1217                         return
1218                 video_uploader = mobj.group(1)
1219
1220                 try:
1221                         # Process video information
1222                         self._downloader.process_info({
1223                                 'id':           video_id.decode('utf-8'),
1224                                 'url':          video_url.decode('utf-8'),
1225                                 'uploader':     video_uploader.decode('utf-8'),
1226                                 'upload_date':  u'NA',
1227                                 'title':        video_title,
1228                                 'stitle':       simple_title,
1229                                 'ext':          video_extension.decode('utf-8'),
1230                                 'format':       u'NA',
1231                                 'player_url':   None,
1232                         })
1233                 except UnavailableVideoError:
1234                         self._downloader.trouble(u'ERROR: unable to download video')
1235
1236 class GoogleIE(InfoExtractor):
1237         """Information extractor for video.google.com."""
1238
1239         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1240
1241         def __init__(self, downloader=None):
1242                 InfoExtractor.__init__(self, downloader)
1243
1244         @staticmethod
1245         def suitable(url):
1246                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1247
1248         def report_download_webpage(self, video_id):
1249                 """Report webpage download."""
1250                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1251
1252         def report_extraction(self, video_id):
1253                 """Report information extraction."""
1254                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1255
1256         def _real_initialize(self):
1257                 return
1258
1259         def _real_extract(self, url):
1260                 # Extract id from URL
1261                 mobj = re.match(self._VALID_URL, url)
1262                 if mobj is None:
1263                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1264                         return
1265
1266                 # At this point we have a new video
1267                 self._downloader.increment_downloads()
1268                 video_id = mobj.group(1)
1269
1270                 video_extension = 'mp4'
1271
1272                 # Retrieve video webpage to extract further information
1273                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1274                 try:
1275                         self.report_download_webpage(video_id)
1276                         webpage = urllib2.urlopen(request).read()
1277                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1278                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1279                         return
1280
1281                 # Extract URL, uploader, and title from webpage
1282                 self.report_extraction(video_id)
1283                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1284                 if mobj is None:
1285                         video_extension = 'flv'
1286                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1287                 if mobj is None:
1288                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1289                         return
1290                 mediaURL = urllib.unquote(mobj.group(1))
1291                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1292                 mediaURL = mediaURL.replace('\\x26', '\x26')
1293
1294                 video_url = mediaURL
1295
1296                 mobj = re.search(r'<title>(.*)</title>', webpage)
1297                 if mobj is None:
1298                         self._downloader.trouble(u'ERROR: unable to extract title')
1299                         return
1300                 video_title = mobj.group(1).decode('utf-8')
1301                 video_title = sanitize_title(video_title)
1302                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1303
1304                 # Extract video description
1305                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1306                 if mobj is None:
1307                         self._downloader.trouble(u'ERROR: unable to extract video description')
1308                         return
1309                 video_description = mobj.group(1).decode('utf-8')
1310                 if not video_description:
1311                         video_description = 'No description available.'
1312
1313                 # Extract video thumbnail
1314                 if self._downloader.params.get('forcethumbnail', False):
1315                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1316                         try:
1317                                 webpage = urllib2.urlopen(request).read()
1318                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1319                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1320                                 return
1321                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1322                         if mobj is None:
1323                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1324                                 return
1325                         video_thumbnail = mobj.group(1)
1326                 else:   # we need something to pass to process_info
1327                         video_thumbnail = ''
1328
1329
1330                 try:
1331                         # Process video information
1332                         self._downloader.process_info({
1333                                 'id':           video_id.decode('utf-8'),
1334                                 'url':          video_url.decode('utf-8'),
1335                                 'uploader':     u'NA',
1336                                 'upload_date':  u'NA',
1337                                 'title':        video_title,
1338                                 'stitle':       simple_title,
1339                                 'ext':          video_extension.decode('utf-8'),
1340                                 'format':       u'NA',
1341                                 'player_url':   None,
1342                         })
1343                 except UnavailableVideoError:
1344                         self._downloader.trouble(u'ERROR: unable to download video')
1345
1346
1347 class PhotobucketIE(InfoExtractor):
1348         """Information extractor for photobucket.com."""
1349
1350         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1351
1352         def __init__(self, downloader=None):
1353                 InfoExtractor.__init__(self, downloader)
1354
1355         @staticmethod
1356         def suitable(url):
1357                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1358
1359         def report_download_webpage(self, video_id):
1360                 """Report webpage download."""
1361                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1362
1363         def report_extraction(self, video_id):
1364                 """Report information extraction."""
1365                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1366
1367         def _real_initialize(self):
1368                 return
1369
1370         def _real_extract(self, url):
1371                 # Extract id from URL
1372                 mobj = re.match(self._VALID_URL, url)
1373                 if mobj is None:
1374                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1375                         return
1376
1377                 # At this point we have a new video
1378                 self._downloader.increment_downloads()
1379                 video_id = mobj.group(1)
1380
1381                 video_extension = 'flv'
1382
1383                 # Retrieve video webpage to extract further information
1384                 request = urllib2.Request(url)
1385                 try:
1386                         self.report_download_webpage(video_id)
1387                         webpage = urllib2.urlopen(request).read()
1388                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1389                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1390                         return
1391
1392                 # Extract URL, uploader, and title from webpage
1393                 self.report_extraction(video_id)
1394                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1395                 if mobj is None:
1396                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1397                         return
1398                 mediaURL = urllib.unquote(mobj.group(1))
1399
1400                 video_url = mediaURL
1401
1402                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1403                 if mobj is None:
1404                         self._downloader.trouble(u'ERROR: unable to extract title')
1405                         return
1406                 video_title = mobj.group(1).decode('utf-8')
1407                 video_title = sanitize_title(video_title)
1408                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1409
1410                 video_uploader = mobj.group(2).decode('utf-8')
1411
1412                 try:
1413                         # Process video information
1414                         self._downloader.process_info({
1415                                 'id':           video_id.decode('utf-8'),
1416                                 'url':          video_url.decode('utf-8'),
1417                                 'uploader':     video_uploader,
1418                                 'upload_date':  u'NA',
1419                                 'title':        video_title,
1420                                 'stitle':       simple_title,
1421                                 'ext':          video_extension.decode('utf-8'),
1422                                 'format':       u'NA',
1423                                 'player_url':   None,
1424                         })
1425                 except UnavailableVideoError:
1426                         self._downloader.trouble(u'ERROR: unable to download video')
1427
1428
1429 class YahooIE(InfoExtractor):
1430         """Information extractor for video.yahoo.com."""
1431
1432         # _VALID_URL matches all Yahoo! Video URLs
1433         # _VPAGE_URL matches only the extractable '/watch/' URLs
1434         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1435         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1436
1437         def __init__(self, downloader=None):
1438                 InfoExtractor.__init__(self, downloader)
1439
1440         @staticmethod
1441         def suitable(url):
1442                 return (re.match(YahooIE._VALID_URL, url) is not None)
1443
1444         def report_download_webpage(self, video_id):
1445                 """Report webpage download."""
1446                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1447
1448         def report_extraction(self, video_id):
1449                 """Report information extraction."""
1450                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1451
1452         def _real_initialize(self):
1453                 return
1454
1455         def _real_extract(self, url, new_video=True):
1456                 # Extract ID from URL
1457                 mobj = re.match(self._VALID_URL, url)
1458                 if mobj is None:
1459                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1460                         return
1461
1462                 # At this point we have a new video
1463                 self._downloader.increment_downloads()
1464                 video_id = mobj.group(2)
1465                 video_extension = 'flv'
1466
1467                 # Rewrite valid but non-extractable URLs as
1468                 # extractable English language /watch/ URLs
1469                 if re.match(self._VPAGE_URL, url) is None:
1470                         request = urllib2.Request(url)
1471                         try:
1472                                 webpage = urllib2.urlopen(request).read()
1473                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1474                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1475                                 return
1476
1477                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1478                         if mobj is None:
1479                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1480                                 return
1481                         yahoo_id = mobj.group(1)
1482
1483                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1484                         if mobj is None:
1485                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1486                                 return
1487                         yahoo_vid = mobj.group(1)
1488
1489                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1490                         return self._real_extract(url, new_video=False)
1491
1492                 # Retrieve video webpage to extract further information
1493                 request = urllib2.Request(url)
1494                 try:
1495                         self.report_download_webpage(video_id)
1496                         webpage = urllib2.urlopen(request).read()
1497                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1498                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1499                         return
1500
1501                 # Extract uploader and title from webpage
1502                 self.report_extraction(video_id)
1503                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1504                 if mobj is None:
1505                         self._downloader.trouble(u'ERROR: unable to extract video title')
1506                         return
1507                 video_title = mobj.group(1).decode('utf-8')
1508                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1509
1510                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1511                 if mobj is None:
1512                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1513                         return
1514                 video_uploader = mobj.group(1).decode('utf-8')
1515
1516                 # Extract video thumbnail
1517                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1518                 if mobj is None:
1519                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1520                         return
1521                 video_thumbnail = mobj.group(1).decode('utf-8')
1522
1523                 # Extract video description
1524                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1525                 if mobj is None:
1526                         self._downloader.trouble(u'ERROR: unable to extract video description')
1527                         return
1528                 video_description = mobj.group(1).decode('utf-8')
1529                 if not video_description: video_description = 'No description available.'
1530
1531                 # Extract video height and width
1532                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1533                 if mobj is None:
1534                         self._downloader.trouble(u'ERROR: unable to extract video height')
1535                         return
1536                 yv_video_height = mobj.group(1)
1537
1538                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1539                 if mobj is None:
1540                         self._downloader.trouble(u'ERROR: unable to extract video width')
1541                         return
1542                 yv_video_width = mobj.group(1)
1543
1544                 # Retrieve video playlist to extract media URL
1545                 # I'm not completely sure what all these options are, but we
1546                 # seem to need most of them, otherwise the server sends a 401.
1547                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1548                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1549                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1550                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1551                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1552                 try:
1553                         self.report_download_webpage(video_id)
1554                         webpage = urllib2.urlopen(request).read()
1555                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1556                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1557                         return
1558
1559                 # Extract media URL from playlist XML
1560                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1561                 if mobj is None:
1562                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1563                         return
1564                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1565                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1566
1567                 try:
1568                         # Process video information
1569                         self._downloader.process_info({
1570                                 'id':           video_id.decode('utf-8'),
1571                                 'url':          video_url,
1572                                 'uploader':     video_uploader,
1573                                 'upload_date':  u'NA',
1574                                 'title':        video_title,
1575                                 'stitle':       simple_title,
1576                                 'ext':          video_extension.decode('utf-8'),
1577                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1578                                 'description':  video_description,
1579                                 'thumbnail':    video_thumbnail,
1580                                 'description':  video_description,
1581                                 'player_url':   None,
1582                         })
1583                 except UnavailableVideoError:
1584                         self._downloader.trouble(u'ERROR: unable to download video')
1585
1586
1587 class GenericIE(InfoExtractor):
1588         """Generic last-resort information extractor."""
1589
1590         def __init__(self, downloader=None):
1591                 InfoExtractor.__init__(self, downloader)
1592
1593         @staticmethod
1594         def suitable(url):
1595                 return True
1596
1597         def report_download_webpage(self, video_id):
1598                 """Report webpage download."""
1599                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1600                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1601
1602         def report_extraction(self, video_id):
1603                 """Report information extraction."""
1604                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1605
1606         def _real_initialize(self):
1607                 return
1608
1609         def _real_extract(self, url):
1610                 # At this point we have a new video
1611                 self._downloader.increment_downloads()
1612
1613                 video_id = url.split('/')[-1]
1614                 request = urllib2.Request(url)
1615                 try:
1616                         self.report_download_webpage(video_id)
1617                         webpage = urllib2.urlopen(request).read()
1618                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1619                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1620                         return
1621                 except ValueError, err:
1622                         # since this is the last-resort InfoExtractor, if
1623                         # this error is thrown, it'll be thrown here
1624                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1625                         return
1626
1627                 self.report_extraction(video_id)
1628                 # Start with something easy: JW Player in SWFObject
1629                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1630                 if mobj is None:
1631                         # Broaden the search a little bit
1632                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1633                 if mobj is None:
1634                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1635                         return
1636
1637                 # It's possible that one of the regexes
1638                 # matched, but returned an empty group:
1639                 if mobj.group(1) is None:
1640                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1641                         return
1642
1643                 video_url = urllib.unquote(mobj.group(1))
1644                 video_id  = os.path.basename(video_url)
1645
1646                 # here's a fun little line of code for you:
1647                 video_extension = os.path.splitext(video_id)[1][1:]
1648                 video_id        = os.path.splitext(video_id)[0]
1649
1650                 # it's tempting to parse this further, but you would
1651                 # have to take into account all the variations like
1652                 #   Video Title - Site Name
1653                 #   Site Name | Video Title
1654                 #   Video Title - Tagline | Site Name
1655                 # and so on and so forth; it's just not practical
1656                 mobj = re.search(r'<title>(.*)</title>', webpage)
1657                 if mobj is None:
1658                         self._downloader.trouble(u'ERROR: unable to extract title')
1659                         return
1660                 video_title = mobj.group(1).decode('utf-8')
1661                 video_title = sanitize_title(video_title)
1662                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1663
1664                 # video uploader is domain name
1665                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1666                 if mobj is None:
1667                         self._downloader.trouble(u'ERROR: unable to extract title')
1668                         return
1669                 video_uploader = mobj.group(1).decode('utf-8')
1670
1671                 try:
1672                         # Process video information
1673                         self._downloader.process_info({
1674                                 'id':           video_id.decode('utf-8'),
1675                                 'url':          video_url.decode('utf-8'),
1676                                 'uploader':     video_uploader,
1677                                 'upload_date':  u'NA',
1678                                 'title':        video_title,
1679                                 'stitle':       simple_title,
1680                                 'ext':          video_extension.decode('utf-8'),
1681                                 'format':       u'NA',
1682                                 'player_url':   None,
1683                         })
1684                 except UnavailableVideoError, err:
1685                         self._downloader.trouble(u'ERROR: unable to download video')
1686
1687
1688 class YoutubeSearchIE(InfoExtractor):
1689         """Information Extractor for YouTube search queries."""
1690         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1691         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1692         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1693         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1694         _youtube_ie = None
1695         _max_youtube_results = 1000
1696
1697         def __init__(self, youtube_ie, downloader=None):
1698                 InfoExtractor.__init__(self, downloader)
1699                 self._youtube_ie = youtube_ie
1700         
1701         @staticmethod
1702         def suitable(url):
1703                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1704
1705         def report_download_page(self, query, pagenum):
1706                 """Report attempt to download playlist page with given number."""
1707                 query = query.decode(preferredencoding())
1708                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1709
1710         def _real_initialize(self):
1711                 self._youtube_ie.initialize()
1712         
1713         def _real_extract(self, query):
1714                 mobj = re.match(self._VALID_QUERY, query)
1715                 if mobj is None:
1716                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1717                         return
1718
1719                 prefix, query = query.split(':')
1720                 prefix = prefix[8:]
1721                 query  = query.encode('utf-8')
1722                 if prefix == '':
1723                         self._download_n_results(query, 1)
1724                         return
1725                 elif prefix == 'all':
1726                         self._download_n_results(query, self._max_youtube_results)
1727                         return
1728                 else:
1729                         try:
1730                                 n = long(prefix)
1731                                 if n <= 0:
1732                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1733                                         return
1734                                 elif n > self._max_youtube_results:
1735                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
1736                                         n = self._max_youtube_results
1737                                 self._download_n_results(query, n)
1738                                 return
1739                         except ValueError: # parsing prefix as integer fails
1740                                 self._download_n_results(query, 1)
1741                                 return
1742
1743         def _download_n_results(self, query, n):
1744                 """Downloads a specified number of results for a query"""
1745
1746                 video_ids = []
1747                 already_seen = set()
1748                 pagenum = 1
1749
1750                 while True:
1751                         self.report_download_page(query, pagenum)
1752                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1753                         request = urllib2.Request(result_url, None, std_headers)
1754                         try:
1755                                 page = urllib2.urlopen(request).read()
1756                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1757                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1758                                 return
1759
1760                         # Extract video identifiers
1761                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1762                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1763                                 if video_id not in already_seen:
1764                                         video_ids.append(video_id)
1765                                         already_seen.add(video_id)
1766                                         if len(video_ids) == n:
1767                                                 # Specified n videos reached
1768                                                 for id in video_ids:
1769                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1770                                                 return
1771
1772                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1773                                 for id in video_ids:
1774                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1775                                 return
1776
1777                         pagenum = pagenum + 1
1778
1779 class GoogleSearchIE(InfoExtractor):
1780         """Information Extractor for Google Video search queries."""
1781         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1782         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1783         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1784         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1785         _google_ie = None
1786         _max_google_results = 1000
1787
1788         def __init__(self, google_ie, downloader=None):
1789                 InfoExtractor.__init__(self, downloader)
1790                 self._google_ie = google_ie
1791         
1792         @staticmethod
1793         def suitable(url):
1794                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1795
1796         def report_download_page(self, query, pagenum):
1797                 """Report attempt to download playlist page with given number."""
1798                 query = query.decode(preferredencoding())
1799                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1800
1801         def _real_initialize(self):
1802                 self._google_ie.initialize()
1803         
1804         def _real_extract(self, query):
1805                 mobj = re.match(self._VALID_QUERY, query)
1806                 if mobj is None:
1807                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1808                         return
1809
1810                 prefix, query = query.split(':')
1811                 prefix = prefix[8:]
1812                 query  = query.encode('utf-8')
1813                 if prefix == '':
1814                         self._download_n_results(query, 1)
1815                         return
1816                 elif prefix == 'all':
1817                         self._download_n_results(query, self._max_google_results)
1818                         return
1819                 else:
1820                         try:
1821                                 n = long(prefix)
1822                                 if n <= 0:
1823                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1824                                         return
1825                                 elif n > self._max_google_results:
1826                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
1827                                         n = self._max_google_results
1828                                 self._download_n_results(query, n)
1829                                 return
1830                         except ValueError: # parsing prefix as integer fails
1831                                 self._download_n_results(query, 1)
1832                                 return
1833
1834         def _download_n_results(self, query, n):
1835                 """Downloads a specified number of results for a query"""
1836
1837                 video_ids = []
1838                 already_seen = set()
1839                 pagenum = 1
1840
1841                 while True:
1842                         self.report_download_page(query, pagenum)
1843                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1844                         request = urllib2.Request(result_url, None, std_headers)
1845                         try:
1846                                 page = urllib2.urlopen(request).read()
1847                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1848                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1849                                 return
1850
1851                         # Extract video identifiers
1852                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1853                                 video_id = mobj.group(1)
1854                                 if video_id not in already_seen:
1855                                         video_ids.append(video_id)
1856                                         already_seen.add(video_id)
1857                                         if len(video_ids) == n:
1858                                                 # Specified n videos reached
1859                                                 for id in video_ids:
1860                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1861                                                 return
1862
1863                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1864                                 for id in video_ids:
1865                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1866                                 return
1867
1868                         pagenum = pagenum + 1
1869
1870 class YahooSearchIE(InfoExtractor):
1871         """Information Extractor for Yahoo! Video search queries."""
1872         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1873         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1874         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1875         _MORE_PAGES_INDICATOR = r'\s*Next'
1876         _yahoo_ie = None
1877         _max_yahoo_results = 1000
1878
1879         def __init__(self, yahoo_ie, downloader=None):
1880                 InfoExtractor.__init__(self, downloader)
1881                 self._yahoo_ie = yahoo_ie
1882         
1883         @staticmethod
1884         def suitable(url):
1885                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1886
1887         def report_download_page(self, query, pagenum):
1888                 """Report attempt to download playlist page with given number."""
1889                 query = query.decode(preferredencoding())
1890                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1891
1892         def _real_initialize(self):
1893                 self._yahoo_ie.initialize()
1894         
1895         def _real_extract(self, query):
1896                 mobj = re.match(self._VALID_QUERY, query)
1897                 if mobj is None:
1898                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1899                         return
1900
1901                 prefix, query = query.split(':')
1902                 prefix = prefix[8:]
1903                 query  = query.encode('utf-8')
1904                 if prefix == '':
1905                         self._download_n_results(query, 1)
1906                         return
1907                 elif prefix == 'all':
1908                         self._download_n_results(query, self._max_yahoo_results)
1909                         return
1910                 else:
1911                         try:
1912                                 n = long(prefix)
1913                                 if n <= 0:
1914                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1915                                         return
1916                                 elif n > self._max_yahoo_results:
1917                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
1918                                         n = self._max_yahoo_results
1919                                 self._download_n_results(query, n)
1920                                 return
1921                         except ValueError: # parsing prefix as integer fails
1922                                 self._download_n_results(query, 1)
1923                                 return
1924
1925         def _download_n_results(self, query, n):
1926                 """Downloads a specified number of results for a query"""
1927
1928                 video_ids = []
1929                 already_seen = set()
1930                 pagenum = 1
1931
1932                 while True:
1933                         self.report_download_page(query, pagenum)
1934                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1935                         request = urllib2.Request(result_url, None, std_headers)
1936                         try:
1937                                 page = urllib2.urlopen(request).read()
1938                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1939                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1940                                 return
1941
1942                         # Extract video identifiers
1943                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1944                                 video_id = mobj.group(1)
1945                                 if video_id not in already_seen:
1946                                         video_ids.append(video_id)
1947                                         already_seen.add(video_id)
1948                                         if len(video_ids) == n:
1949                                                 # Specified n videos reached
1950                                                 for id in video_ids:
1951                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1952                                                 return
1953
1954                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1955                                 for id in video_ids:
1956                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1957                                 return
1958
1959                         pagenum = pagenum + 1
1960
1961 class YoutubePlaylistIE(InfoExtractor):
1962         """Information Extractor for YouTube playlists."""
1963
1964         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1965         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1966         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1967         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1968         _youtube_ie = None
1969
1970         def __init__(self, youtube_ie, downloader=None):
1971                 InfoExtractor.__init__(self, downloader)
1972                 self._youtube_ie = youtube_ie
1973         
1974         @staticmethod
1975         def suitable(url):
1976                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1977
1978         def report_download_page(self, playlist_id, pagenum):
1979                 """Report attempt to download playlist page with given number."""
1980                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1981
1982         def _real_initialize(self):
1983                 self._youtube_ie.initialize()
1984         
1985         def _real_extract(self, url):
1986                 # Extract playlist id
1987                 mobj = re.match(self._VALID_URL, url)
1988                 if mobj is None:
1989                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1990                         return
1991
1992                 # Download playlist pages
1993                 playlist_id = mobj.group(1)
1994                 video_ids = []
1995                 pagenum = 1
1996
1997                 while True:
1998                         self.report_download_page(playlist_id, pagenum)
1999                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
2000                         try:
2001                                 page = urllib2.urlopen(request).read()
2002                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2003                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2004                                 return
2005
2006                         # Extract video identifiers
2007                         ids_in_page = []
2008                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2009                                 if mobj.group(1) not in ids_in_page:
2010                                         ids_in_page.append(mobj.group(1))
2011                         video_ids.extend(ids_in_page)
2012
2013                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2014                                 break
2015                         pagenum = pagenum + 1
2016
2017                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2018                 playlistend = self._downloader.params.get('playlistend', -1)
2019                 video_ids = video_ids[playliststart:playlistend]
2020
2021                 for id in video_ids:
2022                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2023                 return
2024
2025 class YoutubeUserIE(InfoExtractor):
2026         """Information Extractor for YouTube users."""
2027
2028         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
2029         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2030         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
2031         _youtube_ie = None
2032
2033         def __init__(self, youtube_ie, downloader=None):
2034                 InfoExtractor.__init__(self, downloader)
2035                 self._youtube_ie = youtube_ie
2036         
2037         @staticmethod
2038         def suitable(url):
2039                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2040
2041         def report_download_page(self, username):
2042                 """Report attempt to download user page."""
2043                 self._downloader.to_screen(u'[youtube] user %s: Downloading page ' % (username))
2044
2045         def _real_initialize(self):
2046                 self._youtube_ie.initialize()
2047         
2048         def _real_extract(self, url):
2049                 # Extract username
2050                 mobj = re.match(self._VALID_URL, url)
2051                 if mobj is None:
2052                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2053                         return
2054
2055                 # Download user page
2056                 username = mobj.group(1)
2057                 video_ids = []
2058                 pagenum = 1
2059
2060                 self.report_download_page(username)
2061                 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
2062                 try:
2063                         page = urllib2.urlopen(request).read()
2064                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2065                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2066                         return
2067
2068                 # Extract video identifiers
2069                 ids_in_page = []
2070
2071                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2072                         if mobj.group(1) not in ids_in_page:
2073                                 ids_in_page.append(mobj.group(1))
2074                 video_ids.extend(ids_in_page)
2075
2076                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2077                 playlistend = self._downloader.params.get('playlistend', -1)
2078                 video_ids = video_ids[playliststart:playlistend]
2079
2080                 for id in video_ids:
2081                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2082                 return
2083
2084 class DepositFilesIE(InfoExtractor):
2085         """Information extractor for depositfiles.com"""
2086
2087         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2088
2089         def __init__(self, downloader=None):
2090                 InfoExtractor.__init__(self, downloader)
2091
2092         @staticmethod
2093         def suitable(url):
2094                 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2095
2096         def report_download_webpage(self, file_id):
2097                 """Report webpage download."""
2098                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2099
2100         def report_extraction(self, file_id):
2101                 """Report information extraction."""
2102                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2103
2104         def _real_initialize(self):
2105                 return
2106
2107         def _real_extract(self, url):
2108                 # At this point we have a new file
2109                 self._downloader.increment_downloads()
2110
2111                 file_id = url.split('/')[-1]
2112                 # Rebuild url in english locale
2113                 url = 'http://depositfiles.com/en/files/' + file_id
2114
2115                 # Retrieve file webpage with 'Free download' button pressed
2116                 free_download_indication = { 'gateway_result' : '1' }
2117                 request = urllib2.Request(url, urllib.urlencode(free_download_indication), std_headers)
2118                 try:
2119                         self.report_download_webpage(file_id)
2120                         webpage = urllib2.urlopen(request).read()
2121                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2122                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2123                         return
2124
2125                 # Search for the real file URL
2126                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2127                 if (mobj is None) or (mobj.group(1) is None):
2128                         # Try to figure out reason of the error.
2129                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2130                         if (mobj is not None) and (mobj.group(1) is not None):
2131                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2132                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2133                         else:
2134                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2135                         return
2136
2137                 file_url = mobj.group(1)
2138                 file_extension = os.path.splitext(file_url)[1][1:]
2139
2140                 # Search for file title
2141                 mobj = re.search(r'<b title="(.*?)">', webpage)
2142                 if mobj is None:
2143                         self._downloader.trouble(u'ERROR: unable to extract title')
2144                         return
2145                 file_title = mobj.group(1).decode('utf-8')
2146
2147                 try:
2148                         # Process file information
2149                         self._downloader.process_info({
2150                                 'id':           file_id.decode('utf-8'),
2151                                 'url':          file_url.decode('utf-8'),
2152                                 'uploader':     u'NA',
2153                                 'upload_date':  u'NA',
2154                                 'title':        file_title,
2155                                 'stitle':       file_title,
2156                                 'ext':          file_extension.decode('utf-8'),
2157                                 'format':       u'NA',
2158                                 'player_url':   None,
2159                         })
2160                 except UnavailableVideoError, err:
2161                         self._downloader.trouble(u'ERROR: unable to download file')
2162
class PostProcessor(object):
	"""Base class for post-download processing steps.

	Instances are registered on a downloader via its add_post_processor()
	method. After each successful download the downloader walks its list of
	PostProcessors, feeding the first one an initial info dictionary and each
	subsequent one the value returned by its predecessor.

	Processing stops as soon as a processor returns None, or when the end of
	the chain is reached.

	Like InfoExtractor objects, PostProcessors take part in a "mutual
	registration" handshake with their downloader.
	"""

	# Downloader this processor is attached to (None until registered).
	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Execute this processing step.

		"information" is a dictionary shaped like the ones produced by
		InfoExtractors, extended with a "filepath" key naming the file that
		was downloaded.

		Returning None halts the postprocessing chain; returning a (possibly
		modified) information dictionary passes it on to the next processor.

		Implementations may also raise PostProcessingError, which the calling
		downloader knows how to handle.
		"""
		# Base implementation: pass the information through untouched.
		return information
2208         
2209 ### MAIN PROGRAM ###
2210 if __name__ == '__main__':
2211         try:
2212                 # Modules needed only when running the main program
2213                 import getpass
2214                 import optparse
2215
2216                 # Function to update the program file with the latest version from bitbucket.org
2217                 def update_self(downloader, filename):
2218                         # Note: downloader only used for options
2219                         if not os.access (filename, os.W_OK):
2220                                 sys.exit('ERROR: no write permissions on %s' % filename)
2221
2222                         downloader.to_screen('Updating to latest stable version...')
2223                         latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2224                         latest_version = urllib.urlopen(latest_url).read().strip()
2225                         prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2226                         newcontent = urllib.urlopen(prog_url).read()
2227                         stream = open(filename, 'w')
2228                         stream.write(newcontent)
2229                         stream.close()
2230                         downloader.to_screen('Updated to version %s' % latest_version)
2231
2232                 # Parse command line
2233                 parser = optparse.OptionParser(
2234                         usage='Usage: %prog [options] url...',
2235                         version='2010.12.09',
2236                         conflict_handler='resolve',
2237                 )
2238
2239                 parser.add_option('-h', '--help',
2240                                 action='help', help='print this help text and exit')
2241                 parser.add_option('-v', '--version',
2242                                 action='version', help='print program version and exit')
2243                 parser.add_option('-U', '--update',
2244                                 action='store_true', dest='update_self', help='update this program to latest stable version')
2245                 parser.add_option('-i', '--ignore-errors',
2246                                 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2247                 parser.add_option('-r', '--rate-limit',
2248                                 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2249                 parser.add_option('-R', '--retries',
2250                                 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2251                 parser.add_option('--playlist-start',
2252                                 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2253                 parser.add_option('--playlist-end',
2254                                 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
2255
2256                 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2257                 authentication.add_option('-u', '--username',
2258                                 dest='username', metavar='USERNAME', help='account username')
2259                 authentication.add_option('-p', '--password',
2260                                 dest='password', metavar='PASSWORD', help='account password')
2261                 authentication.add_option('-n', '--netrc',
2262                                 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2263                 parser.add_option_group(authentication)
2264
2265                 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2266                 video_format.add_option('-f', '--format',
2267                                 action='store', dest='format', metavar='FORMAT', help='video format code')
2268                 video_format.add_option('-m', '--mobile-version',
2269                                 action='store_const', dest='format', help='alias for -f 17', const='17')
2270                 video_format.add_option('--all-formats',
2271                                 action='store_const', dest='format', help='download all available video formats', const='-1')
2272                 video_format.add_option('--max-quality',
2273                                 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2274                 video_format.add_option('-b', '--best-quality',
2275                                 action='store_true', dest='bestquality', help='download the best video quality (DEPRECATED)')
2276                 parser.add_option_group(video_format)
2277
2278                 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2279                 verbosity.add_option('-q', '--quiet',
2280                                 action='store_true', dest='quiet', help='activates quiet mode', default=False)
2281                 verbosity.add_option('-s', '--simulate',
2282                                 action='store_true', dest='simulate', help='do not download video', default=False)
2283                 verbosity.add_option('-g', '--get-url',
2284                                 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2285                 verbosity.add_option('-e', '--get-title',
2286                                 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
2287                 verbosity.add_option('--get-thumbnail',
2288                                 action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
2289                 verbosity.add_option('--get-description',
2290                                 action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
2291                 verbosity.add_option('--no-progress',
2292                                 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
2293                 parser.add_option_group(verbosity)
2294
2295                 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
2296                 filesystem.add_option('-t', '--title',
2297                                 action='store_true', dest='usetitle', help='use title in file name', default=False)
2298                 filesystem.add_option('-l', '--literal',
2299                                 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
2300                 filesystem.add_option('-A', '--auto-number',
2301                                 action='store_true', dest='autonumber', help='number downloaded files starting from 00000', default=False)
2302                 filesystem.add_option('-o', '--output',
2303                                 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
2304                 filesystem.add_option('-a', '--batch-file',
2305                                 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
2306                 filesystem.add_option('-w', '--no-overwrites',
2307                                 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
2308                 filesystem.add_option('-c', '--continue',
2309                                 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
2310                 filesystem.add_option('--cookies',
2311                                 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
2312                 parser.add_option_group(filesystem)
2313
2314                 (opts, args) = parser.parse_args()
2315
2316                 # Open appropriate CookieJar
2317                 if opts.cookiefile is None:
2318                         jar = cookielib.CookieJar()
2319                 else:
2320                         try:
2321                                 jar = cookielib.MozillaCookieJar(opts.cookiefile)
2322                                 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
2323                                         jar.load()
2324                         except (IOError, OSError), err:
2325                                 sys.exit(u'ERROR: unable to open cookie file')
2326
2327                 # General configuration
2328                 cookie_processor = urllib2.HTTPCookieProcessor(jar)
2329                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
2330                 urllib2.install_opener(urllib2.build_opener(cookie_processor))
2331                 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
2332
2333                 # Batch file verification
2334                 batchurls = []
2335                 if opts.batchfile is not None:
2336                         try:
2337                                 if opts.batchfile == '-':
2338                                         batchfd = sys.stdin
2339                                 else:
2340                                         batchfd = open(opts.batchfile, 'r')
2341                                 batchurls = batchfd.readlines()
2342                                 batchurls = [x.strip() for x in batchurls]
2343                                 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
2344                         except IOError:
2345                                 sys.exit(u'ERROR: batch file could not be read')
2346                 all_urls = batchurls + args
2347
		# Conflicting, missing and erroneous options.
		# Checked up front so bad invocations fail before any network
		# work; parser.error() prints the message and exits.
		if opts.bestquality:
			print >>sys.stderr, u'\nWARNING: -b/--best-quality IS DEPRECATED AS IT IS THE DEFAULT BEHAVIOR NOW\n'
		if opts.usenetrc and (opts.username is not None or opts.password is not None):
			parser.error(u'using .netrc conflicts with giving username/password')
		if opts.password is not None and opts.username is None:
			parser.error(u'account username missing')
		if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
			parser.error(u'using output template conflicts with using title, literal title or auto number')
		if opts.usetitle and opts.useliteral:
			parser.error(u'using title conflicts with using literal title')
		if opts.username is not None and opts.password is None:
			# Prompt interactively rather than requiring the password on
			# the command line, where it would be visible in `ps` output.
			opts.password = getpass.getpass(u'Type account password and press return:')
		if opts.ratelimit is not None:
			# FileDownloader.parse_bytes returns None when the limit
			# string cannot be parsed.
			numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
			if numeric_limit is None:
				parser.error(u'invalid rate limit specified')
			opts.ratelimit = numeric_limit
		if opts.retries is not None:
			try:
				opts.retries = long(opts.retries)
			except (TypeError, ValueError), err:
				parser.error(u'invalid retry count specified')
		try:
			# Playlist start must be a positive integer.
			opts.playliststart = long(opts.playliststart)
			if opts.playliststart <= 0:
				raise ValueError
		except (TypeError, ValueError), err:
			parser.error(u'invalid playlist start number specified')
		try:
			# -1 means "until the end of the playlist"; any other value
			# must be positive and must not precede the start index.
			opts.playlistend = long(opts.playlistend)
			if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
				raise ValueError
		except (TypeError, ValueError), err:
			parser.error(u'invalid playlist end number specified')
2383
2384                 # Information extractors
2385                 youtube_ie = YoutubeIE()
2386                 metacafe_ie = MetacafeIE(youtube_ie)
2387                 dailymotion_ie = DailymotionIE()
2388                 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
2389                 youtube_user_ie = YoutubeUserIE(youtube_ie)
2390                 youtube_search_ie = YoutubeSearchIE(youtube_ie)
2391                 google_ie = GoogleIE()
2392                 google_search_ie = GoogleSearchIE(google_ie)
2393                 photobucket_ie = PhotobucketIE()
2394                 yahoo_ie = YahooIE()
2395                 yahoo_search_ie = YahooSearchIE(yahoo_ie)
2396                 deposit_files_ie = DepositFilesIE()
2397                 generic_ie = GenericIE()
2398
2399                 # File downloader
2400                 fd = FileDownloader({
2401                         'usenetrc': opts.usenetrc,
2402                         'username': opts.username,
2403                         'password': opts.password,
2404                         'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2405                         'forceurl': opts.geturl,
2406                         'forcetitle': opts.gettitle,
2407                         'forcethumbnail': opts.getthumbnail,
2408                         'forcedescription': opts.getdescription,
2409                         'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2410                         'format': opts.format,
2411                         'format_limit': opts.format_limit,
2412                         'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
2413                                 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
2414                                 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
2415                                 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
2416                                 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
2417                                 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
2418                                 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
2419                                 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
2420                                 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
2421                                 or u'%(id)s.%(ext)s'),
2422                         'ignoreerrors': opts.ignoreerrors,
2423                         'ratelimit': opts.ratelimit,
2424                         'nooverwrites': opts.nooverwrites,
2425                         'retries': opts.retries,
2426                         'continuedl': opts.continue_dl,
2427                         'noprogress': opts.noprogress,
2428                         'playliststart': opts.playliststart,
2429                         'playlistend': opts.playlistend,
2430                         'logtostderr': opts.outtmpl == '-',
2431                         })
2432                 fd.add_info_extractor(youtube_search_ie)
2433                 fd.add_info_extractor(youtube_pl_ie)
2434                 fd.add_info_extractor(youtube_user_ie)
2435                 fd.add_info_extractor(metacafe_ie)
2436                 fd.add_info_extractor(dailymotion_ie)
2437                 fd.add_info_extractor(youtube_ie)
2438                 fd.add_info_extractor(google_ie)
2439                 fd.add_info_extractor(google_search_ie)
2440                 fd.add_info_extractor(photobucket_ie)
2441                 fd.add_info_extractor(yahoo_ie)
2442                 fd.add_info_extractor(yahoo_search_ie)
2443                 fd.add_info_extractor(deposit_files_ie)
2444
2445                 # This must come last since it's the
2446                 # fallback if none of the others work
2447                 fd.add_info_extractor(generic_ie)
2448
2449                 # Update version
2450                 if opts.update_self:
2451                         update_self(fd, sys.argv[0])
2452
2453                 # Maybe do nothing
2454                 if len(all_urls) < 1:
2455                         if not opts.update_self:
2456                                 parser.error(u'you must provide at least one URL')
2457                         else:
2458                                 sys.exit()
2459                 retcode = fd.download(all_urls)
2460
2461                 # Dump cookie jar if requested
2462                 if opts.cookiefile is not None:
2463                         try:
2464                                 jar.save()
2465                         except (IOError, OSError), err:
2466                                 sys.exit(u'ERROR: unable to save cookie jar')
2467
2468                 sys.exit(retcode)
2469
	except DownloadError:
		# Presumably already reported by the downloader machinery —
		# exit with a generic failure status only (TODO confirm).
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		# Ctrl-C: exit with a message instead of a traceback.
		sys.exit(u'\nERROR: Interrupted by user')