Support for youtube video streams (Mentioned in #108)
[youtube-dl.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # Author: Vasyl' Vavrychuk
7 # Author: Witold Baryluk
# Author: Paweł Paprota
9 # Author: Gergely Imreh
10 # Author: Philipp Hagemeister <phihag@phihag.de>
11 # License: Public domain code
12 from __future__ import with_statement
13 import contextlib
14 import cookielib
15 import ctypes
16 import datetime
17 import email.utils
18 import gzip
19 import htmlentitydefs
20 import httplib
21 import locale
22 import math
23 import netrc
24 import os
25 import os.path
26 import re
27 import socket
28 import string
29 import subprocess
30 import sys
31 import time
32 import urllib
33 import urllib2
34 import warnings
35 import zlib
36
37 try:
38         import cStringIO as StringIO
39 except ImportError:
40         import StringIO
41
42 # parse_qs was moved from the cgi module to the urlparse module recently.
43 try:
44         from urlparse import parse_qs
45 except ImportError:
46         from cgi import parse_qs
47
48 try:
49         import lxml.etree
50 except ImportError: # Python < 2.6
51         pass # Handled below
52
53 std_headers = {
54         'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:2.0b11) Gecko/20100101 Firefox/4.0b11',
55         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
56         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
57         'Accept-Encoding': 'gzip, deflate',
58         'Accept-Language': 'en-us,en;q=0.5',
59 }
60
61 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
62
63 try:
64         import json
65 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
66         import re
67         class json(object):
68                 @staticmethod
69                 def loads(s):
70                         s = s.decode('UTF-8')
71                         def raiseError(msg, i):
72                                 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
73                         def skipSpace(i, expectMore=True):
74                                 while i < len(s) and s[i] in ' \t\r\n':
75                                         i += 1
76                                 if expectMore:
77                                         if i >= len(s):
78                                                 raiseError('Premature end', i)
79                                 return i
80                         def decodeEscape(match):
81                                 esc = match.group(1)
82                                 _STATIC = {
83                                         '"': '"',
84                                         '\\': '\\',
85                                         '/': '/',
86                                         'b': unichr(0x8),
87                                         'f': unichr(0xc),
88                                         'n': '\n',
89                                         'r': '\r',
90                                         't': '\t',
91                                 }
92                                 if esc in _STATIC:
93                                         return _STATIC[esc]
94                                 if esc[0] == 'u':
95                                         if len(esc) == 1+4:
96                                                 return unichr(int(esc[1:5], 16))
97                                         if len(esc) == 5+6 and esc[5:7] == '\\u':
98                                                 hi = int(esc[1:5], 16)
99                                                 low = int(esc[7:11], 16)
100                                                 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
101                                 raise ValueError('Unknown escape ' + str(esc))
102                         def parseString(i):
103                                 i += 1
104                                 e = i
105                                 while True:
106                                         e = s.index('"', e)
107                                         bslashes = 0
108                                         while s[e-bslashes-1] == '\\':
109                                                 bslashes += 1
110                                         if bslashes % 2 == 1:
111                                                 e += 1
112                                                 continue
113                                         break
114                                 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
115                                 stri = rexp.sub(decodeEscape, s[i:e])
116                                 return (e+1,stri)
117                         def parseObj(i):
118                                 i += 1
119                                 res = {}
120                                 i = skipSpace(i)
121                                 if s[i] == '}': # Empty dictionary
122                                         return (i+1,res)
123                                 while True:
124                                         if s[i] != '"':
125                                                 raiseError('Expected a string object key', i)
126                                         i,key = parseString(i)
127                                         i = skipSpace(i)
128                                         if i >= len(s) or s[i] != ':':
129                                                 raiseError('Expected a colon', i)
130                                         i,val = parse(i+1)
131                                         res[key] = val
132                                         i = skipSpace(i)
133                                         if s[i] == '}':
134                                                 return (i+1, res)
135                                         if s[i] != ',':
136                                                 raiseError('Expected comma or closing curly brace', i)
137                                         i = skipSpace(i+1)
138                         def parseArray(i):
139                                 res = []
140                                 i = skipSpace(i+1)
141                                 if s[i] == ']': # Empty array
142                                         return (i+1,res)
143                                 while True:
144                                         i,val = parse(i)
145                                         res.append(val)
146                                         i = skipSpace(i) # Raise exception if premature end
147                                         if s[i] == ']':
148                                                 return (i+1, res)
149                                         if s[i] != ',':
150                                                 raiseError('Expected a comma or closing bracket', i)
151                                         i = skipSpace(i+1)
152                         def parseDiscrete(i):
153                                 for k,v in {'true': True, 'false': False, 'null': None}.items():
154                                         if s.startswith(k, i):
155                                                 return (i+len(k), v)
156                                 raiseError('Not a boolean (or null)', i)
157                         def parseNumber(i):
158                                 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
159                                 if mobj is None:
160                                         raiseError('Not a number', i)
161                                 nums = mobj.group(1)
162                                 if '.' in nums or 'e' in nums or 'E' in nums:
163                                         return (i+len(nums), float(nums))
164                                 return (i+len(nums), int(nums))
165                         CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
166                         def parse(i):
167                                 i = skipSpace(i)
168                                 i,res = CHARMAP.get(s[i], parseNumber)(i)
169                                 i = skipSpace(i, False)
170                                 return (i,res)
171                         i,res = parse(0)
172                         if i < len(s):
173                                 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
174                         return res
175
176 def preferredencoding():
177         """Get preferred encoding.
178
179         Returns the best encoding scheme for the system, based on
180         locale.getpreferredencoding() and some further tweaks.
181         """
182         def yield_preferredencoding():
183                 try:
184                         pref = locale.getpreferredencoding()
185                         u'TEST'.encode(pref)
186                 except:
187                         pref = 'UTF-8'
188                 while True:
189                         yield pref
190         return yield_preferredencoding().next()
191
192 def htmlentity_transform(matchobj):
193         """Transforms an HTML entity to a Unicode character.
194
195         This function receives a match object and is intended to be used with
196         the re.sub() function.
197         """
198         entity = matchobj.group(1)
199
200         # Known non-numeric HTML entity
201         if entity in htmlentitydefs.name2codepoint:
202                 return unichr(htmlentitydefs.name2codepoint[entity])
203
204         # Unicode character
205         mobj = re.match(ur'(?u)#(x?\d+)', entity)
206         if mobj is not None:
207                 numstr = mobj.group(1)
208                 if numstr.startswith(u'x'):
209                         base = 16
210                         numstr = u'0%s' % numstr
211                 else:
212                         base = 10
213                 return unichr(long(numstr, base))
214
215         # Unknown entity in name, return its literal representation
216         return (u'&%s;' % entity)
217
218 def sanitize_title(utitle):
219         """Sanitizes a video title so it could be used as part of a filename."""
220         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
221         return utitle.replace(unicode(os.sep), u'%')
222
223 def sanitize_open(filename, open_mode):
224         """Try to open the given filename, and slightly tweak it if this fails.
225
226         Attempts to open the given filename. If this fails, it tries to change
227         the filename slightly, step by step, until it's either able to open it
228         or it fails and raises a final exception, like the standard open()
229         function.
230
231         It returns the tuple (stream, definitive_file_name).
232         """
233         try:
234                 if filename == u'-':
235                         if sys.platform == 'win32':
236                                 import msvcrt
237                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
238                         return (sys.stdout, filename)
239                 stream = open(filename, open_mode)
240                 return (stream, filename)
241         except (IOError, OSError), err:
242                 # In case of error, try to remove win32 forbidden chars
243                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
244
245                 # An exception here should be caught in the caller
246                 stream = open(filename, open_mode)
247                 return (stream, filename)
248
249 def timeconvert(timestr):
250     """Convert RFC 2822 defined time string into system timestamp"""
251     timestamp = None
252     timetuple = email.utils.parsedate_tz(timestr)
253     if timetuple is not None:
254         timestamp = email.utils.mktime_tz(timetuple)
255     return timestamp
256
257 class DownloadError(Exception):
258         """Download Error exception.
259
260         This exception may be thrown by FileDownloader objects if they are not
261         configured to continue on errors. They will contain the appropriate
262         error message.
263         """
264         pass
265
266 class SameFileError(Exception):
267         """Same File exception.
268
269         This exception will be thrown by FileDownloader objects if they detect
270         multiple files would have to be downloaded to the same file on disk.
271         """
272         pass
273
274 class PostProcessingError(Exception):
275         """Post Processing exception.
276
277         This exception may be raised by PostProcessor's .run() method to
278         indicate an error in the postprocessing task.
279         """
280         pass
281
282 class UnavailableVideoError(Exception):
283         """Unavailable Format exception.
284
285         This exception will be thrown when a video is requested
286         in a format that is not available for that video.
287         """
288         pass
289
290 class ContentTooShortError(Exception):
291         """Content Too Short exception.
292
293         This exception may be raised by FileDownloader objects when a file they
294         download is too small for what the server announced first, indicating
295         the connection was probably interrupted.
296         """
297         # Both in bytes
298         downloaded = None
299         expected = None
300
301         def __init__(self, downloaded, expected):
302                 self.downloaded = downloaded
303                 self.expected = expected
304
305 class YoutubeDLHandler(urllib2.HTTPHandler):
306         """Handler for HTTP requests and responses.
307
308         This class, when installed with an OpenerDirector, automatically adds
309         the standard headers to every HTTP request and handles gzipped and
310         deflated responses from web servers. If compression is to be avoided in
311         a particular request, the original request in the program code only has
312         to include the HTTP header "Youtubedl-No-Compression", which will be
313         removed before making the real request.
314         
315         Part of this code was copied from:
316
317           http://techknack.net/python-urllib2-handlers/
318           
319         Andrew Rowls, the author of that code, agreed to release it to the
320         public domain.
321         """
322
323         @staticmethod
324         def deflate(data):
325                 try:
326                         return zlib.decompress(data, -zlib.MAX_WBITS)
327                 except zlib.error:
328                         return zlib.decompress(data)
329         
330         @staticmethod
331         def addinfourl_wrapper(stream, headers, url, code):
332                 if hasattr(urllib2.addinfourl, 'getcode'):
333                         return urllib2.addinfourl(stream, headers, url, code)
334                 ret = urllib2.addinfourl(stream, headers, url)
335                 ret.code = code
336                 return ret
337         
338         def http_request(self, req):
339                 for h in std_headers:
340                         if h in req.headers:
341                                 del req.headers[h]
342                         req.add_header(h, std_headers[h])
343                 if 'Youtubedl-no-compression' in req.headers:
344                         if 'Accept-encoding' in req.headers:
345                                 del req.headers['Accept-encoding']
346                         del req.headers['Youtubedl-no-compression']
347                 return req
348
349         def http_response(self, req, resp):
350                 old_resp = resp
351                 # gzip
352                 if resp.headers.get('Content-encoding', '') == 'gzip':
353                         gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
354                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
355                         resp.msg = old_resp.msg
356                 # deflate
357                 if resp.headers.get('Content-encoding', '') == 'deflate':
358                         gz = StringIO.StringIO(self.deflate(resp.read()))
359                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
360                         resp.msg = old_resp.msg
361                 return resp
362
363 class FileDownloader(object):
364         """File Downloader class.
365
366         File downloader objects are the ones responsible of downloading the
367         actual video file and writing it to disk if the user has requested
368         it, among some other tasks. In most cases there should be one per
369         program. As, given a video URL, the downloader doesn't know how to
370         extract all the needed information, task that InfoExtractors do, it
371         has to pass the URL to one of them.
372
373         For this, file downloader objects have a method that allows
374         InfoExtractors to be registered in a given order. When it is passed
375         a URL, the file downloader handles it to the first InfoExtractor it
376         finds that reports being able to handle it. The InfoExtractor extracts
377         all the information about the video or videos the URL refers to, and
378         asks the FileDownloader to process the video information, possibly
379         downloading the video.
380
381         File downloaders accept a lot of parameters. In order not to saturate
382         the object constructor with arguments, it receives a dictionary of
383         options instead. These options are available through the params
384         attribute for the InfoExtractors to use. The FileDownloader also
385         registers itself as the downloader in charge for the InfoExtractors
386         that are added to it, so this is a "mutual registration".
387
388         Available options:
389
390         username:         Username for authentication purposes.
391         password:         Password for authentication purposes.
392         usenetrc:         Use netrc for authentication instead.
393         quiet:            Do not print messages to stdout.
394         forceurl:         Force printing final URL.
395         forcetitle:       Force printing title.
396         forcethumbnail:   Force printing thumbnail URL.
397         forcedescription: Force printing description.
398         forcefilename:    Force printing final filename.
399         simulate:         Do not download the video files.
400         format:           Video format code.
401         format_limit:     Highest quality format to try.
402         outtmpl:          Template for output names.
403         ignoreerrors:     Do not stop on download errors.
404         ratelimit:        Download speed limit, in bytes/sec.
405         nooverwrites:     Prevent overwriting files.
406         retries:          Number of times to retry for HTTP error 5xx
407         continuedl:       Try to continue downloads if possible.
408         noprogress:       Do not print the progress bar.
409         playliststart:    Playlist item to start at.
410         playlistend:      Playlist item to end at.
411         logtostderr:      Log messages to stderr instead of stdout.
412         consoletitle:     Display progress in console window's titlebar.
413         nopart:           Do not use temporary .part files.
414         updatetime:       Use the Last-modified header to set output file timestamps.
415         writedescription: Write the video description to a .description file
416         writeinfojson:    Write the video description to a .info.json file
417         """
418
419         params = None
420         _ies = []
421         _pps = []
422         _download_retcode = None
423         _num_downloads = None
424         _screen_file = None
425
426         def __init__(self, params):
427                 """Create a FileDownloader object with the given options."""
428                 self._ies = []
429                 self._pps = []
430                 self._download_retcode = 0
431                 self._num_downloads = 0
432                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
433                 self.params = params
434
435         @staticmethod
436         def pmkdir(filename):
437                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
438                 components = filename.split(os.sep)
439                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
440                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
441                 for dir in aggregate:
442                         if not os.path.exists(dir):
443                                 os.mkdir(dir)
444
445         @staticmethod
446         def format_bytes(bytes):
447                 if bytes is None:
448                         return 'N/A'
449                 if type(bytes) is str:
450                         bytes = float(bytes)
451                 if bytes == 0.0:
452                         exponent = 0
453                 else:
454                         exponent = long(math.log(bytes, 1024.0))
455                 suffix = 'bkMGTPEZY'[exponent]
456                 converted = float(bytes) / float(1024**exponent)
457                 return '%.2f%s' % (converted, suffix)
458
459         @staticmethod
460         def calc_percent(byte_counter, data_len):
461                 if data_len is None:
462                         return '---.-%'
463                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
464
465         @staticmethod
466         def calc_eta(start, now, total, current):
467                 if total is None:
468                         return '--:--'
469                 dif = now - start
470                 if current == 0 or dif < 0.001: # One millisecond
471                         return '--:--'
472                 rate = float(current) / dif
473                 eta = long((float(total) - float(current)) / rate)
474                 (eta_mins, eta_secs) = divmod(eta, 60)
475                 if eta_mins > 99:
476                         return '--:--'
477                 return '%02d:%02d' % (eta_mins, eta_secs)
478
479         @staticmethod
480         def calc_speed(start, now, bytes):
481                 dif = now - start
482                 if bytes == 0 or dif < 0.001: # One millisecond
483                         return '%10s' % '---b/s'
484                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
485
486         @staticmethod
487         def best_block_size(elapsed_time, bytes):
488                 new_min = max(bytes / 2.0, 1.0)
489                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
490                 if elapsed_time < 0.001:
491                         return long(new_max)
492                 rate = bytes / elapsed_time
493                 if rate > new_max:
494                         return long(new_max)
495                 if rate < new_min:
496                         return long(new_min)
497                 return long(rate)
498
499         @staticmethod
500         def parse_bytes(bytestr):
501                 """Parse a string indicating a byte quantity into a long integer."""
502                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
503                 if matchobj is None:
504                         return None
505                 number = float(matchobj.group(1))
506                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
507                 return long(round(number * multiplier))
508
509         def add_info_extractor(self, ie):
510                 """Add an InfoExtractor object to the end of the list."""
511                 self._ies.append(ie)
512                 ie.set_downloader(self)
513
514         def add_post_processor(self, pp):
515                 """Add a PostProcessor object to the end of the chain."""
516                 self._pps.append(pp)
517                 pp.set_downloader(self)
518
519         def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
520                 """Print message to stdout if not in quiet mode."""
521                 try:
522                         if not self.params.get('quiet', False):
523                                 terminator = [u'\n', u''][skip_eol]
524                                 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
525                         self._screen_file.flush()
526                 except (UnicodeEncodeError), err:
527                         if not ignore_encoding_errors:
528                                 raise
529
530         def to_stderr(self, message):
531                 """Print message to stderr."""
532                 print >>sys.stderr, message.encode(preferredencoding())
533
534         def to_cons_title(self, message):
535                 """Set console/terminal window title to message."""
536                 if not self.params.get('consoletitle', False):
537                         return
538                 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
539                         # c_wchar_p() might not be necessary if `message` is
540                         # already of type unicode()
541                         ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
542                 elif 'TERM' in os.environ:
543                         sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
544
545         def fixed_template(self):
546                 """Checks if the output template is fixed."""
547                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
548
549         def trouble(self, message=None):
550                 """Determine action to take when a download problem appears.
551
552                 Depending on if the downloader has been configured to ignore
553                 download errors or not, this method may throw an exception or
554                 not when errors are found, after printing the message.
555                 """
556                 if message is not None:
557                         self.to_stderr(message)
558                 if not self.params.get('ignoreerrors', False):
559                         raise DownloadError(message)
560                 self._download_retcode = 1
561
562         def slow_down(self, start_time, byte_counter):
563                 """Sleep if the download speed is over the rate limit."""
564                 rate_limit = self.params.get('ratelimit', None)
565                 if rate_limit is None or byte_counter == 0:
566                         return
567                 now = time.time()
568                 elapsed = now - start_time
569                 if elapsed <= 0.0:
570                         return
571                 speed = float(byte_counter) / elapsed
572                 if speed > rate_limit:
573                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
574
575         def temp_name(self, filename):
576                 """Returns a temporary filename for the given filename."""
577                 if self.params.get('nopart', False) or filename == u'-' or \
578                                 (os.path.exists(filename) and not os.path.isfile(filename)):
579                         return filename
580                 return filename + u'.part'
581
582         def undo_temp_name(self, filename):
583                 if filename.endswith(u'.part'):
584                         return filename[:-len(u'.part')]
585                 return filename
586
587         def try_rename(self, old_filename, new_filename):
588                 try:
589                         if old_filename == new_filename:
590                                 return
591                         os.rename(old_filename, new_filename)
592                 except (IOError, OSError), err:
593                         self.trouble(u'ERROR: unable to rename file')
594         
595         def try_utime(self, filename, last_modified_hdr):
596                 """Try to set the last-modified time of the given file."""
597                 if last_modified_hdr is None:
598                         return
599                 if not os.path.isfile(filename):
600                         return
601                 timestr = last_modified_hdr
602                 if timestr is None:
603                         return
604                 filetime = timeconvert(timestr)
605                 if filetime is None:
606                         return
607                 try:
608                         os.utime(filename,(time.time(), filetime))
609                 except:
610                         pass
611
612         def report_writedescription(self, descfn):
613                 """ Report that the description file is being written """
614                 self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
615
616         def report_writeinfojson(self, infofn):
617                 """ Report that the metadata file has been written """
618                 self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
619
620         def report_destination(self, filename):
621                 """Report destination filename."""
622                 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
623
624         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
625                 """Report download progress."""
626                 if self.params.get('noprogress', False):
627                         return
628                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
629                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
630                 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
631                                 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
632
633         def report_resuming_byte(self, resume_len):
634                 """Report attempt to resume at given byte."""
635                 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
636
637         def report_retry(self, count, retries):
638                 """Report retry in case of HTTP error 5xx"""
639                 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
640
641         def report_file_already_downloaded(self, file_name):
642                 """Report file has already been fully downloaded."""
643                 try:
644                         self.to_screen(u'[download] %s has already been downloaded' % file_name)
645                 except (UnicodeEncodeError), err:
646                         self.to_screen(u'[download] The file has already been downloaded')
647
648         def report_unable_to_resume(self):
649                 """Report it was impossible to resume download."""
650                 self.to_screen(u'[download] Unable to resume')
651
652         def report_finish(self):
653                 """Report download finished."""
654                 if self.params.get('noprogress', False):
655                         self.to_screen(u'[download] Download completed')
656                 else:
657                         self.to_screen(u'')
658
659         def increment_downloads(self):
660                 """Increment the ordinal that assigns a number to each file."""
661                 self._num_downloads += 1
662
663         def prepare_filename(self, info_dict):
664                 """Generate the output filename."""
665                 try:
666                         template_dict = dict(info_dict)
667                         template_dict['epoch'] = unicode(long(time.time()))
668                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
669                         filename = self.params['outtmpl'] % template_dict
670                         return filename
671                 except (ValueError, KeyError), err:
672                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
673                         return None
674
	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor.

		Drives the whole per-video pipeline: simulation / forced
		printing, overwrite protection, directory creation, optional
		.description and .info.json side-car files, the actual download
		and the postprocessing chain. Most failures are reported via
		self.trouble() and end processing of this video; local I/O
		errors during the download raise UnavailableVideoError.
		"""
		filename = self.prepare_filename(info_dict)
		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			# Forced printings
			if self.params.get('forcetitle', False):
				print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forceurl', False):
				print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
				print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcedescription', False) and 'description' in info_dict:
				print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcefilename', False) and filename is not None:
				print filename.encode(preferredencoding(), 'xmlcharrefreplace')

			return

		# prepare_filename() returns None on template errors; already reported there.
		if filename is None:
			return
		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists and will be skipped')
			return

		try:
			self.pmkdir(filename)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to create directories: %s' % str(err))
			return

		if self.params.get('writedescription', False):
			try:
				descfn = filename + '.description'
				self.report_writedescription(descfn)
				with contextlib.closing(open(descfn, 'wb')) as descfile:
					descfile.write(info_dict['description'].encode('utf-8'))
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write description file: %s' % str(descfn))
				return

		if self.params.get('writeinfojson', False):
			infofn = filename + '.info.json'
			self.report_writeinfojson(infofn)
			try:
				# Probe for a usable encoder: the trivialjson fallback used
				# on Python < 2.6 only provides loads(), not dump().
				json.dump
			except (NameError,AttributeError):
				self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
				return
			try:
				with contextlib.closing(open(infofn, 'wb')) as infof:
					json.dump(info_dict, infof)
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write metadata to JSON file: %s' % str(infofn))
				return

		try:
			success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
		except (OSError, IOError), err:
			# Local I/O problems are treated as the video being unavailable.
			raise UnavailableVideoError
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self.trouble(u'ERROR: unable to download video data: %s' % str(err))
			return
		except (ContentTooShortError, ), err:
			self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
			return

		if success:
			try:
				self.post_process(filename, info_dict)
			except (PostProcessingError), err:
				self.trouble(u'ERROR: postprocessing: %s' % str(err))
				return
748
749         def download(self, url_list):
750                 """Download a given list of URLs."""
751                 if len(url_list) > 1 and self.fixed_template():
752                         raise SameFileError(self.params['outtmpl'])
753
754                 for url in url_list:
755                         suitable_found = False
756                         for ie in self._ies:
757                                 # Go to next InfoExtractor if not suitable
758                                 if not ie.suitable(url):
759                                         continue
760
761                                 # Suitable InfoExtractor found
762                                 suitable_found = True
763
764                                 # Extract information from URL and process it
765                                 ie.extract(url)
766
767                                 # Suitable InfoExtractor had been found; go to next URL
768                                 break
769
770                         if not suitable_found:
771                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
772
773                 return self._download_retcode
774
775         def post_process(self, filename, ie_info):
776                 """Run the postprocessing chain on the given file."""
777                 info = dict(ie_info)
778                 info['filepath'] = filename
779                 for pp in self._pps:
780                         info = pp.run(info)
781                         if info is None:
782                                 break
783
784         def _download_with_rtmpdump(self, filename, url, player_url):
785                 self.report_destination(filename)
786                 tmpfilename = self.temp_name(filename)
787
788                 # Check for rtmpdump first
789                 try:
790                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
791                 except (OSError, IOError):
792                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
793                         return False
794
795                 # Download using rtmpdump. rtmpdump returns exit code 2 when
796                 # the connection was interrumpted and resuming appears to be
797                 # possible. This is part of rtmpdump's normal usage, AFAIK.
798                 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
799                 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
800                 while retval == 2 or retval == 1:
801                         prevsize = os.path.getsize(tmpfilename)
802                         self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
803                         time.sleep(5.0) # This seems to be needed
804                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
805                         cursize = os.path.getsize(tmpfilename)
806                         if prevsize == cursize and retval == 1:
807                                 break
808                 if retval == 0:
809                         self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
810                         self.try_rename(tmpfilename, filename)
811                         return True
812                 else:
813                         self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
814                         return False
815
	def _do_download(self, filename, url, player_url):
		"""Download url into filename over HTTP, or delegate to rtmpdump.

		Handles resuming a previous partial download (.part file),
		retrying on HTTP 5xx errors, adaptive block sizing, progress
		reporting and rate limiting. Returns True on success, False on
		reported failure; raises ContentTooShortError when the server
		sent fewer bytes than announced.
		"""
		# Check file already present
		if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
			self.report_file_already_downloaded(filename)
			return True

		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url, player_url)

		tmpfilename = self.temp_name(filename)
		stream = None
		open_mode = 'wb'

		# Do not include the Accept-Encoding header
		headers = {'Youtubedl-no-compression': 'True'}
		basic_request = urllib2.Request(url, None, headers)
		request = urllib2.Request(url, None, headers)

		# Establish possible resume length
		if os.path.isfile(tmpfilename):
			resume_len = os.path.getsize(tmpfilename)
		else:
			resume_len = 0

		# Request parameters in case of being able to resume
		if self.params.get('continuedl', False) and resume_len != 0:
			self.report_resuming_byte(resume_len)
			request.add_header('Range','bytes=%d-' % resume_len)
			open_mode = 'ab'

		count = 0
		retries = self.params.get('retries', 0)
		while count <= retries:
			# Establish connection
			try:
				data = urllib2.urlopen(request)
				break
			except (urllib2.HTTPError, ), err:
				if (err.code < 500 or err.code >= 600) and err.code != 416:
					# Unexpected HTTP error
					raise
				elif err.code == 416:
					# Unable to resume (requested range not satisfiable)
					try:
						# Open the connection again without the range header
						data = urllib2.urlopen(basic_request)
						content_length = data.info()['Content-Length']
					except (urllib2.HTTPError, ), err:
						if err.code < 500 or err.code >= 600:
							raise
					else:
						# Examine the reported length
						if (content_length is not None and
						    (resume_len - 100 < long(content_length) < resume_len + 100)):
							# The file had already been fully downloaded.
							# Explanation to the above condition: in issue #175 it was revealed that
							# YouTube sometimes adds or removes a few bytes from the end of the file,
							# changing the file size slightly and causing problems for some users. So
							# I decided to implement a suggested change and consider the file
							# completely downloaded if the file size differs less than 100 bytes from
							# the one in the hard drive.
							self.report_file_already_downloaded(filename)
							self.try_rename(tmpfilename, filename)
							return True
						else:
							# The length does not match, we start the download over
							self.report_unable_to_resume()
							open_mode = 'wb'
							break
			# Retry
			count += 1
			if count <= retries:
				self.report_retry(count, retries)

		if count > retries:
			self.trouble(u'ERROR: giving up after %s retries' % retries)
			return False

		# Total expected size includes the already-downloaded prefix.
		data_len = data.info().get('Content-length', None)
		if data_len is not None:
			data_len = long(data_len) + resume_len
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0 + resume_len
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			if len(data_block) == 0:
				break
			byte_counter += len(data_block)

			# Open file just in time
			if stream is None:
				try:
					(stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
					filename = self.undo_temp_name(tmpfilename)
					self.report_destination(filename)
				except (OSError, IOError), err:
					self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
					return False
			try:
				stream.write(data_block)
			except (IOError, OSError), err:
				self.trouble(u'\nERROR: unable to write data: %s' % str(err))
				return False
			# Adapt the block size to the measured transfer speed.
			block_size = self.best_block_size(after - before, len(data_block))

			# Progress message
			percent_str = self.calc_percent(byte_counter, data_len)
			eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
			speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
			self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter - resume_len)

		stream.close()
		self.report_finish()
		if data_len is not None and byte_counter != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		self.try_rename(tmpfilename, filename)

		# Update file modification time
		if self.params.get('updatetime', True):
			self.try_utime(filename, data.info().get('last-modified', None))

		return True
947
class InfoExtractor(object):
	"""Base class for all information extractors.

	An information extractor takes a URL and extracts data about the
	video (or videos) it refers to: the real video URL, title,
	simplified title, uploader and more. The result is a dictionary
	handed to the FileDownloader, which may then download the video,
	among other outcomes. Each dictionary must carry these fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	Optional fields, used only by the forced-printing functions (so a
	video search frontend such as youtube2mp3 can be built on top):

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Subclasses should redefine _real_initialize() and _real_extract(),
	as well as the suitable() static method, and are typically
	instantiated and registered with the main downloader.
	"""

	# Whether _real_initialize() has already run for this instance.
	_ready = False
	# The FileDownloader this extractor reports to (may stay None).
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	@staticmethod
	def suitable(url):
		"""Return True if this IE can handle the given URL."""
		return False

	def initialize(self):
		"""Run one-time initialization (authentication, etc.) lazily."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Initialize if needed, then extract and return the URL's information."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Attach the downloader this IE reports through."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
1018
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Accepts bare video IDs as well as watch/embed/e/v and youtu.be URLs;
	# group 2 captures the video ID.
	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
	# URL used to force the site language to English before scraping.
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	# Machine name looked up in ~/.netrc for stored credentials.
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality (itag format codes, best first).
	_available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
	# Maps itag format codes to file extensions.
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'45': 'webm',
	}
1039
1040         @staticmethod
1041         def suitable(url):
1042                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
1043
1044         def report_lang(self):
1045                 """Report attempt to set language."""
1046                 self._downloader.to_screen(u'[youtube] Setting language')
1047
1048         def report_login(self):
1049                 """Report attempt to log in."""
1050                 self._downloader.to_screen(u'[youtube] Logging in')
1051
1052         def report_age_confirmation(self):
1053                 """Report attempt to confirm age."""
1054                 self._downloader.to_screen(u'[youtube] Confirming age')
1055
1056         def report_video_webpage_download(self, video_id):
1057                 """Report attempt to download video webpage."""
1058                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1059
1060         def report_video_info_webpage_download(self, video_id):
1061                 """Report attempt to download video info webpage."""
1062                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1063
1064         def report_information_extraction(self, video_id):
1065                 """Report attempt to extract video information."""
1066                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1067
1068         def report_unavailable_format(self, video_id, format):
1069                 """Report extracted video URL."""
1070                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1071
1072         def report_rtmp_download(self):
1073                 """Indicate the download will use the RTMP protocol."""
1074                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1075
	def _real_initialize(self):
		"""Force English pages, then log in and confirm age when possible.

		Credentials come from the downloader parameters or from the
		'youtube' machine entry in ~/.netrc. Language and login failures
		are only warnings; a failed age confirmation is reported as an
		error. Does nothing when no downloader is attached.
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language (cookie-based; needed so later scraping sees English text)
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':		'/',
				'action_login':	'Log In',
				'username':	username,
				'password':	password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the login form is still present, the credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':		'/',
				'action_confirm':	'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return
1144
1145         def _real_extract(self, url):
1146                 # Extract video id from URL
1147                 mobj = re.match(self._VALID_URL, url)
1148                 if mobj is None:
1149                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1150                         return
1151                 video_id = mobj.group(2)
1152
1153                 # Get video webpage
1154                 self.report_video_webpage_download(video_id)
1155                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id)
1156                 try:
1157                         video_webpage = urllib2.urlopen(request).read()
1158                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1159                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1160                         return
1161
1162                 # Attempt to extract SWF player URL
1163                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1164                 if mobj is not None:
1165                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1166                 else:
1167                         player_url = None
1168
1169                 # Get video info
1170                 self.report_video_info_webpage_download(video_id)
1171                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1172                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1173                                            % (video_id, el_type))
1174                         request = urllib2.Request(video_info_url)
1175                         try:
1176                                 video_info_webpage = urllib2.urlopen(request).read()
1177                                 video_info = parse_qs(video_info_webpage)
1178                                 if 'token' in video_info:
1179                                         break
1180                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1181                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1182                                 return
1183                 if 'token' not in video_info:
1184                         if 'reason' in video_info:
1185                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1186                         else:
1187                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1188                         return
1189
1190                 # Start extracting information
1191                 self.report_information_extraction(video_id)
1192
1193                 # uploader
1194                 if 'author' not in video_info:
1195                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1196                         return
1197                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1198
1199                 # title
1200                 if 'title' not in video_info:
1201                         self._downloader.trouble(u'ERROR: unable to extract video title')
1202                         return
1203                 video_title = urllib.unquote_plus(video_info['title'][0])
1204                 video_title = video_title.decode('utf-8')
1205                 video_title = sanitize_title(video_title)
1206
1207                 # simplified title
1208                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1209                 simple_title = simple_title.strip(ur'_')
1210
1211                 # thumbnail image
1212                 if 'thumbnail_url' not in video_info:
1213                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1214                         video_thumbnail = ''
1215                 else:   # don't panic if we can't find it
1216                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1217
1218                 # upload date
1219                 upload_date = u'NA'
1220                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1221                 if mobj is not None:
1222                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1223                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1224                         for expression in format_expressions:
1225                                 try:
1226                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1227                                 except:
1228                                         pass
1229
1230                 # description
1231                 try:
1232                         lxml.etree
1233                 except NameError:
1234                         video_description = u'No description available.'
1235                         if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1236                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1237                                 if mobj is not None:
1238                                         video_description = mobj.group(1).decode('utf-8')
1239                 else:
1240                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1241                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1242                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1243                         # TODO use another parser
1244
1245                 # token
1246                 video_token = urllib.unquote_plus(video_info['token'][0])
1247
1248                 # Decide which formats to download
1249                 req_format = self._downloader.params.get('format', None)
1250
1251                 raw_map = None
1252                 if 'fmt_url_map' in video_info and len(video_info['fmt_url_map']) >= 1:
1253                         raw_map = video_info['fmt_url_map'][0]
1254                 elif 'fmt_stream_map' in video_info and len(video_info['fmt_stream_map']) >= 1:
1255                         raw_map = video_info['fmt_stream_map'][0]
1256
1257                 if raw_map is not None:
1258                         url_map = dict(tuple(pair.split('|')[:2]) for pair in raw_map.split(','))
1259                         format_limit = self._downloader.params.get('format_limit', None)
1260                         if format_limit is not None and format_limit in self._available_formats:
1261                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1262                         else:
1263                                 format_list = self._available_formats
1264                         existing_formats = [x for x in format_list if x in url_map]
1265                         if len(existing_formats) == 0:
1266                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1267                                 return
1268                         if req_format is None:
1269                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1270                         elif req_format == '-1':
1271                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1272                         else:
1273                                 # Specific format
1274                                 if req_format not in url_map:
1275                                         self._downloader.trouble(u'ERROR: requested format not available')
1276                                         return
1277                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
1278
1279                 elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1280                         self.report_rtmp_download()
1281                         video_url_list = [(None, video_info['conn'][0])]
1282
1283                 else:
1284                         self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
1285                         return
1286
1287                 for format_param, video_real_url in video_url_list:
1288                         # At this point we have a new video
1289                         self._downloader.increment_downloads()
1290
1291                         # Extension
1292                         video_extension = self._video_extensions.get(format_param, 'flv')
1293
1294                         # Find the video URL in fmt_url_map or conn paramters
1295                         try:
1296                                 # Process video information
1297                                 self._downloader.process_info({
1298                                         'id':           video_id.decode('utf-8'),
1299                                         'url':          video_real_url.decode('utf-8'),
1300                                         'uploader':     video_uploader.decode('utf-8'),
1301                                         'upload_date':  upload_date,
1302                                         'title':        video_title,
1303                                         'stitle':       simple_title,
1304                                         'ext':          video_extension.decode('utf-8'),
1305                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1306                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1307                                         'description':  video_description,
1308                                         'player_url':   player_url,
1309                                 })
1310                         except UnavailableVideoError, err:
1311                                 self._downloader.trouble(u'\nERROR: unable to download video')
1312
1313
1314 class MetacafeIE(InfoExtractor):
1315         """Information Extractor for metacafe.com."""
1316
1317         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1318         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1319         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1320         _youtube_ie = None
1321
1322         def __init__(self, youtube_ie, downloader=None):
1323                 InfoExtractor.__init__(self, downloader)
1324                 self._youtube_ie = youtube_ie
1325
1326         @staticmethod
1327         def suitable(url):
1328                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1329
1330         def report_disclaimer(self):
1331                 """Report disclaimer retrieval."""
1332                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1333
1334         def report_age_confirmation(self):
1335                 """Report attempt to confirm age."""
1336                 self._downloader.to_screen(u'[metacafe] Confirming age')
1337
1338         def report_download_webpage(self, video_id):
1339                 """Report webpage download."""
1340                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1341
1342         def report_extraction(self, video_id):
1343                 """Report information extraction."""
1344                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1345
1346         def _real_initialize(self):
1347                 # Retrieve disclaimer
1348                 request = urllib2.Request(self._DISCLAIMER)
1349                 try:
1350                         self.report_disclaimer()
1351                         disclaimer = urllib2.urlopen(request).read()
1352                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1353                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1354                         return
1355
1356                 # Confirm age
1357                 disclaimer_form = {
1358                         'filters': '0',
1359                         'submit': "Continue - I'm over 18",
1360                         }
1361                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1362                 try:
1363                         self.report_age_confirmation()
1364                         disclaimer = urllib2.urlopen(request).read()
1365                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1366                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1367                         return
1368
1369         def _real_extract(self, url):
1370                 # Extract id and simplified title from URL
1371                 mobj = re.match(self._VALID_URL, url)
1372                 if mobj is None:
1373                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1374                         return
1375
1376                 video_id = mobj.group(1)
1377
1378                 # Check if video comes from YouTube
1379                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1380                 if mobj2 is not None:
1381                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1382                         return
1383
1384                 # At this point we have a new video
1385                 self._downloader.increment_downloads()
1386
1387                 simple_title = mobj.group(2).decode('utf-8')
1388
1389                 # Retrieve video webpage to extract further information
1390                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1391                 try:
1392                         self.report_download_webpage(video_id)
1393                         webpage = urllib2.urlopen(request).read()
1394                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1395                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1396                         return
1397
1398                 # Extract URL, uploader and title from webpage
1399                 self.report_extraction(video_id)
1400                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1401                 if mobj is not None:
1402                         mediaURL = urllib.unquote(mobj.group(1))
1403                         video_extension = mediaURL[-3:]
1404
1405                         # Extract gdaKey if available
1406                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1407                         if mobj is None:
1408                                 video_url = mediaURL
1409                         else:
1410                                 gdaKey = mobj.group(1)
1411                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1412                 else:
1413                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1414                         if mobj is None:
1415                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1416                                 return
1417                         vardict = parse_qs(mobj.group(1))
1418                         if 'mediaData' not in vardict:
1419                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1420                                 return
1421                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1422                         if mobj is None:
1423                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1424                                 return
1425                         mediaURL = mobj.group(1).replace('\\/', '/')
1426                         video_extension = mediaURL[-3:]
1427                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1428
1429                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1430                 if mobj is None:
1431                         self._downloader.trouble(u'ERROR: unable to extract title')
1432                         return
1433                 video_title = mobj.group(1).decode('utf-8')
1434                 video_title = sanitize_title(video_title)
1435
1436                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1437                 if mobj is None:
1438                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1439                         return
1440                 video_uploader = mobj.group(1)
1441
1442                 try:
1443                         # Process video information
1444                         self._downloader.process_info({
1445                                 'id':           video_id.decode('utf-8'),
1446                                 'url':          video_url.decode('utf-8'),
1447                                 'uploader':     video_uploader.decode('utf-8'),
1448                                 'upload_date':  u'NA',
1449                                 'title':        video_title,
1450                                 'stitle':       simple_title,
1451                                 'ext':          video_extension.decode('utf-8'),
1452                                 'format':       u'NA',
1453                                 'player_url':   None,
1454                         })
1455                 except UnavailableVideoError:
1456                         self._downloader.trouble(u'\nERROR: unable to download video')
1457
1458
1459 class DailymotionIE(InfoExtractor):
1460         """Information Extractor for Dailymotion"""
1461
1462         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1463
1464         def __init__(self, downloader=None):
1465                 InfoExtractor.__init__(self, downloader)
1466
1467         @staticmethod
1468         def suitable(url):
1469                 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1470
1471         def report_download_webpage(self, video_id):
1472                 """Report webpage download."""
1473                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1474
1475         def report_extraction(self, video_id):
1476                 """Report information extraction."""
1477                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1478
1479         def _real_initialize(self):
1480                 return
1481
1482         def _real_extract(self, url):
1483                 # Extract id and simplified title from URL
1484                 mobj = re.match(self._VALID_URL, url)
1485                 if mobj is None:
1486                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1487                         return
1488
1489                 # At this point we have a new video
1490                 self._downloader.increment_downloads()
1491                 video_id = mobj.group(1)
1492
1493                 simple_title = mobj.group(2).decode('utf-8')
1494                 video_extension = 'flv'
1495
1496                 # Retrieve video webpage to extract further information
1497                 request = urllib2.Request(url)
1498                 try:
1499                         self.report_download_webpage(video_id)
1500                         webpage = urllib2.urlopen(request).read()
1501                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1502                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1503                         return
1504
1505                 # Extract URL, uploader and title from webpage
1506                 self.report_extraction(video_id)
1507                 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1508                 if mobj is None:
1509                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1510                         return
1511                 mediaURL = urllib.unquote(mobj.group(1))
1512
1513                 # if needed add http://www.dailymotion.com/ if relative URL
1514
1515                 video_url = mediaURL
1516
1517                 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1518                 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1519                 if mobj is None:
1520                         self._downloader.trouble(u'ERROR: unable to extract title')
1521                         return
1522                 video_title = mobj.group(1).decode('utf-8')
1523                 video_title = sanitize_title(video_title)
1524
1525                 mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
1526                 if mobj is None:
1527                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1528                         return
1529                 video_uploader = mobj.group(1)
1530
1531                 try:
1532                         # Process video information
1533                         self._downloader.process_info({
1534                                 'id':           video_id.decode('utf-8'),
1535                                 'url':          video_url.decode('utf-8'),
1536                                 'uploader':     video_uploader.decode('utf-8'),
1537                                 'upload_date':  u'NA',
1538                                 'title':        video_title,
1539                                 'stitle':       simple_title,
1540                                 'ext':          video_extension.decode('utf-8'),
1541                                 'format':       u'NA',
1542                                 'player_url':   None,
1543                         })
1544                 except UnavailableVideoError:
1545                         self._downloader.trouble(u'\nERROR: unable to download video')
1546
1547 class GoogleIE(InfoExtractor):
1548         """Information extractor for video.google.com."""
1549
1550         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1551
1552         def __init__(self, downloader=None):
1553                 InfoExtractor.__init__(self, downloader)
1554
1555         @staticmethod
1556         def suitable(url):
1557                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1558
1559         def report_download_webpage(self, video_id):
1560                 """Report webpage download."""
1561                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1562
1563         def report_extraction(self, video_id):
1564                 """Report information extraction."""
1565                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1566
1567         def _real_initialize(self):
1568                 return
1569
1570         def _real_extract(self, url):
1571                 # Extract id from URL
1572                 mobj = re.match(self._VALID_URL, url)
1573                 if mobj is None:
1574                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1575                         return
1576
1577                 # At this point we have a new video
1578                 self._downloader.increment_downloads()
1579                 video_id = mobj.group(1)
1580
1581                 video_extension = 'mp4'
1582
1583                 # Retrieve video webpage to extract further information
1584                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1585                 try:
1586                         self.report_download_webpage(video_id)
1587                         webpage = urllib2.urlopen(request).read()
1588                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1589                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1590                         return
1591
1592                 # Extract URL, uploader, and title from webpage
1593                 self.report_extraction(video_id)
1594                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1595                 if mobj is None:
1596                         video_extension = 'flv'
1597                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1598                 if mobj is None:
1599                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1600                         return
1601                 mediaURL = urllib.unquote(mobj.group(1))
1602                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1603                 mediaURL = mediaURL.replace('\\x26', '\x26')
1604
1605                 video_url = mediaURL
1606
1607                 mobj = re.search(r'<title>(.*)</title>', webpage)
1608                 if mobj is None:
1609                         self._downloader.trouble(u'ERROR: unable to extract title')
1610                         return
1611                 video_title = mobj.group(1).decode('utf-8')
1612                 video_title = sanitize_title(video_title)
1613                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1614
1615                 # Extract video description
1616                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1617                 if mobj is None:
1618                         self._downloader.trouble(u'ERROR: unable to extract video description')
1619                         return
1620                 video_description = mobj.group(1).decode('utf-8')
1621                 if not video_description:
1622                         video_description = 'No description available.'
1623
1624                 # Extract video thumbnail
1625                 if self._downloader.params.get('forcethumbnail', False):
1626                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1627                         try:
1628                                 webpage = urllib2.urlopen(request).read()
1629                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1630                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1631                                 return
1632                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1633                         if mobj is None:
1634                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1635                                 return
1636                         video_thumbnail = mobj.group(1)
1637                 else:   # we need something to pass to process_info
1638                         video_thumbnail = ''
1639
1640
1641                 try:
1642                         # Process video information
1643                         self._downloader.process_info({
1644                                 'id':           video_id.decode('utf-8'),
1645                                 'url':          video_url.decode('utf-8'),
1646                                 'uploader':     u'NA',
1647                                 'upload_date':  u'NA',
1648                                 'title':        video_title,
1649                                 'stitle':       simple_title,
1650                                 'ext':          video_extension.decode('utf-8'),
1651                                 'format':       u'NA',
1652                                 'player_url':   None,
1653                         })
1654                 except UnavailableVideoError:
1655                         self._downloader.trouble(u'\nERROR: unable to download video')
1656
1657
1658 class PhotobucketIE(InfoExtractor):
1659         """Information extractor for photobucket.com."""
1660
1661         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1662
1663         def __init__(self, downloader=None):
1664                 InfoExtractor.__init__(self, downloader)
1665
1666         @staticmethod
1667         def suitable(url):
1668                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1669
1670         def report_download_webpage(self, video_id):
1671                 """Report webpage download."""
1672                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1673
1674         def report_extraction(self, video_id):
1675                 """Report information extraction."""
1676                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1677
1678         def _real_initialize(self):
1679                 return
1680
1681         def _real_extract(self, url):
1682                 # Extract id from URL
1683                 mobj = re.match(self._VALID_URL, url)
1684                 if mobj is None:
1685                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1686                         return
1687
1688                 # At this point we have a new video
1689                 self._downloader.increment_downloads()
1690                 video_id = mobj.group(1)
1691
1692                 video_extension = 'flv'
1693
1694                 # Retrieve video webpage to extract further information
1695                 request = urllib2.Request(url)
1696                 try:
1697                         self.report_download_webpage(video_id)
1698                         webpage = urllib2.urlopen(request).read()
1699                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1700                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1701                         return
1702
1703                 # Extract URL, uploader, and title from webpage
1704                 self.report_extraction(video_id)
1705                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1706                 if mobj is None:
1707                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1708                         return
1709                 mediaURL = urllib.unquote(mobj.group(1))
1710
1711                 video_url = mediaURL
1712
1713                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1714                 if mobj is None:
1715                         self._downloader.trouble(u'ERROR: unable to extract title')
1716                         return
1717                 video_title = mobj.group(1).decode('utf-8')
1718                 video_title = sanitize_title(video_title)
1719                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1720
1721                 video_uploader = mobj.group(2).decode('utf-8')
1722
1723                 try:
1724                         # Process video information
1725                         self._downloader.process_info({
1726                                 'id':           video_id.decode('utf-8'),
1727                                 'url':          video_url.decode('utf-8'),
1728                                 'uploader':     video_uploader,
1729                                 'upload_date':  u'NA',
1730                                 'title':        video_title,
1731                                 'stitle':       simple_title,
1732                                 'ext':          video_extension.decode('utf-8'),
1733                                 'format':       u'NA',
1734                                 'player_url':   None,
1735                         })
1736                 except UnavailableVideoError:
1737                         self._downloader.trouble(u'\nERROR: unable to download video')
1738
1739
1740 class YahooIE(InfoExtractor):
1741         """Information extractor for video.yahoo.com."""
1742
1743         # _VALID_URL matches all Yahoo! Video URLs
1744         # _VPAGE_URL matches only the extractable '/watch/' URLs
1745         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1746         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1747
1748         def __init__(self, downloader=None):
1749                 InfoExtractor.__init__(self, downloader)
1750
1751         @staticmethod
1752         def suitable(url):
1753                 return (re.match(YahooIE._VALID_URL, url) is not None)
1754
1755         def report_download_webpage(self, video_id):
1756                 """Report webpage download."""
1757                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1758
1759         def report_extraction(self, video_id):
1760                 """Report information extraction."""
1761                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1762
1763         def _real_initialize(self):
1764                 return
1765
1766         def _real_extract(self, url, new_video=True):
1767                 # Extract ID from URL
1768                 mobj = re.match(self._VALID_URL, url)
1769                 if mobj is None:
1770                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1771                         return
1772
1773                 # At this point we have a new video
1774                 self._downloader.increment_downloads()
1775                 video_id = mobj.group(2)
1776                 video_extension = 'flv'
1777
1778                 # Rewrite valid but non-extractable URLs as
1779                 # extractable English language /watch/ URLs
1780                 if re.match(self._VPAGE_URL, url) is None:
1781                         request = urllib2.Request(url)
1782                         try:
1783                                 webpage = urllib2.urlopen(request).read()
1784                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1785                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1786                                 return
1787
1788                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1789                         if mobj is None:
1790                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1791                                 return
1792                         yahoo_id = mobj.group(1)
1793
1794                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1795                         if mobj is None:
1796                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1797                                 return
1798                         yahoo_vid = mobj.group(1)
1799
1800                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1801                         return self._real_extract(url, new_video=False)
1802
1803                 # Retrieve video webpage to extract further information
1804                 request = urllib2.Request(url)
1805                 try:
1806                         self.report_download_webpage(video_id)
1807                         webpage = urllib2.urlopen(request).read()
1808                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1809                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1810                         return
1811
1812                 # Extract uploader and title from webpage
1813                 self.report_extraction(video_id)
1814                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1815                 if mobj is None:
1816                         self._downloader.trouble(u'ERROR: unable to extract video title')
1817                         return
1818                 video_title = mobj.group(1).decode('utf-8')
1819                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1820
1821                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1822                 if mobj is None:
1823                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1824                         return
1825                 video_uploader = mobj.group(1).decode('utf-8')
1826
1827                 # Extract video thumbnail
1828                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1829                 if mobj is None:
1830                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1831                         return
1832                 video_thumbnail = mobj.group(1).decode('utf-8')
1833
1834                 # Extract video description
1835                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1836                 if mobj is None:
1837                         self._downloader.trouble(u'ERROR: unable to extract video description')
1838                         return
1839                 video_description = mobj.group(1).decode('utf-8')
1840                 if not video_description: video_description = 'No description available.'
1841
1842                 # Extract video height and width
1843                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1844                 if mobj is None:
1845                         self._downloader.trouble(u'ERROR: unable to extract video height')
1846                         return
1847                 yv_video_height = mobj.group(1)
1848
1849                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1850                 if mobj is None:
1851                         self._downloader.trouble(u'ERROR: unable to extract video width')
1852                         return
1853                 yv_video_width = mobj.group(1)
1854
1855                 # Retrieve video playlist to extract media URL
1856                 # I'm not completely sure what all these options are, but we
1857                 # seem to need most of them, otherwise the server sends a 401.
1858                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1859                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1860                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1861                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1862                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1863                 try:
1864                         self.report_download_webpage(video_id)
1865                         webpage = urllib2.urlopen(request).read()
1866                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1867                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1868                         return
1869
1870                 # Extract media URL from playlist XML
1871                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1872                 if mobj is None:
1873                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1874                         return
1875                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1876                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1877
1878                 try:
1879                         # Process video information
1880                         self._downloader.process_info({
1881                                 'id':           video_id.decode('utf-8'),
1882                                 'url':          video_url,
1883                                 'uploader':     video_uploader,
1884                                 'upload_date':  u'NA',
1885                                 'title':        video_title,
1886                                 'stitle':       simple_title,
1887                                 'ext':          video_extension.decode('utf-8'),
1888                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1889                                 'description':  video_description,
1890                                 'thumbnail':    video_thumbnail,
1891                                 'description':  video_description,
1892                                 'player_url':   None,
1893                         })
1894                 except UnavailableVideoError:
1895                         self._downloader.trouble(u'\nERROR: unable to download video')
1896
1897
1898 class GenericIE(InfoExtractor):
1899         """Generic last-resort information extractor."""
1900
1901         def __init__(self, downloader=None):
1902                 InfoExtractor.__init__(self, downloader)
1903
1904         @staticmethod
1905         def suitable(url):
1906                 return True
1907
1908         def report_download_webpage(self, video_id):
1909                 """Report webpage download."""
1910                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1911                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1912
1913         def report_extraction(self, video_id):
1914                 """Report information extraction."""
1915                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1916
1917         def _real_initialize(self):
1918                 return
1919
1920         def _real_extract(self, url):
1921                 # At this point we have a new video
1922                 self._downloader.increment_downloads()
1923
1924                 video_id = url.split('/')[-1]
1925                 request = urllib2.Request(url)
1926                 try:
1927                         self.report_download_webpage(video_id)
1928                         webpage = urllib2.urlopen(request).read()
1929                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1930                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1931                         return
1932                 except ValueError, err:
1933                         # since this is the last-resort InfoExtractor, if
1934                         # this error is thrown, it'll be thrown here
1935                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1936                         return
1937
1938                 self.report_extraction(video_id)
1939                 # Start with something easy: JW Player in SWFObject
1940                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1941                 if mobj is None:
1942                         # Broaden the search a little bit
1943                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1944                 if mobj is None:
1945                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1946                         return
1947
1948                 # It's possible that one of the regexes
1949                 # matched, but returned an empty group:
1950                 if mobj.group(1) is None:
1951                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1952                         return
1953
1954                 video_url = urllib.unquote(mobj.group(1))
1955                 video_id  = os.path.basename(video_url)
1956
1957                 # here's a fun little line of code for you:
1958                 video_extension = os.path.splitext(video_id)[1][1:]
1959                 video_id        = os.path.splitext(video_id)[0]
1960
1961                 # it's tempting to parse this further, but you would
1962                 # have to take into account all the variations like
1963                 #   Video Title - Site Name
1964                 #   Site Name | Video Title
1965                 #   Video Title - Tagline | Site Name
1966                 # and so on and so forth; it's just not practical
1967                 mobj = re.search(r'<title>(.*)</title>', webpage)
1968                 if mobj is None:
1969                         self._downloader.trouble(u'ERROR: unable to extract title')
1970                         return
1971                 video_title = mobj.group(1).decode('utf-8')
1972                 video_title = sanitize_title(video_title)
1973                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1974
1975                 # video uploader is domain name
1976                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1977                 if mobj is None:
1978                         self._downloader.trouble(u'ERROR: unable to extract title')
1979                         return
1980                 video_uploader = mobj.group(1).decode('utf-8')
1981
1982                 try:
1983                         # Process video information
1984                         self._downloader.process_info({
1985                                 'id':           video_id.decode('utf-8'),
1986                                 'url':          video_url.decode('utf-8'),
1987                                 'uploader':     video_uploader,
1988                                 'upload_date':  u'NA',
1989                                 'title':        video_title,
1990                                 'stitle':       simple_title,
1991                                 'ext':          video_extension.decode('utf-8'),
1992                                 'format':       u'NA',
1993                                 'player_url':   None,
1994                         })
1995                 except UnavailableVideoError, err:
1996                         self._downloader.trouble(u'\nERROR: unable to download video')
1997
1998
1999 class YoutubeSearchIE(InfoExtractor):
2000         """Information Extractor for YouTube search queries."""
2001         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
2002         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2003         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2004         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2005         _youtube_ie = None
2006         _max_youtube_results = 1000
2007
2008         def __init__(self, youtube_ie, downloader=None):
2009                 InfoExtractor.__init__(self, downloader)
2010                 self._youtube_ie = youtube_ie
2011
2012         @staticmethod
2013         def suitable(url):
2014                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
2015
2016         def report_download_page(self, query, pagenum):
2017                 """Report attempt to download playlist page with given number."""
2018                 query = query.decode(preferredencoding())
2019                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2020
2021         def _real_initialize(self):
2022                 self._youtube_ie.initialize()
2023
2024         def _real_extract(self, query):
2025                 mobj = re.match(self._VALID_QUERY, query)
2026                 if mobj is None:
2027                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2028                         return
2029
2030                 prefix, query = query.split(':')
2031                 prefix = prefix[8:]
2032                 query  = query.encode('utf-8')
2033                 if prefix == '':
2034                         self._download_n_results(query, 1)
2035                         return
2036                 elif prefix == 'all':
2037                         self._download_n_results(query, self._max_youtube_results)
2038                         return
2039                 else:
2040                         try:
2041                                 n = long(prefix)
2042                                 if n <= 0:
2043                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2044                                         return
2045                                 elif n > self._max_youtube_results:
2046                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
2047                                         n = self._max_youtube_results
2048                                 self._download_n_results(query, n)
2049                                 return
2050                         except ValueError: # parsing prefix as integer fails
2051                                 self._download_n_results(query, 1)
2052                                 return
2053
2054         def _download_n_results(self, query, n):
2055                 """Downloads a specified number of results for a query"""
2056
2057                 video_ids = []
2058                 already_seen = set()
2059                 pagenum = 1
2060
2061                 while True:
2062                         self.report_download_page(query, pagenum)
2063                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2064                         request = urllib2.Request(result_url)
2065                         try:
2066                                 page = urllib2.urlopen(request).read()
2067                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2068                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2069                                 return
2070
2071                         # Extract video identifiers
2072                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2073                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2074                                 if video_id not in already_seen:
2075                                         video_ids.append(video_id)
2076                                         already_seen.add(video_id)
2077                                         if len(video_ids) == n:
2078                                                 # Specified n videos reached
2079                                                 for id in video_ids:
2080                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2081                                                 return
2082
2083                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2084                                 for id in video_ids:
2085                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2086                                 return
2087
2088                         pagenum = pagenum + 1
2089
2090 class GoogleSearchIE(InfoExtractor):
2091         """Information Extractor for Google Video search queries."""
2092         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
2093         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2094         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2095         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2096         _google_ie = None
2097         _max_google_results = 1000
2098
2099         def __init__(self, google_ie, downloader=None):
2100                 InfoExtractor.__init__(self, downloader)
2101                 self._google_ie = google_ie
2102
2103         @staticmethod
2104         def suitable(url):
2105                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
2106
2107         def report_download_page(self, query, pagenum):
2108                 """Report attempt to download playlist page with given number."""
2109                 query = query.decode(preferredencoding())
2110                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2111
2112         def _real_initialize(self):
2113                 self._google_ie.initialize()
2114
2115         def _real_extract(self, query):
2116                 mobj = re.match(self._VALID_QUERY, query)
2117                 if mobj is None:
2118                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2119                         return
2120
2121                 prefix, query = query.split(':')
2122                 prefix = prefix[8:]
2123                 query  = query.encode('utf-8')
2124                 if prefix == '':
2125                         self._download_n_results(query, 1)
2126                         return
2127                 elif prefix == 'all':
2128                         self._download_n_results(query, self._max_google_results)
2129                         return
2130                 else:
2131                         try:
2132                                 n = long(prefix)
2133                                 if n <= 0:
2134                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2135                                         return
2136                                 elif n > self._max_google_results:
2137                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
2138                                         n = self._max_google_results
2139                                 self._download_n_results(query, n)
2140                                 return
2141                         except ValueError: # parsing prefix as integer fails
2142                                 self._download_n_results(query, 1)
2143                                 return
2144
2145         def _download_n_results(self, query, n):
2146                 """Downloads a specified number of results for a query"""
2147
2148                 video_ids = []
2149                 already_seen = set()
2150                 pagenum = 1
2151
2152                 while True:
2153                         self.report_download_page(query, pagenum)
2154                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2155                         request = urllib2.Request(result_url)
2156                         try:
2157                                 page = urllib2.urlopen(request).read()
2158                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2159                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2160                                 return
2161
2162                         # Extract video identifiers
2163                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2164                                 video_id = mobj.group(1)
2165                                 if video_id not in already_seen:
2166                                         video_ids.append(video_id)
2167                                         already_seen.add(video_id)
2168                                         if len(video_ids) == n:
2169                                                 # Specified n videos reached
2170                                                 for id in video_ids:
2171                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2172                                                 return
2173
2174                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2175                                 for id in video_ids:
2176                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2177                                 return
2178
2179                         pagenum = pagenum + 1
2180
2181 class YahooSearchIE(InfoExtractor):
2182         """Information Extractor for Yahoo! Video search queries."""
2183         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
2184         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2185         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2186         _MORE_PAGES_INDICATOR = r'\s*Next'
2187         _yahoo_ie = None
2188         _max_yahoo_results = 1000
2189
2190         def __init__(self, yahoo_ie, downloader=None):
2191                 InfoExtractor.__init__(self, downloader)
2192                 self._yahoo_ie = yahoo_ie
2193
2194         @staticmethod
2195         def suitable(url):
2196                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2197
2198         def report_download_page(self, query, pagenum):
2199                 """Report attempt to download playlist page with given number."""
2200                 query = query.decode(preferredencoding())
2201                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2202
2203         def _real_initialize(self):
2204                 self._yahoo_ie.initialize()
2205
2206         def _real_extract(self, query):
2207                 mobj = re.match(self._VALID_QUERY, query)
2208                 if mobj is None:
2209                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2210                         return
2211
2212                 prefix, query = query.split(':')
2213                 prefix = prefix[8:]
2214                 query  = query.encode('utf-8')
2215                 if prefix == '':
2216                         self._download_n_results(query, 1)
2217                         return
2218                 elif prefix == 'all':
2219                         self._download_n_results(query, self._max_yahoo_results)
2220                         return
2221                 else:
2222                         try:
2223                                 n = long(prefix)
2224                                 if n <= 0:
2225                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2226                                         return
2227                                 elif n > self._max_yahoo_results:
2228                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
2229                                         n = self._max_yahoo_results
2230                                 self._download_n_results(query, n)
2231                                 return
2232                         except ValueError: # parsing prefix as integer fails
2233                                 self._download_n_results(query, 1)
2234                                 return
2235
2236         def _download_n_results(self, query, n):
2237                 """Downloads a specified number of results for a query"""
2238
2239                 video_ids = []
2240                 already_seen = set()
2241                 pagenum = 1
2242
2243                 while True:
2244                         self.report_download_page(query, pagenum)
2245                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2246                         request = urllib2.Request(result_url)
2247                         try:
2248                                 page = urllib2.urlopen(request).read()
2249                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2250                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2251                                 return
2252
2253                         # Extract video identifiers
2254                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2255                                 video_id = mobj.group(1)
2256                                 if video_id not in already_seen:
2257                                         video_ids.append(video_id)
2258                                         already_seen.add(video_id)
2259                                         if len(video_ids) == n:
2260                                                 # Specified n videos reached
2261                                                 for id in video_ids:
2262                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2263                                                 return
2264
2265                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2266                                 for id in video_ids:
2267                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2268                                 return
2269
2270                         pagenum = pagenum + 1
2271
2272 class YoutubePlaylistIE(InfoExtractor):
2273         """Information Extractor for YouTube playlists."""
2274
2275         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist)\?.*?(p|a)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2276         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2277         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2278         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2279         _youtube_ie = None
2280
2281         def __init__(self, youtube_ie, downloader=None):
2282                 InfoExtractor.__init__(self, downloader)
2283                 self._youtube_ie = youtube_ie
2284
2285         @staticmethod
2286         def suitable(url):
2287                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2288
2289         def report_download_page(self, playlist_id, pagenum):
2290                 """Report attempt to download playlist page with given number."""
2291                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2292
2293         def _real_initialize(self):
2294                 self._youtube_ie.initialize()
2295
2296         def _real_extract(self, url):
2297                 # Extract playlist id
2298                 mobj = re.match(self._VALID_URL, url)
2299                 if mobj is None:
2300                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2301                         return
2302
2303                 # Single video case
2304                 if mobj.group(3) is not None:
2305                         self._youtube_ie.extract(mobj.group(3))
2306                         return
2307
2308                 # Download playlist pages
2309                 # prefix is 'p' as default for playlists but there are other types that need extra care
2310                 playlist_prefix = mobj.group(1)
2311                 if playlist_prefix == 'a':
2312                         playlist_access = 'artist'
2313                 else:
2314                         playlist_prefix = 'p'
2315                         playlist_access = 'view_play_list'
2316                 playlist_id = mobj.group(2)
2317                 video_ids = []
2318                 pagenum = 1
2319
2320                 while True:
2321                         self.report_download_page(playlist_id, pagenum)
2322                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2323                         try:
2324                                 page = urllib2.urlopen(request).read()
2325                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2326                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2327                                 return
2328
2329                         # Extract video identifiers
2330                         ids_in_page = []
2331                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2332                                 if mobj.group(1) not in ids_in_page:
2333                                         ids_in_page.append(mobj.group(1))
2334                         video_ids.extend(ids_in_page)
2335
2336                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2337                                 break
2338                         pagenum = pagenum + 1
2339
2340                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2341                 playlistend = self._downloader.params.get('playlistend', -1)
2342                 video_ids = video_ids[playliststart:playlistend]
2343
2344                 for id in video_ids:
2345                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2346                 return
2347
2348 class YoutubeUserIE(InfoExtractor):
2349         """Information Extractor for YouTube users."""
2350
2351         _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2352         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2353         _GDATA_PAGE_SIZE = 50
2354         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2355         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2356         _youtube_ie = None
2357
2358         def __init__(self, youtube_ie, downloader=None):
2359                 InfoExtractor.__init__(self, downloader)
2360                 self._youtube_ie = youtube_ie
2361
2362         @staticmethod
2363         def suitable(url):
2364                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2365
2366         def report_download_page(self, username, start_index):
2367                 """Report attempt to download user page."""
2368                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2369                                            (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2370
2371         def _real_initialize(self):
2372                 self._youtube_ie.initialize()
2373
2374         def _real_extract(self, url):
2375                 # Extract username
2376                 mobj = re.match(self._VALID_URL, url)
2377                 if mobj is None:
2378                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2379                         return
2380
2381                 username = mobj.group(1)
2382
2383                 # Download video ids using YouTube Data API. Result size per
2384                 # query is limited (currently to 50 videos) so we need to query
2385                 # page by page until there are no video ids - it means we got
2386                 # all of them.
2387
2388                 video_ids = []
2389                 pagenum = 0
2390
2391                 while True:
2392                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2393                         self.report_download_page(username, start_index)
2394
2395                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2396
2397                         try:
2398                                 page = urllib2.urlopen(request).read()
2399                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2400                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2401                                 return
2402
2403                         # Extract video identifiers
2404                         ids_in_page = []
2405
2406                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2407                                 if mobj.group(1) not in ids_in_page:
2408                                         ids_in_page.append(mobj.group(1))
2409
2410                         video_ids.extend(ids_in_page)
2411
2412                         # A little optimization - if current page is not
2413                         # "full", ie. does not contain PAGE_SIZE video ids then
2414                         # we can assume that this page is the last one - there
2415                         # are no more ids on further pages - no need to query
2416                         # again.
2417
2418                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2419                                 break
2420
2421                         pagenum += 1
2422
2423                 all_ids_count = len(video_ids)
2424                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2425                 playlistend = self._downloader.params.get('playlistend', -1)
2426
2427                 if playlistend == -1:
2428                         video_ids = video_ids[playliststart:]
2429                 else:
2430                         video_ids = video_ids[playliststart:playlistend]
2431                         
2432                 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2433                                            (username, all_ids_count, len(video_ids)))
2434
2435                 for video_id in video_ids:
2436                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2437
2438
2439 class DepositFilesIE(InfoExtractor):
2440         """Information extractor for depositfiles.com"""
2441
2442         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2443
2444         def __init__(self, downloader=None):
2445                 InfoExtractor.__init__(self, downloader)
2446
2447         @staticmethod
2448         def suitable(url):
2449                 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2450
2451         def report_download_webpage(self, file_id):
2452                 """Report webpage download."""
2453                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2454
2455         def report_extraction(self, file_id):
2456                 """Report information extraction."""
2457                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2458
2459         def _real_initialize(self):
2460                 return
2461
2462         def _real_extract(self, url):
2463                 # At this point we have a new file
2464                 self._downloader.increment_downloads()
2465
2466                 file_id = url.split('/')[-1]
2467                 # Rebuild url in english locale
2468                 url = 'http://depositfiles.com/en/files/' + file_id
2469
2470                 # Retrieve file webpage with 'Free download' button pressed
2471                 free_download_indication = { 'gateway_result' : '1' }
2472                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2473                 try:
2474                         self.report_download_webpage(file_id)
2475                         webpage = urllib2.urlopen(request).read()
2476                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2477                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2478                         return
2479
2480                 # Search for the real file URL
2481                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2482                 if (mobj is None) or (mobj.group(1) is None):
2483                         # Try to figure out reason of the error.
2484                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2485                         if (mobj is not None) and (mobj.group(1) is not None):
2486                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2487                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2488                         else:
2489                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2490                         return
2491
2492                 file_url = mobj.group(1)
2493                 file_extension = os.path.splitext(file_url)[1][1:]
2494
2495                 # Search for file title
2496                 mobj = re.search(r'<b title="(.*?)">', webpage)
2497                 if mobj is None:
2498                         self._downloader.trouble(u'ERROR: unable to extract title')
2499                         return
2500                 file_title = mobj.group(1).decode('utf-8')
2501
2502                 try:
2503                         # Process file information
2504                         self._downloader.process_info({
2505                                 'id':           file_id.decode('utf-8'),
2506                                 'url':          file_url.decode('utf-8'),
2507                                 'uploader':     u'NA',
2508                                 'upload_date':  u'NA',
2509                                 'title':        file_title,
2510                                 'stitle':       file_title,
2511                                 'ext':          file_extension.decode('utf-8'),
2512                                 'format':       u'NA',
2513                                 'player_url':   None,
2514                         })
2515                 except UnavailableVideoError, err:
2516                         self._downloader.trouble(u'ERROR: unable to download file')
2517
2518 class FacebookIE(InfoExtractor):
2519         """Information Extractor for Facebook"""
2520
2521         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2522         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2523         _NETRC_MACHINE = 'facebook'
2524         _available_formats = ['highqual', 'lowqual']
2525         _video_extensions = {
2526                 'highqual': 'mp4',
2527                 'lowqual': 'mp4',
2528         }
2529
2530         def __init__(self, downloader=None):
2531                 InfoExtractor.__init__(self, downloader)
2532
2533         @staticmethod
2534         def suitable(url):
2535                 return (re.match(FacebookIE._VALID_URL, url) is not None)
2536
2537         def _reporter(self, message):
2538                 """Add header and report message."""
2539                 self._downloader.to_screen(u'[facebook] %s' % message)
2540
2541         def report_login(self):
2542                 """Report attempt to log in."""
2543                 self._reporter(u'Logging in')
2544
2545         def report_video_webpage_download(self, video_id):
2546                 """Report attempt to download video webpage."""
2547                 self._reporter(u'%s: Downloading video webpage' % video_id)
2548
2549         def report_information_extraction(self, video_id):
2550                 """Report attempt to extract video information."""
2551                 self._reporter(u'%s: Extracting video information' % video_id)
2552
2553         def _parse_page(self, video_webpage):
2554                 """Extract video information from page"""
2555                 # General data
2556                 data = {'title': r'class="video_title datawrap">(.*?)</',
2557                         'description': r'<div class="datawrap">(.*?)</div>',
2558                         'owner': r'\("video_owner_name", "(.*?)"\)',
2559                         'upload_date': r'data-date="(.*?)"',
2560                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2561                         }
2562                 video_info = {}
2563                 for piece in data.keys():
2564                         mobj = re.search(data[piece], video_webpage)
2565                         if mobj is not None:
2566                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2567
2568                 # Video urls
2569                 video_urls = {}
2570                 for fmt in self._available_formats:
2571                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2572                         if mobj is not None:
2573                                 # URL is in a Javascript segment inside an escaped Unicode format within
2574                                 # the generally utf-8 page
2575                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2576                 video_info['video_urls'] = video_urls
2577
2578                 return video_info
2579
2580         def _real_initialize(self):
2581                 if self._downloader is None:
2582                         return
2583
2584                 useremail = None
2585                 password = None
2586                 downloader_params = self._downloader.params
2587
2588                 # Attempt to use provided username and password or .netrc data
2589                 if downloader_params.get('username', None) is not None:
2590                         useremail = downloader_params['username']
2591                         password = downloader_params['password']
2592                 elif downloader_params.get('usenetrc', False):
2593                         try:
2594                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2595                                 if info is not None:
2596                                         useremail = info[0]
2597                                         password = info[2]
2598                                 else:
2599                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2600                         except (IOError, netrc.NetrcParseError), err:
2601                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2602                                 return
2603
2604                 if useremail is None:
2605                         return
2606
2607                 # Log in
2608                 login_form = {
2609                         'email': useremail,
2610                         'pass': password,
2611                         'login': 'Log+In'
2612                         }
2613                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2614                 try:
2615                         self.report_login()
2616                         login_results = urllib2.urlopen(request).read()
2617                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2618                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2619                                 return
2620                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2621                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2622                         return
2623
2624         def _real_extract(self, url):
2625                 mobj = re.match(self._VALID_URL, url)
2626                 if mobj is None:
2627                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2628                         return
2629                 video_id = mobj.group('ID')
2630
2631                 # Get video webpage
2632                 self.report_video_webpage_download(video_id)
2633                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2634                 try:
2635                         page = urllib2.urlopen(request)
2636                         video_webpage = page.read()
2637                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2638                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2639                         return
2640
2641                 # Start extracting information
2642                 self.report_information_extraction(video_id)
2643
2644                 # Extract information
2645                 video_info = self._parse_page(video_webpage)
2646
2647                 # uploader
2648                 if 'owner' not in video_info:
2649                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2650                         return
2651                 video_uploader = video_info['owner']
2652
2653                 # title
2654                 if 'title' not in video_info:
2655                         self._downloader.trouble(u'ERROR: unable to extract video title')
2656                         return
2657                 video_title = video_info['title']
2658                 video_title = video_title.decode('utf-8')
2659                 video_title = sanitize_title(video_title)
2660
2661                 # simplified title
2662                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2663                 simple_title = simple_title.strip(ur'_')
2664
2665                 # thumbnail image
2666                 if 'thumbnail' not in video_info:
2667                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2668                         video_thumbnail = ''
2669                 else:
2670                         video_thumbnail = video_info['thumbnail']
2671
2672                 # upload date
2673                 upload_date = u'NA'
2674                 if 'upload_date' in video_info:
2675                         upload_time = video_info['upload_date']
2676                         timetuple = email.utils.parsedate_tz(upload_time)
2677                         if timetuple is not None:
2678                                 try:
2679                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2680                                 except:
2681                                         pass
2682
2683                 # description
2684                 video_description = video_info.get('description', 'No description available.')
2685
2686                 url_map = video_info['video_urls']
2687                 if len(url_map.keys()) > 0:
2688                         # Decide which formats to download
2689                         req_format = self._downloader.params.get('format', None)
2690                         format_limit = self._downloader.params.get('format_limit', None)
2691
2692                         if format_limit is not None and format_limit in self._available_formats:
2693                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2694                         else:
2695                                 format_list = self._available_formats
2696                         existing_formats = [x for x in format_list if x in url_map]
2697                         if len(existing_formats) == 0:
2698                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2699                                 return
2700                         if req_format is None:
2701                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2702                         elif req_format == '-1':
2703                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2704                         else:
2705                                 # Specific format
2706                                 if req_format not in url_map:
2707                                         self._downloader.trouble(u'ERROR: requested format not available')
2708                                         return
2709                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2710
2711                 for format_param, video_real_url in video_url_list:
2712
2713                         # At this point we have a new video
2714                         self._downloader.increment_downloads()
2715
2716                         # Extension
2717                         video_extension = self._video_extensions.get(format_param, 'mp4')
2718
2719                         # Find the video URL in fmt_url_map or conn paramters
2720                         try:
2721                                 # Process video information
2722                                 self._downloader.process_info({
2723                                         'id':           video_id.decode('utf-8'),
2724                                         'url':          video_real_url.decode('utf-8'),
2725                                         'uploader':     video_uploader.decode('utf-8'),
2726                                         'upload_date':  upload_date,
2727                                         'title':        video_title,
2728                                         'stitle':       simple_title,
2729                                         'ext':          video_extension.decode('utf-8'),
2730                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2731                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2732                                         'description':  video_description.decode('utf-8'),
2733                                         'player_url':   None,
2734                                 })
2735                         except UnavailableVideoError, err:
2736                                 self._downloader.trouble(u'\nERROR: unable to download video')
2737
2738 class BlipTVIE(InfoExtractor):
2739         """Information extractor for blip.tv"""
2740
2741         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip.tv(/.+)$'
2742         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2743
2744         @staticmethod
2745         def suitable(url):
2746                 return (re.match(BlipTVIE._VALID_URL, url) is not None)
2747
2748         def report_extraction(self, file_id):
2749                 """Report information extraction."""
2750                 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2751
2752         def _simplify_title(self, title):
2753                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2754                 res = res.strip(ur'_')
2755                 return res
2756
2757         def _real_extract(self, url):
2758                 mobj = re.match(self._VALID_URL, url)
2759                 if mobj is None:
2760                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2761                         return
2762
2763                 json_url = url + ('&' if '?' in url else '?') + 'skin=json&version=2&no_wrap=1'
2764                 request = urllib2.Request(json_url)
2765                 self.report_extraction(mobj.group(1))
2766                 try:
2767                         json_code = urllib2.urlopen(request).read()
2768                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2769                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2770                         return
2771                 try:
2772                         json_data = json.loads(json_code)
2773                         data = json_data['Post'] if 'Post' in json_data else json_data
2774
2775                         upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2776                         video_url = data['media']['url']
2777                         umobj = re.match(self._URL_EXT, video_url)
2778                         if umobj is None:
2779                                 raise ValueError('Can not determine filename extension')
2780                         ext = umobj.group(1)
2781
2782                         self._downloader.increment_downloads()
2783
2784                         info = {
2785                                 'id': data['item_id'],
2786                                 'url': video_url,
2787                                 'uploader': data['display_name'],
2788                                 'upload_date': upload_date,
2789                                 'title': data['title'],
2790                                 'stitle': self._simplify_title(data['title']),
2791                                 'ext': ext,
2792                                 'format': data['media']['mimeType'],
2793                                 'thumbnail': data['thumbnailUrl'],
2794                                 'description': data['description'],
2795                                 'player_url': data['embedUrl']
2796                         }
2797                 except (ValueError,KeyError), err:
2798                         self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2799                         return
2800
2801                 try:
2802                         self._downloader.process_info(info)
2803                 except UnavailableVideoError, err:
2804                         self._downloader.trouble(u'\nERROR: unable to download video')
2805
2806
class PostProcessor(object):
	"""Base class for post-processing steps.

	A PostProcessor is attached to a downloader through its
	add_post_processor() method.  After a successful download the
	downloader walks its chain of post processors, feeding the return
	value of each run() call into the next one, and stops as soon as a
	processor returns None or the end of the chain is reached.

	Like InfoExtractor objects, post processors take part in a
	"mutual registration" with their downloader.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor.

		"information" is a dictionary in the format produced by the
		InfoExtractors, extended with a "filepath" key naming the
		downloaded file.

		Returning None aborts the post-processing chain; returning a
		(possibly modified) information dictionary passes it on to the
		next processor in the chain.  Implementations may also raise
		PostProcessingError, which the calling downloader takes into
		account.
		"""
		return information # by default, do nothing
2852
class FFmpegExtractAudioPP(PostProcessor):
	"""Post processor that extracts the audio track of a downloaded file.

	Uses ffprobe to detect the source audio codec and ffmpeg to copy or
	transcode it into a standalone audio file, then removes the video.
	"""

	def __init__(self, downloader=None, preferredcodec=None):
		PostProcessor.__init__(self, downloader)
		# 'best' keeps aac/mp3 streams as-is and falls back to mp3.
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec

	@staticmethod
	def get_audio_codec(path):
		"""Return the audio codec name of the file at path, or None.

		Returns None when ffprobe is missing, exits non-zero, or no
		audio stream is found.
		"""
		try:
			cmd = ['ffprobe', '-show_streams', '--', path]
			# open() instead of the Python-2-only file() builtin; close
			# the null device explicitly instead of leaking the handle.
			devnull = open(os.path.devnull, 'w')
			try:
				handle = subprocess.Popen(cmd, stderr=devnull, stdout=subprocess.PIPE)
				output = handle.communicate()[0]
				if handle.wait() != 0:
					return None
			finally:
				devnull.close()
		except (IOError, OSError):
			return None
		audio_codec = None
		for line in output.split('\n'):
			# codec_name lines precede the codec_type line of a stream.
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Run ffmpeg to extract the audio; return True on success."""
		try:
			cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
			devnull = open(os.path.devnull, 'w')
			try:
				ret = subprocess.call(cmd, stdout=devnull, stderr=subprocess.STDOUT)
			finally:
				devnull.close()
			return (ret == 0)
		except (IOError, OSError):
			return False

	def run(self, information):
		"""Convert information['filepath'] to audio in place.

		Returns the updated information dictionary, or None on failure
		(which stops the post-processing chain).
		"""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
			if filecodec == 'aac' or filecodec == 'mp3':
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					# Raw AAC needs an ADTS container to be playable.
					more_opts = ['-f', 'adts']
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = ['-ab', '128k']
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = ['-ab', '128k']
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']

		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		status = self.run_ffmpeg(path, new_path, acodec, more_opts)

		if not status:
			self._downloader.to_stderr(u'WARNING: error running ffmpeg')
			return None

		# Remove the original only after a successful conversion.
		try:
			os.remove(path)
		except (IOError, OSError):
			self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
			return None

		information['filepath'] = new_path
		return information
2934
2935 ### MAIN PROGRAM ###
2936 if __name__ == '__main__':
2937         try:
2938                 # Modules needed only when running the main program
2939                 import getpass
2940                 import optparse
2941
2942                 # Function to update the program file with the latest version from the repository.
2943                 def update_self(downloader, filename):
2944                         # Note: downloader only used for options
2945                         if not os.access(filename, os.W_OK):
2946                                 sys.exit('ERROR: no write permissions on %s' % filename)
2947
2948                         downloader.to_screen('Updating to latest stable version...')
2949                         try:
2950                                 latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2951                                 latest_version = urllib.urlopen(latest_url).read().strip()
2952                                 prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2953                                 newcontent = urllib.urlopen(prog_url).read()
2954                         except (IOError, OSError), err:
2955                                 sys.exit('ERROR: unable to download latest version')
2956                         try:
                                        # Overwrite the running script in place with the freshly
                                        # downloaded release.  NOTE(review): opened in text mode
                                        # ('w'); on Windows this could translate line endings in
                                        # the script body -- confirm this is intended.
2957                                 stream = open(filename, 'w')
2958                                 stream.write(newcontent)
2959                                 stream.close()
2960                         except (IOError, OSError), err:
2961                                 sys.exit('ERROR: unable to overwrite current version')
2962                         downloader.to_screen('Updated to version %s' % latest_version)
2963
2964                 # Parse command line
                # NOTE(review): `optparse` and `getpass` (used below) are presumably
                # imported in the part of this __main__ block above this excerpt --
                # confirm; they are not in the module-level import list at the top of
                # the file.  The `except X, err` form throughout is Python-2-only.
2965                 parser = optparse.OptionParser(
2966                         usage='Usage: %prog [options] url...',
2967                         version='2011.07.09-phihag',
2968                         conflict_handler='resolve',
2969                 )
2970
                # conflict_handler='resolve' above allows the next two options to
                # redefine optparse's built-in -h/--help and --version handling.
2971                 parser.add_option('-h', '--help',
2972                                 action='help', help='print this help text and exit')
2973                 parser.add_option('-v', '--version',
2974                                 action='version', help='print program version and exit')
2975                 parser.add_option('-U', '--update',
2976                                 action='store_true', dest='update_self', help='update this program to latest stable version')
2977                 parser.add_option('-i', '--ignore-errors',
2978                                 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2979                 parser.add_option('-r', '--rate-limit',
2980                                 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2981                 parser.add_option('-R', '--retries',
2982                                 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2983                 parser.add_option('--playlist-start',
2984                                 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2985                 parser.add_option('--playlist-end',
2986                                 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
2987                 parser.add_option('--dump-user-agent',
2988                                 action='store_true', dest='dump_user_agent',
2989                                 help='display the current browser identification', default=False)
2990
2991                 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2992                 authentication.add_option('-u', '--username',
2993                                 dest='username', metavar='USERNAME', help='account username')
2994                 authentication.add_option('-p', '--password',
2995                                 dest='password', metavar='PASSWORD', help='account password')
2996                 authentication.add_option('-n', '--netrc',
2997                                 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2998                 parser.add_option_group(authentication)
2999
3000                 video_format = optparse.OptionGroup(parser, 'Video Format Options')
3001                 video_format.add_option('-f', '--format',
3002                                 action='store', dest='format', metavar='FORMAT', help='video format code')
                # --all-formats reuses dest='format' with the sentinel value '-1';
                # the output-template selection below special-cases that value.
3003                 video_format.add_option('--all-formats',
3004                                 action='store_const', dest='format', help='download all available video formats', const='-1')
3005                 video_format.add_option('--max-quality',
3006                                 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
3007                 parser.add_option_group(video_format)
3008
3009                 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
3010                 verbosity.add_option('-q', '--quiet',
3011                                 action='store_true', dest='quiet', help='activates quiet mode', default=False)
3012                 verbosity.add_option('-s', '--simulate',
3013                                 action='store_true', dest='simulate', help='do not download video', default=False)
3014                 verbosity.add_option('-g', '--get-url',
3015                                 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
3016                 verbosity.add_option('-e', '--get-title',
3017                                 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
3018                 verbosity.add_option('--get-thumbnail',
3019                                 action='store_true', dest='getthumbnail',
3020                                 help='simulate, quiet but print thumbnail URL', default=False)
3021                 verbosity.add_option('--get-description',
3022                                 action='store_true', dest='getdescription',
3023                                 help='simulate, quiet but print video description', default=False)
3024                 verbosity.add_option('--get-filename',
3025                                 action='store_true', dest='getfilename',
3026                                 help='simulate, quiet but print output filename', default=False)
3027                 verbosity.add_option('--no-progress',
3028                                 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
3029                 verbosity.add_option('--console-title',
3030                                 action='store_true', dest='consoletitle',
3031                                 help='display progress in console titlebar', default=False)
3032                 parser.add_option_group(verbosity)
3033
3034                 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
3035                 filesystem.add_option('-t', '--title',
3036                                 action='store_true', dest='usetitle', help='use title in file name', default=False)
3037                 filesystem.add_option('-l', '--literal',
3038                                 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
3039                 filesystem.add_option('-A', '--auto-number',
3040                                 action='store_true', dest='autonumber',
3041                                 help='number downloaded files starting from 00000', default=False)
3042                 filesystem.add_option('-o', '--output',
3043                                 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
3044                 filesystem.add_option('-a', '--batch-file',
3045                                 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
3046                 filesystem.add_option('-w', '--no-overwrites',
3047                                 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
3048                 filesystem.add_option('-c', '--continue',
3049                                 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
3050                 filesystem.add_option('--cookies',
3051                                 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
3052                 filesystem.add_option('--no-part',
3053                                 action='store_true', dest='nopart', help='do not use .part files', default=False)
3054                 filesystem.add_option('--no-mtime',
3055                                 action='store_false', dest='updatetime',
3056                                 help='do not use the Last-modified header to set the file modification time', default=True)
3057                 filesystem.add_option('--write-description',
3058                                 action='store_true', dest='writedescription',
3059                                 help='write video description to a .description file', default=False)
3060                 filesystem.add_option('--write-info-json',
3061                                 action='store_true', dest='writeinfojson',
3062                                 help='write video metadata to a .info.json file', default=False)
3063                 parser.add_option_group(filesystem)
3064
3065                 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
3066                 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
3067                                 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
3068                 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
3069                                 help='"best", "aac" or "mp3"; best by default')
3070                 parser.add_option_group(postproc)
3071
3072                 (opts, args) = parser.parse_args()
3073
3074                 # Open appropriate CookieJar
                # In-memory jar unless --cookies gave a path; a MozillaCookieJar is
                # loaded from that path only if the file already exists and is
                # readable, and is saved back to the same path at the end of the run.
3075                 if opts.cookiefile is None:
3076                         jar = cookielib.CookieJar()
3077                 else:
3078                         try:
3079                                 jar = cookielib.MozillaCookieJar(opts.cookiefile)
3080                                 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3081                                         jar.load()
3082                         except (IOError, OSError), err:
3083                                 sys.exit(u'ERROR: unable to open cookie file')
3084
3085                 # Dump user agent
3086                 if opts.dump_user_agent:
3087                         print std_headers['User-Agent']
3088                         sys.exit(0)
3089
3090                 # General configuration
                # YoutubeDLHandler is defined earlier in this file (outside this
                # excerpt); install it globally together with proxy+cookie handling.
3091                 cookie_processor = urllib2.HTTPCookieProcessor(jar)
3092                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
3093                 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3094
3095                 # Batch file verification
                # '-' reads URLs from stdin; blank lines and lines starting with
                # '#', '/' or ';' are skipped as comments.
3096                 batchurls = []
3097                 if opts.batchfile is not None:
3098                         try:
3099                                 if opts.batchfile == '-':
3100                                         batchfd = sys.stdin
3101                                 else:
3102                                         batchfd = open(opts.batchfile, 'r')
3103                                 batchurls = batchfd.readlines()
3104                                 batchurls = [x.strip() for x in batchurls]
3105                                 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3106                         except IOError:
3107                                 sys.exit(u'ERROR: batch file could not be read')
3108                 all_urls = batchurls + args
3109
3110                 # Conflicting, missing and erroneous options
3111                 if opts.usenetrc and (opts.username is not None or opts.password is not None):
3112                         parser.error(u'using .netrc conflicts with giving username/password')
3113                 if opts.password is not None and opts.username is None:
3114                         parser.error(u'account username missing')
3115                 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3116                         parser.error(u'using output template conflicts with using title, literal title or auto number')
3117                 if opts.usetitle and opts.useliteral:
3118                         parser.error(u'using title conflicts with using literal title')
                # Username without password: prompt interactively instead of failing.
3119                 if opts.username is not None and opts.password is None:
3120                         opts.password = getpass.getpass(u'Type account password and press return:')
3121                 if opts.ratelimit is not None:
3122                         numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3123                         if numeric_limit is None:
3124                                 parser.error(u'invalid rate limit specified')
3125                         opts.ratelimit = numeric_limit
3126                 if opts.retries is not None:
3127                         try:
3128                                 opts.retries = long(opts.retries)
3129                         except (TypeError, ValueError), err:
3130                                 parser.error(u'invalid retry count specified')
3131                 try:
3132                         opts.playliststart = long(opts.playliststart)
3133                         if opts.playliststart <= 0:
3134                                 raise ValueError
3135                 except (TypeError, ValueError), err:
3136                         parser.error(u'invalid playlist start number specified')
3137                 try:
3138                         opts.playlistend = long(opts.playlistend)
                        # -1 is the "until the end of the playlist" sentinel; any other
                        # value must be positive and not precede --playlist-start.
3139                         if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3140                                 raise ValueError
3141                 except (TypeError, ValueError), err:
3142                         parser.error(u'invalid playlist end number specified')
3143                 if opts.extractaudio:
3144                         if opts.audioformat not in ['best', 'aac', 'mp3']:
3145                                 parser.error(u'invalid audio format specified')
3146
3147                 # Information extractors
3148                 youtube_ie = YoutubeIE()
3149                 metacafe_ie = MetacafeIE(youtube_ie)
3150                 dailymotion_ie = DailymotionIE()
3151                 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
3152                 youtube_user_ie = YoutubeUserIE(youtube_ie)
3153                 youtube_search_ie = YoutubeSearchIE(youtube_ie)
3154                 google_ie = GoogleIE()
3155                 google_search_ie = GoogleSearchIE(google_ie)
3156                 photobucket_ie = PhotobucketIE()
3157                 yahoo_ie = YahooIE()
3158                 yahoo_search_ie = YahooSearchIE(yahoo_ie)
3159                 deposit_files_ie = DepositFilesIE()
3160                 facebook_ie = FacebookIE()
3161                 bliptv_ie = BlipTVIE()
3162                 generic_ie = GenericIE()
3163
3164                 # File downloader
3165                 fd = FileDownloader({
3166                         'usenetrc': opts.usenetrc,
3167                         'username': opts.username,
3168                         'password': opts.password,
3169                         'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3170                         'forceurl': opts.geturl,
3171                         'forcetitle': opts.gettitle,
3172                         'forcethumbnail': opts.getthumbnail,
3173                         'forcedescription': opts.getdescription,
3174                         'forcefilename': opts.getfilename,
3175                         'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3176                         'format': opts.format,
3177                         'format_limit': opts.format_limit,
                        # Output-template selection: the or-chain picks the FIRST truthy
                        # alternative, so order matters.  An explicit -o wins; then the
                        # --all-formats ('-1') variants (which embed %(format)s so the
                        # per-format files do not overwrite each other); then the
                        # title/literal/autonumber combinations; finally the bare id.
3178                         'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3179                                 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3180                                 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3181                                 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3182                                 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3183                                 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3184                                 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3185                                 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3186                                 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3187                                 or u'%(id)s.%(ext)s'),
3188                         'ignoreerrors': opts.ignoreerrors,
3189                         'ratelimit': opts.ratelimit,
3190                         'nooverwrites': opts.nooverwrites,
3191                         'retries': opts.retries,
3192                         'continuedl': opts.continue_dl,
3193                         'noprogress': opts.noprogress,
3194                         'playliststart': opts.playliststart,
3195                         'playlistend': opts.playlistend,
                        # -o - means "download to stdout", so status output must go to
                        # stderr to keep the data stream clean.
3196                         'logtostderr': opts.outtmpl == '-',
3197                         'consoletitle': opts.consoletitle,
3198                         'nopart': opts.nopart,
3199                         'updatetime': opts.updatetime,
3200                         'writedescription': opts.writedescription,
3201                         'writeinfojson': opts.writeinfojson,
3202                         })
                # Registration order is the matching order: more specific extractors
                # (search/playlist/user) before the plain YoutubeIE they wrap.
3203                 fd.add_info_extractor(youtube_search_ie)
3204                 fd.add_info_extractor(youtube_pl_ie)
3205                 fd.add_info_extractor(youtube_user_ie)
3206                 fd.add_info_extractor(metacafe_ie)
3207                 fd.add_info_extractor(dailymotion_ie)
3208                 fd.add_info_extractor(youtube_ie)
3209                 fd.add_info_extractor(google_ie)
3210                 fd.add_info_extractor(google_search_ie)
3211                 fd.add_info_extractor(photobucket_ie)
3212                 fd.add_info_extractor(yahoo_ie)
3213                 fd.add_info_extractor(yahoo_search_ie)
3214                 fd.add_info_extractor(deposit_files_ie)
3215                 fd.add_info_extractor(facebook_ie)
3216                 fd.add_info_extractor(bliptv_ie)
3217
3218                 # This must come last since it's the
3219                 # fallback if none of the others work
3220                 fd.add_info_extractor(generic_ie)
3221
3222                 # PostProcessors
3223                 if opts.extractaudio:
3224                         fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
3225
3226                 # Update version
                # -U runs after the downloader is built so update_self can report
                # progress through it; with no URLs given, -U exits cleanly below.
3227                 if opts.update_self:
3228                         update_self(fd, sys.argv[0])
3229
3230                 # Maybe do nothing
3231                 if len(all_urls) < 1:
3232                         if not opts.update_self:
3233                                 parser.error(u'you must provide at least one URL')
3234                         else:
3235                                 sys.exit()
3236                 retcode = fd.download(all_urls)
3237
3238                 # Dump cookie jar if requested
3239                 if opts.cookiefile is not None:
3240                         try:
3241                                 jar.save()
3242                         except (IOError, OSError), err:
3243                                 sys.exit(u'ERROR: unable to save cookie jar')
3244
3245                 sys.exit(retcode)
3246
        # These handlers close the try: opened above this excerpt; they map the
        # program's top-level failure modes to process exit codes/messages.
3247         except DownloadError:
3248                 sys.exit(1)
3249         except SameFileError:
3250                 sys.exit(u'ERROR: fixed output name but more than one file to download')
3251         except KeyboardInterrupt:
3252                 sys.exit(u'\nERROR: Interrupted by user')