Merge Vimeo support
[youtube-dl.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
# Contributor credits; 'Paweł Paprota' was mojibake ('PaweÅ‚', UTF-8 bytes
# mis-decoded as Latin-1) and is restored here per the file's utf-8 coding line.
__author__  = (
	'Ricardo Garcia Gonzalez',
	'Danny Colligan',
	'Benjamin Johnson',
	'Vasyl\' Vavrychuk',
	'Witold Baryluk',
	'Paweł Paprota',
	'Gergely Imreh',
	)

__license__ = 'Public Domain'
__version__ = '2011.08.24-phihag'
16
17 import cookielib
18 import datetime
19 import gzip
20 import htmlentitydefs
21 import httplib
22 import locale
23 import math
24 import netrc
25 import os
26 import os.path
27 import re
28 import socket
29 import string
30 import subprocess
31 import sys
32 import time
33 import urllib
34 import urllib2
35 import warnings
36 import zlib
37
38 if os.name == 'nt':
39         import ctypes
40
41 try:
42         import email.utils
43 except ImportError: # Python 2.4
44         import email.Utils
45 try:
46         import cStringIO as StringIO
47 except ImportError:
48         import StringIO
49
50 # parse_qs was moved from the cgi module to the urlparse module recently.
51 try:
52         from urlparse import parse_qs
53 except ImportError:
54         from cgi import parse_qs
55
56 try:
57         import lxml.etree
58 except ImportError:
59         pass # Handled below
60
# Default HTTP headers added to every request (see YoutubeDLHandler.http_request,
# which installs them); impersonates a contemporary desktop Firefox 5 browser.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	'Accept-Encoding': 'gzip, deflate',
	'Accept-Language': 'en-us,en;q=0.5',
}
68
69 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
70
# Use the stdlib json module when available (Python >= 2.6); otherwise fall
# back to a bundled minimal recursive-descent parser ("trivialjson") exposing
# only json.loads().
try:
	import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
	import re # NOTE(review): re is already imported at module level; redundant but harmless
	class json(object):
		@staticmethod
		def loads(s):
			"""Parse a UTF-8 encoded JSON document and return the Python value.

			Raises ValueError (with position information) on malformed input.
			All helper parse* functions take an index into s and return a
			tuple (next_index, parsed_value).
			"""
			s = s.decode('UTF-8')
			def raiseError(msg, i):
				# Include the offending position and remaining text to ease debugging.
				raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
			def skipSpace(i, expectMore=True):
				# Advance i past JSON whitespace; if expectMore, fail at end of input.
				while i < len(s) and s[i] in ' \t\r\n':
					i += 1
				if expectMore:
					if i >= len(s):
						raiseError('Premature end', i)
				return i
			def decodeEscape(match):
				# Translate one backslash escape (as matched by rexp in parseString).
				esc = match.group(1)
				_STATIC = {
					'"': '"',
					'\\': '\\',
					'/': '/',
					'b': unichr(0x8),
					'f': unichr(0xc),
					'n': '\n',
					'r': '\r',
					't': '\t',
				}
				if esc in _STATIC:
					return _STATIC[esc]
				if esc[0] == 'u':
					if len(esc) == 1+4:
						# Plain \uXXXX escape.
						return unichr(int(esc[1:5], 16))
					if len(esc) == 5+6 and esc[5:7] == '\\u':
						# Surrogate pair \uD8xx\uDCxx -> single astral code point.
						hi = int(esc[1:5], 16)
						low = int(esc[7:11], 16)
						return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
				raise ValueError('Unknown escape ' + str(esc))
			def parseString(i):
				# i points at the opening quote; returns (index past closing quote, text).
				i += 1
				e = i
				while True:
					e = s.index('"', e)
					# Count the backslashes immediately preceding the quote; an
					# odd count means the quote is escaped and the string continues.
					bslashes = 0
					while s[e-bslashes-1] == '\\':
						bslashes += 1
					if bslashes % 2 == 1:
						e += 1
						continue
					break
				# Alternatives: surrogate pair, \uXXXX, any single char, or a
				# trailing lone backslash ($).
				rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
				stri = rexp.sub(decodeEscape, s[i:e])
				return (e+1,stri)
			def parseObj(i):
				# i points at '{'.
				i += 1
				res = {}
				i = skipSpace(i)
				if s[i] == '}': # Empty dictionary
					return (i+1,res)
				while True:
					if s[i] != '"':
						raiseError('Expected a string object key', i)
					i,key = parseString(i)
					i = skipSpace(i)
					if i >= len(s) or s[i] != ':':
						raiseError('Expected a colon', i)
					i,val = parse(i+1)
					res[key] = val
					i = skipSpace(i)
					if s[i] == '}':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected comma or closing curly brace', i)
					i = skipSpace(i+1)
			def parseArray(i):
				# i points at '['.
				res = []
				i = skipSpace(i+1)
				if s[i] == ']': # Empty array
					return (i+1,res)
				while True:
					i,val = parse(i)
					res.append(val)
					i = skipSpace(i) # Raise exception if premature end
					if s[i] == ']':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected a comma or closing bracket', i)
					i = skipSpace(i+1)
			def parseDiscrete(i):
				# Literals true / false / null.
				for k,v in {'true': True, 'false': False, 'null': None}.items():
					if s.startswith(k, i):
						return (i+len(k), v)
				raiseError('Not a boolean (or null)', i)
			def parseNumber(i):
				mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
				if mobj is None:
					raiseError('Not a number', i)
				nums = mobj.group(1)
				# A fraction or exponent marker makes it a float; otherwise int.
				if '.' in nums or 'e' in nums or 'E' in nums:
					return (i+len(nums), float(nums))
				return (i+len(nums), int(nums))
			# Dispatch on the first significant character; numbers are the default.
			CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
			def parse(i):
				i = skipSpace(i)
				i,res = CHARMAP.get(s[i], parseNumber)(i)
				i = skipSpace(i, False)
				return (i,res)
			i,res = parse(0)
			if i < len(s):
				raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
			return res
183
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks: if the reported
	encoding cannot actually encode text, fall back to UTF-8.
	"""
	# The original wrapped this in a one-shot generator and called .next();
	# that indirection added nothing, so it is a plain try/except now.
	try:
		pref = locale.getpreferredencoding()
		# Probe the codec; a broken or unknown locale raises LookupError here.
		u'TEST'.encode(pref)
	except Exception:
		# Narrowed from a bare "except:" so KeyboardInterrupt/SystemExit
		# are not swallowed; any encoding problem still falls back to UTF-8.
		pref = 'UTF-8'
	return pref
199
200 def htmlentity_transform(matchobj):
201         """Transforms an HTML entity to a Unicode character.
202
203         This function receives a match object and is intended to be used with
204         the re.sub() function.
205         """
206         entity = matchobj.group(1)
207
208         # Known non-numeric HTML entity
209         if entity in htmlentitydefs.name2codepoint:
210                 return unichr(htmlentitydefs.name2codepoint[entity])
211
212         # Unicode character
213         mobj = re.match(ur'(?u)#(x?\d+)', entity)
214         if mobj is not None:
215                 numstr = mobj.group(1)
216                 if numstr.startswith(u'x'):
217                         base = 16
218                         numstr = u'0%s' % numstr
219                 else:
220                         base = 10
221                 return unichr(long(numstr, base))
222
223         # Unknown entity in name, return its literal representation
224         return (u'&%s;' % entity)
225
226 def sanitize_title(utitle):
227         """Sanitizes a video title so it could be used as part of a filename."""
228         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
229         return utitle.replace(unicode(os.sep), u'%')
230
def sanitize_open(filename, open_mode):
	"""Try to open the given filename, and slightly tweak it if this fails.

	Attempts to open the given filename. If this fails, it tries to change
	the filename slightly, step by step, until it's either able to open it
	or it fails and raises a final exception, like the standard open()
	function.

	It returns the tuple (stream, definitive_file_name).
	"""
	try:
		if filename == u'-':
			# '-' means standard output; on Windows, switch stdout to binary
			# mode first so video data is not mangled by CRLF translation.
			if sys.platform == 'win32':
				import msvcrt
				msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
			return (sys.stdout, filename)
		stream = open(filename, open_mode)
		return (stream, filename)
	except (IOError, OSError), err:
		# In case of error, try to remove win32 forbidden chars
		filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

		# An exception here should be caught in the caller
		stream = open(filename, open_mode)
		return (stream, filename)
256
def timeconvert(timestr):
	"""Convert RFC 2822 defined time string into system timestamp.

	Returns None when timestr cannot be parsed. (This function was the only
	one in the file indented with spaces; now uses tabs like the rest.)
	"""
	timetuple = email.utils.parsedate_tz(timestr)
	if timetuple is None:
		return None
	return email.utils.mktime_tz(timetuple)
264
class DownloadError(Exception):
	"""Download Error exception.

	This exception may be thrown by FileDownloader objects if they are not
	configured to continue on errors. They will contain the appropriate
	error message. Raised from FileDownloader.trouble() when the
	'ignoreerrors' option is unset.
	"""
	pass
273
class SameFileError(Exception):
	"""Same File exception.

	This exception will be thrown by FileDownloader objects if they detect
	multiple files would have to be downloaded to the same file on disk.
	"""
	pass
281
class PostProcessingError(Exception):
	"""Post Processing exception.

	This exception may be raised by PostProcessor's .run() method to
	indicate an error in the postprocessing task.
	"""
	pass
289
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	This exception will be thrown when a video is requested
	in a format that is not available for that video.
	"""
	pass
297
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	This exception may be raised by FileDownloader objects when a file they
	download is too small for what the server announced first, indicating
	the connection was probably interrupted.
	"""
	# Both in bytes; class-level defaults, overwritten per-instance in __init__.
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		self.downloaded = downloaded
		self.expected = expected
312
class YoutubeDLHandler(urllib2.HTTPHandler):
	"""Handler for HTTP requests and responses.

	This class, when installed with an OpenerDirector, automatically adds
	the standard headers to every HTTP request and handles gzipped and
	deflated responses from web servers. If compression is to be avoided in
	a particular request, the original request in the program code only has
	to include the HTTP header "Youtubedl-No-Compression", which will be
	removed before making the real request.

	Part of this code was copied from:

	  http://techknack.net/python-urllib2-handlers/

	Andrew Rowls, the author of that code, agreed to release it to the
	public domain.
	"""

	@staticmethod
	def deflate(data):
		# Some servers send raw deflate streams (no zlib header); try that
		# first, then fall back to a regular zlib-wrapped stream.
		try:
			return zlib.decompress(data, -zlib.MAX_WBITS)
		except zlib.error:
			return zlib.decompress(data)

	@staticmethod
	def addinfourl_wrapper(stream, headers, url, code):
		# Compatibility shim: older Pythons' addinfourl lacks getcode(), so
		# the status code must be attached manually.
		if hasattr(urllib2.addinfourl, 'getcode'):
			return urllib2.addinfourl(stream, headers, url, code)
		ret = urllib2.addinfourl(stream, headers, url)
		ret.code = code
		return ret

	def http_request(self, req):
		# Force the standard headers, replacing any same-named header already
		# present on the request.
		for h in std_headers:
			if h in req.headers:
				del req.headers[h]
			req.add_header(h, std_headers[h])
		# Header names here are capitalized ('Youtubedl-no-compression')
		# because urllib2 stores them that way — presumably via
		# Request.add_header's capitalization; confirm against urllib2 docs.
		if 'Youtubedl-no-compression' in req.headers:
			if 'Accept-encoding' in req.headers:
				del req.headers['Accept-encoding']
			del req.headers['Youtubedl-no-compression']
		return req

	def http_response(self, req, resp):
		# Transparently decompress the body, preserving the original
		# response's headers, URL, status code and msg.
		old_resp = resp
		# gzip
		if resp.headers.get('Content-encoding', '') == 'gzip':
			gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		# deflate
		if resp.headers.get('Content-encoding', '') == 'deflate':
			gz = StringIO.StringIO(self.deflate(resp.read()))
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		return resp
370
371 class FileDownloader(object):
372         """File Downloader class.
373
374         File downloader objects are the ones responsible of downloading the
375         actual video file and writing it to disk if the user has requested
376         it, among some other tasks. In most cases there should be one per
377         program. As, given a video URL, the downloader doesn't know how to
378         extract all the needed information, task that InfoExtractors do, it
379         has to pass the URL to one of them.
380
381         For this, file downloader objects have a method that allows
382         InfoExtractors to be registered in a given order. When it is passed
383         a URL, the file downloader handles it to the first InfoExtractor it
384         finds that reports being able to handle it. The InfoExtractor extracts
385         all the information about the video or videos the URL refers to, and
386         asks the FileDownloader to process the video information, possibly
387         downloading the video.
388
389         File downloaders accept a lot of parameters. In order not to saturate
390         the object constructor with arguments, it receives a dictionary of
391         options instead. These options are available through the params
392         attribute for the InfoExtractors to use. The FileDownloader also
393         registers itself as the downloader in charge for the InfoExtractors
394         that are added to it, so this is a "mutual registration".
395
396         Available options:
397
398         username:         Username for authentication purposes.
399         password:         Password for authentication purposes.
400         usenetrc:         Use netrc for authentication instead.
401         quiet:            Do not print messages to stdout.
402         forceurl:         Force printing final URL.
403         forcetitle:       Force printing title.
404         forcethumbnail:   Force printing thumbnail URL.
405         forcedescription: Force printing description.
406         forcefilename:    Force printing final filename.
407         simulate:         Do not download the video files.
408         format:           Video format code.
409         format_limit:     Highest quality format to try.
410         outtmpl:          Template for output names.
411         ignoreerrors:     Do not stop on download errors.
412         ratelimit:        Download speed limit, in bytes/sec.
413         nooverwrites:     Prevent overwriting files.
414         retries:          Number of times to retry for HTTP error 5xx
415         continuedl:       Try to continue downloads if possible.
416         noprogress:       Do not print the progress bar.
417         playliststart:    Playlist item to start at.
418         playlistend:      Playlist item to end at.
419         logtostderr:      Log messages to stderr instead of stdout.
420         consoletitle:     Display progress in console window's titlebar.
421         nopart:           Do not use temporary .part files.
422         updatetime:       Use the Last-modified header to set output file timestamps.
423         writedescription: Write the video description to a .description file
424         writeinfojson:    Write the video description to a .info.json file
425         """
426
	# Class-level defaults; every one of these is reassigned per instance in
	# __init__, so the mutable [] defaults are never shared in practice.
	params = None # Options dictionary; see the class docstring for keys
	_ies = [] # Registered InfoExtractor objects
	_pps = [] # Registered PostProcessor objects
	_download_retcode = None # Process exit code (1 after any tolerated error)
	_num_downloads = None # Ordinal counter for %(autonumber)s
	_screen_file = None # Stream for progress output (stdout or stderr)
433
	def __init__(self, params):
		"""Create a FileDownloader object with the given options."""
		self._ies = []
		self._pps = []
		self._download_retcode = 0
		self._num_downloads = 0
		# Boolean indexing: False -> stdout, True -> stderr.
		self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
		self.params = params
442
443         @staticmethod
444         def pmkdir(filename):
445                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
446                 components = filename.split(os.sep)
447                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
448                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
449                 for dir in aggregate:
450                         if not os.path.exists(dir):
451                                 os.mkdir(dir)
452
453         @staticmethod
454         def format_bytes(bytes):
455                 if bytes is None:
456                         return 'N/A'
457                 if type(bytes) is str:
458                         bytes = float(bytes)
459                 if bytes == 0.0:
460                         exponent = 0
461                 else:
462                         exponent = long(math.log(bytes, 1024.0))
463                 suffix = 'bkMGTPEZY'[exponent]
464                 converted = float(bytes) / float(1024**exponent)
465                 return '%.2f%s' % (converted, suffix)
466
467         @staticmethod
468         def calc_percent(byte_counter, data_len):
469                 if data_len is None:
470                         return '---.-%'
471                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
472
473         @staticmethod
474         def calc_eta(start, now, total, current):
475                 if total is None:
476                         return '--:--'
477                 dif = now - start
478                 if current == 0 or dif < 0.001: # One millisecond
479                         return '--:--'
480                 rate = float(current) / dif
481                 eta = long((float(total) - float(current)) / rate)
482                 (eta_mins, eta_secs) = divmod(eta, 60)
483                 if eta_mins > 99:
484                         return '--:--'
485                 return '%02d:%02d' % (eta_mins, eta_secs)
486
487         @staticmethod
488         def calc_speed(start, now, bytes):
489                 dif = now - start
490                 if bytes == 0 or dif < 0.001: # One millisecond
491                         return '%10s' % '---b/s'
492                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
493
494         @staticmethod
495         def best_block_size(elapsed_time, bytes):
496                 new_min = max(bytes / 2.0, 1.0)
497                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
498                 if elapsed_time < 0.001:
499                         return long(new_max)
500                 rate = bytes / elapsed_time
501                 if rate > new_max:
502                         return long(new_max)
503                 if rate < new_min:
504                         return long(new_min)
505                 return long(rate)
506
507         @staticmethod
508         def parse_bytes(bytestr):
509                 """Parse a string indicating a byte quantity into a long integer."""
510                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
511                 if matchobj is None:
512                         return None
513                 number = float(matchobj.group(1))
514                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
515                 return long(round(number * multiplier))
516
	def add_info_extractor(self, ie):
		"""Add an InfoExtractor object to the end of the list.

		Also registers self as the extractor's downloader (the "mutual
		registration" described in the class docstring).
		"""
		self._ies.append(ie)
		ie.set_downloader(self)
521
	def add_post_processor(self, pp):
		"""Add a PostProcessor object to the end of the chain.

		Also registers self as the post-processor's downloader.
		"""
		self._pps.append(pp)
		pp.set_downloader(self)
526
	def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode.

		The trailing comma on the print statement suppresses print's own
		newline; the terminator (u'\\n' or u'') controls the line ending
		instead, so skip_eol=True leaves the cursor on the same line.
		"""
		try:
			if not self.params.get('quiet', False):
				# Boolean indexing: skip_eol False -> newline, True -> nothing.
				terminator = [u'\n', u''][skip_eol]
				print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
			self._screen_file.flush()
		except (UnicodeEncodeError), err:
			if not ignore_encoding_errors:
				raise
537
	def to_stderr(self, message):
		"""Print message to stderr, encoded with the system's preferred encoding."""
		print >>sys.stderr, message.encode(preferredencoding())
541
	def to_cons_title(self, message):
		"""Set console/terminal window title to message.

		No-op unless the 'consoletitle' option is set. Uses the Win32 API
		via ctypes on Windows consoles, and the xterm title escape sequence
		on terminals that advertise themselves through $TERM.
		"""
		if not self.params.get('consoletitle', False):
			return
		if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
			# c_wchar_p() might not be necessary if `message` is
			# already of type unicode()
			ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
		elif 'TERM' in os.environ:
			# OSC 0 escape sequence: set icon name and window title.
			sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
552
553         def fixed_template(self):
554                 """Checks if the output template is fixed."""
555                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
556
557         def trouble(self, message=None):
558                 """Determine action to take when a download problem appears.
559
560                 Depending on if the downloader has been configured to ignore
561                 download errors or not, this method may throw an exception or
562                 not when errors are found, after printing the message.
563                 """
564                 if message is not None:
565                         self.to_stderr(message)
566                 if not self.params.get('ignoreerrors', False):
567                         raise DownloadError(message)
568                 self._download_retcode = 1
569
570         def slow_down(self, start_time, byte_counter):
571                 """Sleep if the download speed is over the rate limit."""
572                 rate_limit = self.params.get('ratelimit', None)
573                 if rate_limit is None or byte_counter == 0:
574                         return
575                 now = time.time()
576                 elapsed = now - start_time
577                 if elapsed <= 0.0:
578                         return
579                 speed = float(byte_counter) / elapsed
580                 if speed > rate_limit:
581                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
582
583         def temp_name(self, filename):
584                 """Returns a temporary filename for the given filename."""
585                 if self.params.get('nopart', False) or filename == u'-' or \
586                                 (os.path.exists(filename) and not os.path.isfile(filename)):
587                         return filename
588                 return filename + u'.part'
589
590         def undo_temp_name(self, filename):
591                 if filename.endswith(u'.part'):
592                         return filename[:-len(u'.part')]
593                 return filename
594
595         def try_rename(self, old_filename, new_filename):
596                 try:
597                         if old_filename == new_filename:
598                                 return
599                         os.rename(old_filename, new_filename)
600                 except (IOError, OSError), err:
601                         self.trouble(u'ERROR: unable to rename file')
602         
603         def try_utime(self, filename, last_modified_hdr):
604                 """Try to set the last-modified time of the given file."""
605                 if last_modified_hdr is None:
606                         return
607                 if not os.path.isfile(filename):
608                         return
609                 timestr = last_modified_hdr
610                 if timestr is None:
611                         return
612                 filetime = timeconvert(timestr)
613                 if filetime is None:
614                         return
615                 try:
616                         os.utime(filename,(time.time(), filetime))
617                 except:
618                         pass
619
	def report_writedescription(self, descfn):
		"""Report that the description file is being written."""
		self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
623
	def report_writeinfojson(self, infofn):
		"""Report that the .info.json metadata file has been written."""
		self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
627
	def report_destination(self, filename):
		"""Report destination filename."""
		self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
631
632         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
633                 """Report download progress."""
634                 if self.params.get('noprogress', False):
635                         return
636                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
637                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
638                 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
639                                 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
640
	def report_resuming_byte(self, resume_len):
		"""Report attempt to resume at given byte."""
		self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
644
	def report_retry(self, count, retries):
		"""Report retry in case of HTTP error 5xx."""
		self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
648
	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		try:
			self.to_screen(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			# The file name may not be representable in the console encoding;
			# fall back to a message without it.
			self.to_screen(u'[download] The file has already been downloaded')
655
	def report_unable_to_resume(self):
		"""Report it was impossible to resume download."""
		self.to_screen(u'[download] Unable to resume')
659
660         def report_finish(self):
661                 """Report download finished."""
662                 if self.params.get('noprogress', False):
663                         self.to_screen(u'[download] Download completed')
664                 else:
665                         self.to_screen(u'')
666
667         def increment_downloads(self):
668                 """Increment the ordinal that assigns a number to each file."""
669                 self._num_downloads += 1
670
671         def prepare_filename(self, info_dict):
672                 """Generate the output filename."""
673                 try:
674                         template_dict = dict(info_dict)
675                         template_dict['epoch'] = unicode(long(time.time()))
676                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
677                         filename = self.params['outtmpl'] % template_dict
678                         return filename
679                 except (ValueError, KeyError), err:
680                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
681                         return None
682
	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor.

		In order: handle simulate mode (forced printings only), honour
		--no-overwrites, create output directories, optionally write the
		.description and .info.json sidecar files, download the video
		data and finally run the postprocessing chain.  All failures are
		routed through self.trouble().
		"""
		filename = self.prepare_filename(info_dict)
		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			# Forced printings
			if self.params.get('forcetitle', False):
				print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forceurl', False):
				print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
				print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcedescription', False) and 'description' in info_dict:
				print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcefilename', False) and filename is not None:
				print filename.encode(preferredencoding(), 'xmlcharrefreplace')

			return

		# prepare_filename() reported the error already.
		if filename is None:
			return
		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists and will be skipped')
			return

		# Create any missing intermediate directories of the output path.
		try:
			self.pmkdir(filename)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to create directories: %s' % str(err))
			return

		if self.params.get('writedescription', False):
			try:
				descfn = filename + '.description'
				self.report_writedescription(descfn)
				descfile = open(descfn, 'wb')
				try:
					descfile.write(info_dict['description'].encode('utf-8'))
				finally:
					descfile.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write description file: %s' % str(descfn))
				return

		if self.params.get('writeinfojson', False):
			infofn = filename + '.info.json'
			self.report_writeinfojson(infofn)
			# Probe for a usable JSON encoder: the trivialjson fallback
			# bundled for Python <2.6 only provides loads(), not dump().
			try:
				json.dump
			except (NameError,AttributeError):
				self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
				return
			try:
				infof = open(infofn, 'wb')
				try:
					json.dump(info_dict, infof)
				finally:
					infof.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write metadata to JSON file: %s' % str(infofn))
				return

		# Download the actual video data; network errors are reported,
		# local I/O errors are escalated as UnavailableVideoError.
		try:
			success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
		except (OSError, IOError), err:
			raise UnavailableVideoError
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self.trouble(u'ERROR: unable to download video data: %s' % str(err))
			return
		except (ContentTooShortError, ), err:
			self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
			return

		if success:
			try:
				self.post_process(filename, info_dict)
			except (PostProcessingError), err:
				self.trouble(u'ERROR: postprocessing: %s' % str(err))
				return
762
763         def download(self, url_list):
764                 """Download a given list of URLs."""
765                 if len(url_list) > 1 and self.fixed_template():
766                         raise SameFileError(self.params['outtmpl'])
767
768                 for url in url_list:
769                         suitable_found = False
770                         for ie in self._ies:
771                                 # Go to next InfoExtractor if not suitable
772                                 if not ie.suitable(url):
773                                         continue
774
775                                 # Suitable InfoExtractor found
776                                 suitable_found = True
777
778                                 # Extract information from URL and process it
779                                 ie.extract(url)
780
781                                 # Suitable InfoExtractor had been found; go to next URL
782                                 break
783
784                         if not suitable_found:
785                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
786
787                 return self._download_retcode
788
789         def post_process(self, filename, ie_info):
790                 """Run the postprocessing chain on the given file."""
791                 info = dict(ie_info)
792                 info['filepath'] = filename
793                 for pp in self._pps:
794                         info = pp.run(info)
795                         if info is None:
796                                 break
797
798         def _download_with_rtmpdump(self, filename, url, player_url):
799                 self.report_destination(filename)
800                 tmpfilename = self.temp_name(filename)
801
802                 # Check for rtmpdump first
803                 try:
804                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
805                 except (OSError, IOError):
806                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
807                         return False
808
809                 # Download using rtmpdump. rtmpdump returns exit code 2 when
810                 # the connection was interrumpted and resuming appears to be
811                 # possible. This is part of rtmpdump's normal usage, AFAIK.
812                 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
813                 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
814                 while retval == 2 or retval == 1:
815                         prevsize = os.path.getsize(tmpfilename)
816                         self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
817                         time.sleep(5.0) # This seems to be needed
818                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
819                         cursize = os.path.getsize(tmpfilename)
820                         if prevsize == cursize and retval == 1:
821                                 break
822                 if retval == 0:
823                         self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
824                         self.try_rename(tmpfilename, filename)
825                         return True
826                 else:
827                         self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
828                         return False
829
	def _do_download(self, filename, url, player_url):
		"""Download url into filename, resuming a partial file if possible.

		Delegates rtmp:// URLs to _download_with_rtmpdump().  HTTP
		downloads go to a temporary (.part) file first, support the
		Range header for resuming, retry on HTTP 5xx up to
		self.params['retries'] times, adapt the read block size to the
		measured speed and honour the rate limit.  Returns True on
		success, False (after self.trouble()) on failure.
		"""
		# Check file already present
		if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
			self.report_file_already_downloaded(filename)
			return True

		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url, player_url)

		tmpfilename = self.temp_name(filename)
		stream = None
		open_mode = 'wb'

		# Do not include the Accept-Encoding header
		headers = {'Youtubedl-no-compression': 'True'}
		# basic_request never carries the Range header; it is reused to
		# re-check the full length when a resume fails with HTTP 416.
		basic_request = urllib2.Request(url, None, headers)
		request = urllib2.Request(url, None, headers)

		# Establish possible resume length
		if os.path.isfile(tmpfilename):
			resume_len = os.path.getsize(tmpfilename)
		else:
			resume_len = 0

		# Request parameters in case of being able to resume
		if self.params.get('continuedl', False) and resume_len != 0:
			self.report_resuming_byte(resume_len)
			request.add_header('Range','bytes=%d-' % resume_len)
			open_mode = 'ab'

		count = 0
		retries = self.params.get('retries', 0)
		while count <= retries:
			# Establish connection
			try:
				data = urllib2.urlopen(request)
				break
			except (urllib2.HTTPError, ), err:
				if (err.code < 500 or err.code >= 600) and err.code != 416:
					# Unexpected HTTP error
					raise
				elif err.code == 416:
					# Unable to resume (requested range not satisfiable)
					try:
						# Open the connection again without the range header
						data = urllib2.urlopen(basic_request)
						content_length = data.info()['Content-Length']
					except (urllib2.HTTPError, ), err:
						if err.code < 500 or err.code >= 600:
							raise
					else:
						# Examine the reported length
						if (content_length is not None and
							(resume_len - 100 < long(content_length) < resume_len + 100)):
							# The file had already been fully downloaded.
							# Explanation to the above condition: in issue #175 it was revealed that
							# YouTube sometimes adds or removes a few bytes from the end of the file,
							# changing the file size slightly and causing problems for some users. So
							# I decided to implement a suggested change and consider the file
							# completely downloaded if the file size differs less than 100 bytes from
							# the one in the hard drive.
							self.report_file_already_downloaded(filename)
							self.try_rename(tmpfilename, filename)
							return True
						else:
							# The length does not match, we start the download over
							self.report_unable_to_resume()
							open_mode = 'wb'
							break
			# Retry
			count += 1
			if count <= retries:
				self.report_retry(count, retries)

		if count > retries:
			self.trouble(u'ERROR: giving up after %s retries' % retries)
			return False

		data_len = data.info().get('Content-length', None)
		if data_len is not None:
			data_len = long(data_len) + resume_len
		data_len_str = self.format_bytes(data_len)
		# Start counting from the already-downloaded prefix.
		byte_counter = 0 + resume_len
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			if len(data_block) == 0:
				break
			byte_counter += len(data_block)

			# Open file just in time
			if stream is None:
				try:
					(stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
					filename = self.undo_temp_name(tmpfilename)
					self.report_destination(filename)
				except (OSError, IOError), err:
					self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
					return False
			try:
				stream.write(data_block)
			except (IOError, OSError), err:
				self.trouble(u'\nERROR: unable to write data: %s' % str(err))
				return False
			block_size = self.best_block_size(after - before, len(data_block))

			# Progress message
			percent_str = self.calc_percent(byte_counter, data_len)
			# NOTE(review): if the server sends no Content-Length,
			# data_len is None and this subtraction raises TypeError —
			# verify upstream always provides the header.
			eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
			speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
			self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter - resume_len)

		# NOTE(review): a zero-byte response leaves stream as None, so
		# this close would raise AttributeError.
		stream.close()
		self.report_finish()
		if data_len is not None and byte_counter != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		self.try_rename(tmpfilename, filename)

		# Update file modification time
		if self.params.get('updatetime', True):
			self.try_utime(filename, data.info().get('last-modified', None))

		return True
961
class InfoExtractor(object):
	"""Base class for all information extractors.

	An information extractor takes a URL and pulls out everything the
	downloader needs to know about the video (or videos) it refers to:
	the real media URL, the title, a simplified title, the uploader and
	so on.  That data is handed to the FileDownloader as a dictionary,
	which may then download the video to the file system, print
	information about it, etc.  Each dictionary must carry the
	following keys:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	The following keys are optional.  They mainly allow youtube-dl to
	serve as the backend of a video search function, such as the one in
	youtube2mp3, and are only read by the corresponding forced-printing
	options:

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Concrete extractors should override _real_initialize() and
	_real_extract(), as well as the static suitable() method, and are
	normally instantiated and registered with the main downloader.
	"""

	_ready = False          # becomes True after _real_initialize() has run
	_downloader = None      # FileDownloader instance (or None)

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	@staticmethod
	def suitable(url):
		"""Return True if this IE can handle the given URL."""
		return False

	def initialize(self):
		"""Perform one-time setup (authentication, etc.) on first use."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extract URL information and return it as a list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Attach the downloader this IE reports to."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
1032
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Group 1 matches the URL prefix (scheme/host/path variants);
	# group 2 captures the video id.  The (?(1).+)? conditional only
	# allows trailing text when a real URL prefix was present.
	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	# Machine name looked up in the user's ~/.netrc for credentials.
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
	# Maps a format code to its file extension; codes absent from this
	# table presumably fall back to a default elsewhere — TODO confirm.
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'45': 'webm',
	}
1053
1054         @staticmethod
1055         def suitable(url):
1056                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
1057
	def report_lang(self):
		"""Report attempt to set language."""
		self._downloader.to_screen(u'[youtube] Setting language')

	def report_login(self):
		"""Report attempt to log in."""
		self._downloader.to_screen(u'[youtube] Logging in')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[youtube] Confirming age')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

	def report_video_info_webpage_download(self, video_id):
		"""Report attempt to download video info webpage."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

	def report_unavailable_format(self, video_id, format):
		"""Report that the requested format is not available."""
		self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

	def report_rtmp_download(self):
		"""Indicate the download will use the RTMP protocol."""
		self._downloader.to_screen(u'[youtube] RTMP download detected')
1089
	def _real_initialize(self):
		"""Set the YouTube interface language and optionally log in.

		Credentials come from the --username/--password options or, with
		--netrc, from the 'youtube' machine entry in ~/.netrc.  All
		failures here are non-fatal warnings except age confirmation,
		which is reported through trouble().
		"""
		# Without a downloader there is nowhere to report progress/errors.
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':		'/',
				'action_login':	'Log In',
				'username':	username,
				'password':	password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the response still contains the login form, the
			# credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':		'/',
				'action_confirm':	'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return
1158
1159         def _real_extract(self, url):
1160                 # Extract video id from URL
1161                 mobj = re.match(self._VALID_URL, url)
1162                 if mobj is None:
1163                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1164                         return
1165                 video_id = mobj.group(2)
1166
1167                 # Get video webpage
1168                 self.report_video_webpage_download(video_id)
1169                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id)
1170                 try:
1171                         video_webpage = urllib2.urlopen(request).read()
1172                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1173                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1174                         return
1175
1176                 # Attempt to extract SWF player URL
1177                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1178                 if mobj is not None:
1179                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1180                 else:
1181                         player_url = None
1182
1183                 # Get video info
1184                 self.report_video_info_webpage_download(video_id)
1185                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1186                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1187                                            % (video_id, el_type))
1188                         request = urllib2.Request(video_info_url)
1189                         try:
1190                                 video_info_webpage = urllib2.urlopen(request).read()
1191                                 video_info = parse_qs(video_info_webpage)
1192                                 if 'token' in video_info:
1193                                         break
1194                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1195                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1196                                 return
1197                 if 'token' not in video_info:
1198                         if 'reason' in video_info:
1199                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1200                         else:
1201                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1202                         return
1203
1204                 # Start extracting information
1205                 self.report_information_extraction(video_id)
1206
1207                 # uploader
1208                 if 'author' not in video_info:
1209                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1210                         return
1211                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1212
1213                 # title
1214                 if 'title' not in video_info:
1215                         self._downloader.trouble(u'ERROR: unable to extract video title')
1216                         return
1217                 video_title = urllib.unquote_plus(video_info['title'][0])
1218                 video_title = video_title.decode('utf-8')
1219                 video_title = sanitize_title(video_title)
1220
1221                 # simplified title
1222                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1223                 simple_title = simple_title.strip(ur'_')
1224
1225                 # thumbnail image
1226                 if 'thumbnail_url' not in video_info:
1227                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1228                         video_thumbnail = ''
1229                 else:   # don't panic if we can't find it
1230                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1231
1232                 # upload date
1233                 upload_date = u'NA'
1234                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1235                 if mobj is not None:
1236                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1237                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1238                         for expression in format_expressions:
1239                                 try:
1240                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1241                                 except:
1242                                         pass
1243
1244                 # description
1245                 try:
1246                         lxml.etree
1247                 except NameError:
1248                         video_description = u'No description available.'
1249                         if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1250                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1251                                 if mobj is not None:
1252                                         video_description = mobj.group(1).decode('utf-8')
1253                 else:
1254                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1255                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1256                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1257                         # TODO use another parser
1258
1259                 # token
1260                 video_token = urllib.unquote_plus(video_info['token'][0])
1261
1262                 # Decide which formats to download
1263                 req_format = self._downloader.params.get('format', None)
1264
1265                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1266                         self.report_rtmp_download()
1267                         video_url_list = [(None, video_info['conn'][0])]
1268                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1269                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1270                         url_data = [parse_qs(uds) for uds in url_data_strs]
1271                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1272                         url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1273
1274                         format_limit = self._downloader.params.get('format_limit', None)
1275                         if format_limit is not None and format_limit in self._available_formats:
1276                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1277                         else:
1278                                 format_list = self._available_formats
1279                         existing_formats = [x for x in format_list if x in url_map]
1280                         if len(existing_formats) == 0:
1281                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1282                                 return
1283                         if req_format is None:
1284                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1285                         elif req_format == '-1':
1286                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1287                         else:
1288                                 # Specific format
1289                                 if req_format not in url_map:
1290                                         self._downloader.trouble(u'ERROR: requested format not available')
1291                                         return
1292                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
1293                 else:
1294                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1295                         return
1296
1297                 for format_param, video_real_url in video_url_list:
1298                         # At this point we have a new video
1299                         self._downloader.increment_downloads()
1300
1301                         # Extension
1302                         video_extension = self._video_extensions.get(format_param, 'flv')
1303
1304                         try:
1305                                 # Process video information
1306                                 self._downloader.process_info({
1307                                         'id':           video_id.decode('utf-8'),
1308                                         'url':          video_real_url.decode('utf-8'),
1309                                         'uploader':     video_uploader.decode('utf-8'),
1310                                         'upload_date':  upload_date,
1311                                         'title':        video_title,
1312                                         'stitle':       simple_title,
1313                                         'ext':          video_extension.decode('utf-8'),
1314                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1315                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1316                                         'description':  video_description,
1317                                         'player_url':   player_url,
1318                                 })
1319                         except UnavailableVideoError, err:
1320                                 self._downloader.trouble(u'\nERROR: unable to download video')
1321
1322
1323 class MetacafeIE(InfoExtractor):
1324         """Information Extractor for metacafe.com."""
1325
1326         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1327         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1328         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1329         _youtube_ie = None
1330
1331         def __init__(self, youtube_ie, downloader=None):
1332                 InfoExtractor.__init__(self, downloader)
1333                 self._youtube_ie = youtube_ie
1334
1335         @staticmethod
1336         def suitable(url):
1337                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1338
1339         def report_disclaimer(self):
1340                 """Report disclaimer retrieval."""
1341                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1342
1343         def report_age_confirmation(self):
1344                 """Report attempt to confirm age."""
1345                 self._downloader.to_screen(u'[metacafe] Confirming age')
1346
1347         def report_download_webpage(self, video_id):
1348                 """Report webpage download."""
1349                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1350
1351         def report_extraction(self, video_id):
1352                 """Report information extraction."""
1353                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1354
1355         def _real_initialize(self):
1356                 # Retrieve disclaimer
1357                 request = urllib2.Request(self._DISCLAIMER)
1358                 try:
1359                         self.report_disclaimer()
1360                         disclaimer = urllib2.urlopen(request).read()
1361                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1362                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1363                         return
1364
1365                 # Confirm age
1366                 disclaimer_form = {
1367                         'filters': '0',
1368                         'submit': "Continue - I'm over 18",
1369                         }
1370                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1371                 try:
1372                         self.report_age_confirmation()
1373                         disclaimer = urllib2.urlopen(request).read()
1374                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1375                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1376                         return
1377
1378         def _real_extract(self, url):
1379                 # Extract id and simplified title from URL
1380                 mobj = re.match(self._VALID_URL, url)
1381                 if mobj is None:
1382                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1383                         return
1384
1385                 video_id = mobj.group(1)
1386
1387                 # Check if video comes from YouTube
1388                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1389                 if mobj2 is not None:
1390                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1391                         return
1392
1393                 # At this point we have a new video
1394                 self._downloader.increment_downloads()
1395
1396                 simple_title = mobj.group(2).decode('utf-8')
1397
1398                 # Retrieve video webpage to extract further information
1399                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1400                 try:
1401                         self.report_download_webpage(video_id)
1402                         webpage = urllib2.urlopen(request).read()
1403                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1404                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1405                         return
1406
1407                 # Extract URL, uploader and title from webpage
1408                 self.report_extraction(video_id)
1409                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1410                 if mobj is not None:
1411                         mediaURL = urllib.unquote(mobj.group(1))
1412                         video_extension = mediaURL[-3:]
1413
1414                         # Extract gdaKey if available
1415                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1416                         if mobj is None:
1417                                 video_url = mediaURL
1418                         else:
1419                                 gdaKey = mobj.group(1)
1420                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1421                 else:
1422                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1423                         if mobj is None:
1424                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1425                                 return
1426                         vardict = parse_qs(mobj.group(1))
1427                         if 'mediaData' not in vardict:
1428                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1429                                 return
1430                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1431                         if mobj is None:
1432                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1433                                 return
1434                         mediaURL = mobj.group(1).replace('\\/', '/')
1435                         video_extension = mediaURL[-3:]
1436                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1437
1438                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1439                 if mobj is None:
1440                         self._downloader.trouble(u'ERROR: unable to extract title')
1441                         return
1442                 video_title = mobj.group(1).decode('utf-8')
1443                 video_title = sanitize_title(video_title)
1444
1445                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1446                 if mobj is None:
1447                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1448                         return
1449                 video_uploader = mobj.group(1)
1450
1451                 try:
1452                         # Process video information
1453                         self._downloader.process_info({
1454                                 'id':           video_id.decode('utf-8'),
1455                                 'url':          video_url.decode('utf-8'),
1456                                 'uploader':     video_uploader.decode('utf-8'),
1457                                 'upload_date':  u'NA',
1458                                 'title':        video_title,
1459                                 'stitle':       simple_title,
1460                                 'ext':          video_extension.decode('utf-8'),
1461                                 'format':       u'NA',
1462                                 'player_url':   None,
1463                         })
1464                 except UnavailableVideoError:
1465                         self._downloader.trouble(u'\nERROR: unable to download video')
1466
1467
1468 class DailymotionIE(InfoExtractor):
1469         """Information Extractor for Dailymotion"""
1470
1471         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1472
1473         def __init__(self, downloader=None):
1474                 InfoExtractor.__init__(self, downloader)
1475
1476         @staticmethod
1477         def suitable(url):
1478                 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1479
1480         def report_download_webpage(self, video_id):
1481                 """Report webpage download."""
1482                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1483
1484         def report_extraction(self, video_id):
1485                 """Report information extraction."""
1486                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1487
1488         def _real_initialize(self):
1489                 return
1490
1491         def _real_extract(self, url):
1492                 # Extract id and simplified title from URL
1493                 mobj = re.match(self._VALID_URL, url)
1494                 if mobj is None:
1495                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1496                         return
1497
1498                 # At this point we have a new video
1499                 self._downloader.increment_downloads()
1500                 video_id = mobj.group(1)
1501
1502                 simple_title = mobj.group(2).decode('utf-8')
1503                 video_extension = 'flv'
1504
1505                 # Retrieve video webpage to extract further information
1506                 request = urllib2.Request(url)
1507                 try:
1508                         self.report_download_webpage(video_id)
1509                         webpage = urllib2.urlopen(request).read()
1510                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1511                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1512                         return
1513
1514                 # Extract URL, uploader and title from webpage
1515                 self.report_extraction(video_id)
1516                 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1517                 if mobj is None:
1518                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1519                         return
1520                 mediaURL = urllib.unquote(mobj.group(1))
1521
1522                 # if needed add http://www.dailymotion.com/ if relative URL
1523
1524                 video_url = mediaURL
1525
1526                 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1527                 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1528                 if mobj is None:
1529                         self._downloader.trouble(u'ERROR: unable to extract title')
1530                         return
1531                 video_title = mobj.group(1).decode('utf-8')
1532                 video_title = sanitize_title(video_title)
1533
1534                 mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
1535                 if mobj is None:
1536                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1537                         return
1538                 video_uploader = mobj.group(1)
1539
1540                 try:
1541                         # Process video information
1542                         self._downloader.process_info({
1543                                 'id':           video_id.decode('utf-8'),
1544                                 'url':          video_url.decode('utf-8'),
1545                                 'uploader':     video_uploader.decode('utf-8'),
1546                                 'upload_date':  u'NA',
1547                                 'title':        video_title,
1548                                 'stitle':       simple_title,
1549                                 'ext':          video_extension.decode('utf-8'),
1550                                 'format':       u'NA',
1551                                 'player_url':   None,
1552                         })
1553                 except UnavailableVideoError:
1554                         self._downloader.trouble(u'\nERROR: unable to download video')
1555
1556 class GoogleIE(InfoExtractor):
1557         """Information extractor for video.google.com."""
1558
1559         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1560
1561         def __init__(self, downloader=None):
1562                 InfoExtractor.__init__(self, downloader)
1563
1564         @staticmethod
1565         def suitable(url):
1566                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1567
1568         def report_download_webpage(self, video_id):
1569                 """Report webpage download."""
1570                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1571
1572         def report_extraction(self, video_id):
1573                 """Report information extraction."""
1574                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1575
1576         def _real_initialize(self):
1577                 return
1578
1579         def _real_extract(self, url):
1580                 # Extract id from URL
1581                 mobj = re.match(self._VALID_URL, url)
1582                 if mobj is None:
1583                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1584                         return
1585
1586                 # At this point we have a new video
1587                 self._downloader.increment_downloads()
1588                 video_id = mobj.group(1)
1589
1590                 video_extension = 'mp4'
1591
1592                 # Retrieve video webpage to extract further information
1593                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1594                 try:
1595                         self.report_download_webpage(video_id)
1596                         webpage = urllib2.urlopen(request).read()
1597                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1598                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1599                         return
1600
1601                 # Extract URL, uploader, and title from webpage
1602                 self.report_extraction(video_id)
1603                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1604                 if mobj is None:
1605                         video_extension = 'flv'
1606                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1607                 if mobj is None:
1608                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1609                         return
1610                 mediaURL = urllib.unquote(mobj.group(1))
1611                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1612                 mediaURL = mediaURL.replace('\\x26', '\x26')
1613
1614                 video_url = mediaURL
1615
1616                 mobj = re.search(r'<title>(.*)</title>', webpage)
1617                 if mobj is None:
1618                         self._downloader.trouble(u'ERROR: unable to extract title')
1619                         return
1620                 video_title = mobj.group(1).decode('utf-8')
1621                 video_title = sanitize_title(video_title)
1622                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1623
1624                 # Extract video description
1625                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1626                 if mobj is None:
1627                         self._downloader.trouble(u'ERROR: unable to extract video description')
1628                         return
1629                 video_description = mobj.group(1).decode('utf-8')
1630                 if not video_description:
1631                         video_description = 'No description available.'
1632
1633                 # Extract video thumbnail
1634                 if self._downloader.params.get('forcethumbnail', False):
1635                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1636                         try:
1637                                 webpage = urllib2.urlopen(request).read()
1638                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1639                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1640                                 return
1641                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1642                         if mobj is None:
1643                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1644                                 return
1645                         video_thumbnail = mobj.group(1)
1646                 else:   # we need something to pass to process_info
1647                         video_thumbnail = ''
1648
1649
1650                 try:
1651                         # Process video information
1652                         self._downloader.process_info({
1653                                 'id':           video_id.decode('utf-8'),
1654                                 'url':          video_url.decode('utf-8'),
1655                                 'uploader':     u'NA',
1656                                 'upload_date':  u'NA',
1657                                 'title':        video_title,
1658                                 'stitle':       simple_title,
1659                                 'ext':          video_extension.decode('utf-8'),
1660                                 'format':       u'NA',
1661                                 'player_url':   None,
1662                         })
1663                 except UnavailableVideoError:
1664                         self._downloader.trouble(u'\nERROR: unable to download video')
1665
1666
1667 class PhotobucketIE(InfoExtractor):
1668         """Information extractor for photobucket.com."""
1669
1670         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1671
1672         def __init__(self, downloader=None):
1673                 InfoExtractor.__init__(self, downloader)
1674
1675         @staticmethod
1676         def suitable(url):
1677                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1678
1679         def report_download_webpage(self, video_id):
1680                 """Report webpage download."""
1681                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1682
1683         def report_extraction(self, video_id):
1684                 """Report information extraction."""
1685                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1686
1687         def _real_initialize(self):
1688                 return
1689
1690         def _real_extract(self, url):
1691                 # Extract id from URL
1692                 mobj = re.match(self._VALID_URL, url)
1693                 if mobj is None:
1694                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1695                         return
1696
1697                 # At this point we have a new video
1698                 self._downloader.increment_downloads()
1699                 video_id = mobj.group(1)
1700
1701                 video_extension = 'flv'
1702
1703                 # Retrieve video webpage to extract further information
1704                 request = urllib2.Request(url)
1705                 try:
1706                         self.report_download_webpage(video_id)
1707                         webpage = urllib2.urlopen(request).read()
1708                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1709                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1710                         return
1711
1712                 # Extract URL, uploader, and title from webpage
1713                 self.report_extraction(video_id)
1714                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1715                 if mobj is None:
1716                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1717                         return
1718                 mediaURL = urllib.unquote(mobj.group(1))
1719
1720                 video_url = mediaURL
1721
1722                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1723                 if mobj is None:
1724                         self._downloader.trouble(u'ERROR: unable to extract title')
1725                         return
1726                 video_title = mobj.group(1).decode('utf-8')
1727                 video_title = sanitize_title(video_title)
1728                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1729
1730                 video_uploader = mobj.group(2).decode('utf-8')
1731
1732                 try:
1733                         # Process video information
1734                         self._downloader.process_info({
1735                                 'id':           video_id.decode('utf-8'),
1736                                 'url':          video_url.decode('utf-8'),
1737                                 'uploader':     video_uploader,
1738                                 'upload_date':  u'NA',
1739                                 'title':        video_title,
1740                                 'stitle':       simple_title,
1741                                 'ext':          video_extension.decode('utf-8'),
1742                                 'format':       u'NA',
1743                                 'player_url':   None,
1744                         })
1745                 except UnavailableVideoError:
1746                         self._downloader.trouble(u'\nERROR: unable to download video')
1747
1748
1749 class YahooIE(InfoExtractor):
1750         """Information extractor for video.yahoo.com."""
1751
1752         # _VALID_URL matches all Yahoo! Video URLs
1753         # _VPAGE_URL matches only the extractable '/watch/' URLs
1754         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1755         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1756
	def __init__(self, downloader=None):
		"""Create the extractor, optionally attaching a downloader."""
		InfoExtractor.__init__(self, downloader)
1759
1760         @staticmethod
1761         def suitable(url):
1762                 return (re.match(YahooIE._VALID_URL, url) is not None)
1763
1764         def report_download_webpage(self, video_id):
1765                 """Report webpage download."""
1766                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1767
1768         def report_extraction(self, video_id):
1769                 """Report information extraction."""
1770                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1771
	def _real_initialize(self):
		# Yahoo! Video requires no login or cookie setup before extraction.
		return
1774
1775         def _real_extract(self, url, new_video=True):
1776                 # Extract ID from URL
1777                 mobj = re.match(self._VALID_URL, url)
1778                 if mobj is None:
1779                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1780                         return
1781
1782                 # At this point we have a new video
1783                 self._downloader.increment_downloads()
1784                 video_id = mobj.group(2)
1785                 video_extension = 'flv'
1786
1787                 # Rewrite valid but non-extractable URLs as
1788                 # extractable English language /watch/ URLs
1789                 if re.match(self._VPAGE_URL, url) is None:
1790                         request = urllib2.Request(url)
1791                         try:
1792                                 webpage = urllib2.urlopen(request).read()
1793                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1794                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1795                                 return
1796
1797                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1798                         if mobj is None:
1799                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1800                                 return
1801                         yahoo_id = mobj.group(1)
1802
1803                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1804                         if mobj is None:
1805                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1806                                 return
1807                         yahoo_vid = mobj.group(1)
1808
1809                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1810                         return self._real_extract(url, new_video=False)
1811
1812                 # Retrieve video webpage to extract further information
1813                 request = urllib2.Request(url)
1814                 try:
1815                         self.report_download_webpage(video_id)
1816                         webpage = urllib2.urlopen(request).read()
1817                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1818                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1819                         return
1820
1821                 # Extract uploader and title from webpage
1822                 self.report_extraction(video_id)
1823                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1824                 if mobj is None:
1825                         self._downloader.trouble(u'ERROR: unable to extract video title')
1826                         return
1827                 video_title = mobj.group(1).decode('utf-8')
1828                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1829
1830                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1831                 if mobj is None:
1832                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1833                         return
1834                 video_uploader = mobj.group(1).decode('utf-8')
1835
1836                 # Extract video thumbnail
1837                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1838                 if mobj is None:
1839                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1840                         return
1841                 video_thumbnail = mobj.group(1).decode('utf-8')
1842
1843                 # Extract video description
1844                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1845                 if mobj is None:
1846                         self._downloader.trouble(u'ERROR: unable to extract video description')
1847                         return
1848                 video_description = mobj.group(1).decode('utf-8')
1849                 if not video_description: video_description = 'No description available.'
1850
1851                 # Extract video height and width
1852                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1853                 if mobj is None:
1854                         self._downloader.trouble(u'ERROR: unable to extract video height')
1855                         return
1856                 yv_video_height = mobj.group(1)
1857
1858                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1859                 if mobj is None:
1860                         self._downloader.trouble(u'ERROR: unable to extract video width')
1861                         return
1862                 yv_video_width = mobj.group(1)
1863
1864                 # Retrieve video playlist to extract media URL
1865                 # I'm not completely sure what all these options are, but we
1866                 # seem to need most of them, otherwise the server sends a 401.
1867                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1868                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1869                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1870                                                                   '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1871                                                                   '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1872                 try:
1873                         self.report_download_webpage(video_id)
1874                         webpage = urllib2.urlopen(request).read()
1875                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1876                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1877                         return
1878
1879                 # Extract media URL from playlist XML
1880                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1881                 if mobj is None:
1882                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1883                         return
1884                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1885                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1886
1887                 try:
1888                         # Process video information
1889                         self._downloader.process_info({
1890                                 'id':           video_id.decode('utf-8'),
1891                                 'url':          video_url,
1892                                 'uploader':     video_uploader,
1893                                 'upload_date':  u'NA',
1894                                 'title':        video_title,
1895                                 'stitle':       simple_title,
1896                                 'ext':          video_extension.decode('utf-8'),
1897                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1898                                 'description':  video_description,
1899                                 'thumbnail':    video_thumbnail,
1900                                 'description':  video_description,
1901                                 'player_url':   None,
1902                         })
1903                 except UnavailableVideoError:
1904                         self._downloader.trouble(u'\nERROR: unable to download video')
1905
1906
1907 class VimeoIE(InfoExtractor):
1908         """Information extractor for vimeo.com."""
1909
1910         # _VALID_URL matches Vimeo URLs
1911         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1912
1913         def __init__(self, downloader=None):
1914                 InfoExtractor.__init__(self, downloader)
1915
1916         @staticmethod
1917         def suitable(url):
1918                 return (re.match(VimeoIE._VALID_URL, url) is not None)
1919
1920         def report_download_webpage(self, video_id):
1921                 """Report webpage download."""
1922                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1923
1924         def report_extraction(self, video_id):
1925                 """Report information extraction."""
1926                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1927
1928         def _real_initialize(self):
1929                 return
1930
1931         def _real_extract(self, url, new_video=True):
1932                 # Extract ID from URL
1933                 mobj = re.match(self._VALID_URL, url)
1934                 if mobj is None:
1935                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1936                         return
1937
1938                 # At this point we have a new video
1939                 self._downloader.increment_downloads()
1940                 video_id = mobj.group(1)
1941
1942                 # Retrieve video webpage to extract further information
1943                 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1944                 try:
1945                         self.report_download_webpage(video_id)
1946                         webpage = urllib2.urlopen(request).read()
1947                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1948                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1949                         return
1950
1951                 # Now we begin extracting as much information as we can from what we
1952                 # retrieved. First we extract the information common to all extractors,
1953                 # and latter we extract those that are Vimeo specific.
1954                 self.report_extraction(video_id)
1955
1956                 # Extract title
1957                 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
1958                 if mobj is None:
1959                         self._downloader.trouble(u'ERROR: unable to extract video title')
1960                         return
1961                 video_title = mobj.group(1).decode('utf-8')
1962                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1963
1964                 # Extract uploader
1965                 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
1966                 if mobj is None:
1967                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1968                         return
1969                 video_uploader = mobj.group(1).decode('utf-8')
1970
1971                 # Extract video thumbnail
1972                 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
1973                 if mobj is None:
1974                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1975                         return
1976                 video_thumbnail = mobj.group(1).decode('utf-8')
1977
1978                 # # Extract video description
1979                 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
1980                 # if mobj is None:
1981                 #       self._downloader.trouble(u'ERROR: unable to extract video description')
1982                 #       return
1983                 # video_description = mobj.group(1).decode('utf-8')
1984                 # if not video_description: video_description = 'No description available.'
1985                 video_description = 'Foo.'
1986
1987                 # Vimeo specific: extract request signature
1988                 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
1989                 if mobj is None:
1990                         self._downloader.trouble(u'ERROR: unable to extract request signature')
1991                         return
1992                 sig = mobj.group(1).decode('utf-8')
1993
1994                 # Vimeo specific: Extract request signature expiration
1995                 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
1996                 if mobj is None:
1997                         self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
1998                         return
1999                 sig_exp = mobj.group(1).decode('utf-8')
2000
2001                 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2002
2003                 try:
2004                         # Process video information
2005                         self._downloader.process_info({
2006                                 'id':           video_id.decode('utf-8'),
2007                                 'url':          video_url,
2008                                 'uploader':     video_uploader,
2009                                 'upload_date':  u'NA',
2010                                 'title':        video_title,
2011                                 'stitle':       simple_title,
2012                                 'ext':          u'mp4',
2013                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2014                                 'description':  video_description,
2015                                 'thumbnail':    video_thumbnail,
2016                                 'description':  video_description,
2017                                 'player_url':   None,
2018                         })
2019                 except UnavailableVideoError:
2020                         self._downloader.trouble(u'ERROR: unable to download video')
2021
2022
2023 class GenericIE(InfoExtractor):
2024         """Generic last-resort information extractor."""
2025
2026         def __init__(self, downloader=None):
2027                 InfoExtractor.__init__(self, downloader)
2028
2029         @staticmethod
2030         def suitable(url):
2031                 return True
2032
2033         def report_download_webpage(self, video_id):
2034                 """Report webpage download."""
2035                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2036                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2037
2038         def report_extraction(self, video_id):
2039                 """Report information extraction."""
2040                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2041
2042         def _real_initialize(self):
2043                 return
2044
2045         def _real_extract(self, url):
2046                 # At this point we have a new video
2047                 self._downloader.increment_downloads()
2048
2049                 video_id = url.split('/')[-1]
2050                 request = urllib2.Request(url)
2051                 try:
2052                         self.report_download_webpage(video_id)
2053                         webpage = urllib2.urlopen(request).read()
2054                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2055                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2056                         return
2057                 except ValueError, err:
2058                         # since this is the last-resort InfoExtractor, if
2059                         # this error is thrown, it'll be thrown here
2060                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2061                         return
2062
2063                 self.report_extraction(video_id)
2064                 # Start with something easy: JW Player in SWFObject
2065                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2066                 if mobj is None:
2067                         # Broaden the search a little bit
2068                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2069                 if mobj is None:
2070                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2071                         return
2072
2073                 # It's possible that one of the regexes
2074                 # matched, but returned an empty group:
2075                 if mobj.group(1) is None:
2076                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2077                         return
2078
2079                 video_url = urllib.unquote(mobj.group(1))
2080                 video_id  = os.path.basename(video_url)
2081
2082                 # here's a fun little line of code for you:
2083                 video_extension = os.path.splitext(video_id)[1][1:]
2084                 video_id        = os.path.splitext(video_id)[0]
2085
2086                 # it's tempting to parse this further, but you would
2087                 # have to take into account all the variations like
2088                 #   Video Title - Site Name
2089                 #   Site Name | Video Title
2090                 #   Video Title - Tagline | Site Name
2091                 # and so on and so forth; it's just not practical
2092                 mobj = re.search(r'<title>(.*)</title>', webpage)
2093                 if mobj is None:
2094                         self._downloader.trouble(u'ERROR: unable to extract title')
2095                         return
2096                 video_title = mobj.group(1).decode('utf-8')
2097                 video_title = sanitize_title(video_title)
2098                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2099
2100                 # video uploader is domain name
2101                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2102                 if mobj is None:
2103                         self._downloader.trouble(u'ERROR: unable to extract title')
2104                         return
2105                 video_uploader = mobj.group(1).decode('utf-8')
2106
2107                 try:
2108                         # Process video information
2109                         self._downloader.process_info({
2110                                 'id':           video_id.decode('utf-8'),
2111                                 'url':          video_url.decode('utf-8'),
2112                                 'uploader':     video_uploader,
2113                                 'upload_date':  u'NA',
2114                                 'title':        video_title,
2115                                 'stitle':       simple_title,
2116                                 'ext':          video_extension.decode('utf-8'),
2117                                 'format':       u'NA',
2118                                 'player_url':   None,
2119                         })
2120                 except UnavailableVideoError, err:
2121                         self._downloader.trouble(u'\nERROR: unable to download video')
2122
2123
2124 class YoutubeSearchIE(InfoExtractor):
2125         """Information Extractor for YouTube search queries."""
2126         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
2127         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2128         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2129         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2130         _youtube_ie = None
2131         _max_youtube_results = 1000
2132
2133         def __init__(self, youtube_ie, downloader=None):
2134                 InfoExtractor.__init__(self, downloader)
2135                 self._youtube_ie = youtube_ie
2136
2137         @staticmethod
2138         def suitable(url):
2139                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
2140
2141         def report_download_page(self, query, pagenum):
2142                 """Report attempt to download playlist page with given number."""
2143                 query = query.decode(preferredencoding())
2144                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2145
2146         def _real_initialize(self):
2147                 self._youtube_ie.initialize()
2148
2149         def _real_extract(self, query):
2150                 mobj = re.match(self._VALID_QUERY, query)
2151                 if mobj is None:
2152                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2153                         return
2154
2155                 prefix, query = query.split(':')
2156                 prefix = prefix[8:]
2157                 query  = query.encode('utf-8')
2158                 if prefix == '':
2159                         self._download_n_results(query, 1)
2160                         return
2161                 elif prefix == 'all':
2162                         self._download_n_results(query, self._max_youtube_results)
2163                         return
2164                 else:
2165                         try:
2166                                 n = long(prefix)
2167                                 if n <= 0:
2168                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2169                                         return
2170                                 elif n > self._max_youtube_results:
2171                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
2172                                         n = self._max_youtube_results
2173                                 self._download_n_results(query, n)
2174                                 return
2175                         except ValueError: # parsing prefix as integer fails
2176                                 self._download_n_results(query, 1)
2177                                 return
2178
2179         def _download_n_results(self, query, n):
2180                 """Downloads a specified number of results for a query"""
2181
2182                 video_ids = []
2183                 already_seen = set()
2184                 pagenum = 1
2185
2186                 while True:
2187                         self.report_download_page(query, pagenum)
2188                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2189                         request = urllib2.Request(result_url)
2190                         try:
2191                                 page = urllib2.urlopen(request).read()
2192                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2193                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2194                                 return
2195
2196                         # Extract video identifiers
2197                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2198                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2199                                 if video_id not in already_seen:
2200                                         video_ids.append(video_id)
2201                                         already_seen.add(video_id)
2202                                         if len(video_ids) == n:
2203                                                 # Specified n videos reached
2204                                                 for id in video_ids:
2205                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2206                                                 return
2207
2208                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2209                                 for id in video_ids:
2210                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2211                                 return
2212
2213                         pagenum = pagenum + 1
2214
2215 class GoogleSearchIE(InfoExtractor):
2216         """Information Extractor for Google Video search queries."""
2217         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
2218         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2219         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2220         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2221         _google_ie = None
2222         _max_google_results = 1000
2223
2224         def __init__(self, google_ie, downloader=None):
2225                 InfoExtractor.__init__(self, downloader)
2226                 self._google_ie = google_ie
2227
2228         @staticmethod
2229         def suitable(url):
2230                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
2231
2232         def report_download_page(self, query, pagenum):
2233                 """Report attempt to download playlist page with given number."""
2234                 query = query.decode(preferredencoding())
2235                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2236
2237         def _real_initialize(self):
2238                 self._google_ie.initialize()
2239
2240         def _real_extract(self, query):
2241                 mobj = re.match(self._VALID_QUERY, query)
2242                 if mobj is None:
2243                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2244                         return
2245
2246                 prefix, query = query.split(':')
2247                 prefix = prefix[8:]
2248                 query  = query.encode('utf-8')
2249                 if prefix == '':
2250                         self._download_n_results(query, 1)
2251                         return
2252                 elif prefix == 'all':
2253                         self._download_n_results(query, self._max_google_results)
2254                         return
2255                 else:
2256                         try:
2257                                 n = long(prefix)
2258                                 if n <= 0:
2259                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2260                                         return
2261                                 elif n > self._max_google_results:
2262                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
2263                                         n = self._max_google_results
2264                                 self._download_n_results(query, n)
2265                                 return
2266                         except ValueError: # parsing prefix as integer fails
2267                                 self._download_n_results(query, 1)
2268                                 return
2269
2270         def _download_n_results(self, query, n):
2271                 """Downloads a specified number of results for a query"""
2272
2273                 video_ids = []
2274                 already_seen = set()
2275                 pagenum = 1
2276
2277                 while True:
2278                         self.report_download_page(query, pagenum)
2279                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2280                         request = urllib2.Request(result_url)
2281                         try:
2282                                 page = urllib2.urlopen(request).read()
2283                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2284                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2285                                 return
2286
2287                         # Extract video identifiers
2288                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2289                                 video_id = mobj.group(1)
2290                                 if video_id not in already_seen:
2291                                         video_ids.append(video_id)
2292                                         already_seen.add(video_id)
2293                                         if len(video_ids) == n:
2294                                                 # Specified n videos reached
2295                                                 for id in video_ids:
2296                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2297                                                 return
2298
2299                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2300                                 for id in video_ids:
2301                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2302                                 return
2303
2304                         pagenum = pagenum + 1
2305
2306 class YahooSearchIE(InfoExtractor):
2307         """Information Extractor for Yahoo! Video search queries."""
2308         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
2309         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2310         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2311         _MORE_PAGES_INDICATOR = r'\s*Next'
2312         _yahoo_ie = None
2313         _max_yahoo_results = 1000
2314
2315         def __init__(self, yahoo_ie, downloader=None):
2316                 InfoExtractor.__init__(self, downloader)
2317                 self._yahoo_ie = yahoo_ie
2318
2319         @staticmethod
2320         def suitable(url):
2321                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2322
2323         def report_download_page(self, query, pagenum):
2324                 """Report attempt to download playlist page with given number."""
2325                 query = query.decode(preferredencoding())
2326                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2327
2328         def _real_initialize(self):
2329                 self._yahoo_ie.initialize()
2330
2331         def _real_extract(self, query):
2332                 mobj = re.match(self._VALID_QUERY, query)
2333                 if mobj is None:
2334                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2335                         return
2336
2337                 prefix, query = query.split(':')
2338                 prefix = prefix[8:]
2339                 query  = query.encode('utf-8')
2340                 if prefix == '':
2341                         self._download_n_results(query, 1)
2342                         return
2343                 elif prefix == 'all':
2344                         self._download_n_results(query, self._max_yahoo_results)
2345                         return
2346                 else:
2347                         try:
2348                                 n = long(prefix)
2349                                 if n <= 0:
2350                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2351                                         return
2352                                 elif n > self._max_yahoo_results:
2353                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
2354                                         n = self._max_yahoo_results
2355                                 self._download_n_results(query, n)
2356                                 return
2357                         except ValueError: # parsing prefix as integer fails
2358                                 self._download_n_results(query, 1)
2359                                 return
2360
2361         def _download_n_results(self, query, n):
2362                 """Downloads a specified number of results for a query"""
2363
2364                 video_ids = []
2365                 already_seen = set()
2366                 pagenum = 1
2367
2368                 while True:
2369                         self.report_download_page(query, pagenum)
2370                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2371                         request = urllib2.Request(result_url)
2372                         try:
2373                                 page = urllib2.urlopen(request).read()
2374                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2375                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2376                                 return
2377
2378                         # Extract video identifiers
2379                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2380                                 video_id = mobj.group(1)
2381                                 if video_id not in already_seen:
2382                                         video_ids.append(video_id)
2383                                         already_seen.add(video_id)
2384                                         if len(video_ids) == n:
2385                                                 # Specified n videos reached
2386                                                 for id in video_ids:
2387                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2388                                                 return
2389
2390                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2391                                 for id in video_ids:
2392                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2393                                 return
2394
2395                         pagenum = pagenum + 1
2396
2397 class YoutubePlaylistIE(InfoExtractor):
2398         """Information Extractor for YouTube playlists."""
2399
2400         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist)\?.*?(p|a)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2401         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2402         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2403         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2404         _youtube_ie = None
2405
2406         def __init__(self, youtube_ie, downloader=None):
2407                 InfoExtractor.__init__(self, downloader)
2408                 self._youtube_ie = youtube_ie
2409
2410         @staticmethod
2411         def suitable(url):
2412                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2413
2414         def report_download_page(self, playlist_id, pagenum):
2415                 """Report attempt to download playlist page with given number."""
2416                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2417
2418         def _real_initialize(self):
2419                 self._youtube_ie.initialize()
2420
2421         def _real_extract(self, url):
2422                 # Extract playlist id
2423                 mobj = re.match(self._VALID_URL, url)
2424                 if mobj is None:
2425                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2426                         return
2427
2428                 # Single video case
2429                 if mobj.group(3) is not None:
2430                         self._youtube_ie.extract(mobj.group(3))
2431                         return
2432
2433                 # Download playlist pages
2434                 # prefix is 'p' as default for playlists but there are other types that need extra care
2435                 playlist_prefix = mobj.group(1)
2436                 if playlist_prefix == 'a':
2437                         playlist_access = 'artist'
2438                 else:
2439                         playlist_prefix = 'p'
2440                         playlist_access = 'view_play_list'
2441                 playlist_id = mobj.group(2)
2442                 video_ids = []
2443                 pagenum = 1
2444
2445                 while True:
2446                         self.report_download_page(playlist_id, pagenum)
2447                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2448                         try:
2449                                 page = urllib2.urlopen(request).read()
2450                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2451                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2452                                 return
2453
2454                         # Extract video identifiers
2455                         ids_in_page = []
2456                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2457                                 if mobj.group(1) not in ids_in_page:
2458                                         ids_in_page.append(mobj.group(1))
2459                         video_ids.extend(ids_in_page)
2460
2461                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2462                                 break
2463                         pagenum = pagenum + 1
2464
2465                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2466                 playlistend = self._downloader.params.get('playlistend', -1)
2467                 video_ids = video_ids[playliststart:playlistend]
2468
2469                 for id in video_ids:
2470                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2471                 return
2472
2473 class YoutubeUserIE(InfoExtractor):
2474         """Information Extractor for YouTube users."""
2475
2476         _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2477         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2478         _GDATA_PAGE_SIZE = 50
2479         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2480         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2481         _youtube_ie = None
2482
2483         def __init__(self, youtube_ie, downloader=None):
2484                 InfoExtractor.__init__(self, downloader)
2485                 self._youtube_ie = youtube_ie
2486
2487         @staticmethod
2488         def suitable(url):
2489                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2490
2491         def report_download_page(self, username, start_index):
2492                 """Report attempt to download user page."""
2493                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2494                                            (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2495
2496         def _real_initialize(self):
2497                 self._youtube_ie.initialize()
2498
2499         def _real_extract(self, url):
2500                 # Extract username
2501                 mobj = re.match(self._VALID_URL, url)
2502                 if mobj is None:
2503                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2504                         return
2505
2506                 username = mobj.group(1)
2507
2508                 # Download video ids using YouTube Data API. Result size per
2509                 # query is limited (currently to 50 videos) so we need to query
2510                 # page by page until there are no video ids - it means we got
2511                 # all of them.
2512
2513                 video_ids = []
2514                 pagenum = 0
2515
2516                 while True:
2517                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2518                         self.report_download_page(username, start_index)
2519
2520                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2521
2522                         try:
2523                                 page = urllib2.urlopen(request).read()
2524                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2525                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2526                                 return
2527
2528                         # Extract video identifiers
2529                         ids_in_page = []
2530
2531                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2532                                 if mobj.group(1) not in ids_in_page:
2533                                         ids_in_page.append(mobj.group(1))
2534
2535                         video_ids.extend(ids_in_page)
2536
2537                         # A little optimization - if current page is not
2538                         # "full", ie. does not contain PAGE_SIZE video ids then
2539                         # we can assume that this page is the last one - there
2540                         # are no more ids on further pages - no need to query
2541                         # again.
2542
2543                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2544                                 break
2545
2546                         pagenum += 1
2547
2548                 all_ids_count = len(video_ids)
2549                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2550                 playlistend = self._downloader.params.get('playlistend', -1)
2551
2552                 if playlistend == -1:
2553                         video_ids = video_ids[playliststart:]
2554                 else:
2555                         video_ids = video_ids[playliststart:playlistend]
2556
2557                 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2558                                                                   (username, all_ids_count, len(video_ids)))
2559
2560                 for video_id in video_ids:
2561                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2562
2563
2564 class DepositFilesIE(InfoExtractor):
2565         """Information extractor for depositfiles.com"""
2566
2567         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2568
2569         def __init__(self, downloader=None):
2570                 InfoExtractor.__init__(self, downloader)
2571
2572         @staticmethod
2573         def suitable(url):
2574                 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2575
2576         def report_download_webpage(self, file_id):
2577                 """Report webpage download."""
2578                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2579
2580         def report_extraction(self, file_id):
2581                 """Report information extraction."""
2582                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2583
2584         def _real_initialize(self):
2585                 return
2586
2587         def _real_extract(self, url):
2588                 # At this point we have a new file
2589                 self._downloader.increment_downloads()
2590
2591                 file_id = url.split('/')[-1]
2592                 # Rebuild url in english locale
2593                 url = 'http://depositfiles.com/en/files/' + file_id
2594
2595                 # Retrieve file webpage with 'Free download' button pressed
2596                 free_download_indication = { 'gateway_result' : '1' }
2597                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2598                 try:
2599                         self.report_download_webpage(file_id)
2600                         webpage = urllib2.urlopen(request).read()
2601                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2602                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2603                         return
2604
2605                 # Search for the real file URL
2606                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2607                 if (mobj is None) or (mobj.group(1) is None):
2608                         # Try to figure out reason of the error.
2609                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2610                         if (mobj is not None) and (mobj.group(1) is not None):
2611                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2612                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2613                         else:
2614                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2615                         return
2616
2617                 file_url = mobj.group(1)
2618                 file_extension = os.path.splitext(file_url)[1][1:]
2619
2620                 # Search for file title
2621                 mobj = re.search(r'<b title="(.*?)">', webpage)
2622                 if mobj is None:
2623                         self._downloader.trouble(u'ERROR: unable to extract title')
2624                         return
2625                 file_title = mobj.group(1).decode('utf-8')
2626
2627                 try:
2628                         # Process file information
2629                         self._downloader.process_info({
2630                                 'id':           file_id.decode('utf-8'),
2631                                 'url':          file_url.decode('utf-8'),
2632                                 'uploader':     u'NA',
2633                                 'upload_date':  u'NA',
2634                                 'title':        file_title,
2635                                 'stitle':       file_title,
2636                                 'ext':          file_extension.decode('utf-8'),
2637                                 'format':       u'NA',
2638                                 'player_url':   None,
2639                         })
2640                 except UnavailableVideoError, err:
2641                         self._downloader.trouble(u'ERROR: unable to download file')
2642
2643 class FacebookIE(InfoExtractor):
2644         """Information Extractor for Facebook"""
2645
2646         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2647         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2648         _NETRC_MACHINE = 'facebook'
2649         _available_formats = ['highqual', 'lowqual']
2650         _video_extensions = {
2651                 'highqual': 'mp4',
2652                 'lowqual': 'mp4',
2653         }
2654
2655         def __init__(self, downloader=None):
2656                 InfoExtractor.__init__(self, downloader)
2657
2658         @staticmethod
2659         def suitable(url):
2660                 return (re.match(FacebookIE._VALID_URL, url) is not None)
2661
2662         def _reporter(self, message):
2663                 """Add header and report message."""
2664                 self._downloader.to_screen(u'[facebook] %s' % message)
2665
2666         def report_login(self):
2667                 """Report attempt to log in."""
2668                 self._reporter(u'Logging in')
2669
2670         def report_video_webpage_download(self, video_id):
2671                 """Report attempt to download video webpage."""
2672                 self._reporter(u'%s: Downloading video webpage' % video_id)
2673
2674         def report_information_extraction(self, video_id):
2675                 """Report attempt to extract video information."""
2676                 self._reporter(u'%s: Extracting video information' % video_id)
2677
2678         def _parse_page(self, video_webpage):
2679                 """Extract video information from page"""
2680                 # General data
2681                 data = {'title': r'class="video_title datawrap">(.*?)</',
2682                         'description': r'<div class="datawrap">(.*?)</div>',
2683                         'owner': r'\("video_owner_name", "(.*?)"\)',
2684                         'upload_date': r'data-date="(.*?)"',
2685                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2686                         }
2687                 video_info = {}
2688                 for piece in data.keys():
2689                         mobj = re.search(data[piece], video_webpage)
2690                         if mobj is not None:
2691                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2692
2693                 # Video urls
2694                 video_urls = {}
2695                 for fmt in self._available_formats:
2696                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2697                         if mobj is not None:
2698                                 # URL is in a Javascript segment inside an escaped Unicode format within
2699                                 # the generally utf-8 page
2700                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2701                 video_info['video_urls'] = video_urls
2702
2703                 return video_info
2704
2705         def _real_initialize(self):
2706                 if self._downloader is None:
2707                         return
2708
2709                 useremail = None
2710                 password = None
2711                 downloader_params = self._downloader.params
2712
2713                 # Attempt to use provided username and password or .netrc data
2714                 if downloader_params.get('username', None) is not None:
2715                         useremail = downloader_params['username']
2716                         password = downloader_params['password']
2717                 elif downloader_params.get('usenetrc', False):
2718                         try:
2719                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2720                                 if info is not None:
2721                                         useremail = info[0]
2722                                         password = info[2]
2723                                 else:
2724                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2725                         except (IOError, netrc.NetrcParseError), err:
2726                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2727                                 return
2728
2729                 if useremail is None:
2730                         return
2731
2732                 # Log in
2733                 login_form = {
2734                         'email': useremail,
2735                         'pass': password,
2736                         'login': 'Log+In'
2737                         }
2738                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2739                 try:
2740                         self.report_login()
2741                         login_results = urllib2.urlopen(request).read()
2742                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2743                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2744                                 return
2745                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2746                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2747                         return
2748
2749         def _real_extract(self, url):
2750                 mobj = re.match(self._VALID_URL, url)
2751                 if mobj is None:
2752                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2753                         return
2754                 video_id = mobj.group('ID')
2755
2756                 # Get video webpage
2757                 self.report_video_webpage_download(video_id)
2758                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2759                 try:
2760                         page = urllib2.urlopen(request)
2761                         video_webpage = page.read()
2762                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2763                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2764                         return
2765
2766                 # Start extracting information
2767                 self.report_information_extraction(video_id)
2768
2769                 # Extract information
2770                 video_info = self._parse_page(video_webpage)
2771
2772                 # uploader
2773                 if 'owner' not in video_info:
2774                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2775                         return
2776                 video_uploader = video_info['owner']
2777
2778                 # title
2779                 if 'title' not in video_info:
2780                         self._downloader.trouble(u'ERROR: unable to extract video title')
2781                         return
2782                 video_title = video_info['title']
2783                 video_title = video_title.decode('utf-8')
2784                 video_title = sanitize_title(video_title)
2785
2786                 # simplified title
2787                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2788                 simple_title = simple_title.strip(ur'_')
2789
2790                 # thumbnail image
2791                 if 'thumbnail' not in video_info:
2792                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2793                         video_thumbnail = ''
2794                 else:
2795                         video_thumbnail = video_info['thumbnail']
2796
2797                 # upload date
2798                 upload_date = u'NA'
2799                 if 'upload_date' in video_info:
2800                         upload_time = video_info['upload_date']
2801                         timetuple = email.utils.parsedate_tz(upload_time)
2802                         if timetuple is not None:
2803                                 try:
2804                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2805                                 except:
2806                                         pass
2807
2808                 # description
2809                 video_description = video_info.get('description', 'No description available.')
2810
2811                 url_map = video_info['video_urls']
2812                 if len(url_map.keys()) > 0:
2813                         # Decide which formats to download
2814                         req_format = self._downloader.params.get('format', None)
2815                         format_limit = self._downloader.params.get('format_limit', None)
2816
2817                         if format_limit is not None and format_limit in self._available_formats:
2818                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2819                         else:
2820                                 format_list = self._available_formats
2821                         existing_formats = [x for x in format_list if x in url_map]
2822                         if len(existing_formats) == 0:
2823                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2824                                 return
2825                         if req_format is None:
2826                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2827                         elif req_format == '-1':
2828                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2829                         else:
2830                                 # Specific format
2831                                 if req_format not in url_map:
2832                                         self._downloader.trouble(u'ERROR: requested format not available')
2833                                         return
2834                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2835
2836                 for format_param, video_real_url in video_url_list:
2837
2838                         # At this point we have a new video
2839                         self._downloader.increment_downloads()
2840
2841                         # Extension
2842                         video_extension = self._video_extensions.get(format_param, 'mp4')
2843
2844                         try:
2845                                 # Process video information
2846                                 self._downloader.process_info({
2847                                         'id':           video_id.decode('utf-8'),
2848                                         'url':          video_real_url.decode('utf-8'),
2849                                         'uploader':     video_uploader.decode('utf-8'),
2850                                         'upload_date':  upload_date,
2851                                         'title':        video_title,
2852                                         'stitle':       simple_title,
2853                                         'ext':          video_extension.decode('utf-8'),
2854                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2855                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2856                                         'description':  video_description.decode('utf-8'),
2857                                         'player_url':   None,
2858                                 })
2859                         except UnavailableVideoError, err:
2860                                 self._downloader.trouble(u'\nERROR: unable to download video')
2861
2862 class BlipTVIE(InfoExtractor):
2863         """Information extractor for blip.tv"""
2864
2865         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2866         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2867
2868         @staticmethod
2869         def suitable(url):
2870                 return (re.match(BlipTVIE._VALID_URL, url) is not None)
2871
2872         def report_extraction(self, file_id):
2873                 """Report information extraction."""
2874                 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2875
2876         def _simplify_title(self, title):
2877                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2878                 res = res.strip(ur'_')
2879                 return res
2880
2881         def _real_extract(self, url):
2882                 mobj = re.match(self._VALID_URL, url)
2883                 if mobj is None:
2884                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2885                         return
2886
2887                 if '?' in url:
2888                         cchar = '&'
2889                 else:
2890                         cchar = '?'
2891                 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2892                 request = urllib2.Request(json_url)
2893                 self.report_extraction(mobj.group(1))
2894                 try:
2895                         json_code = urllib2.urlopen(request).read()
2896                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2897                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2898                         return
2899                 try:
2900                         json_data = json.loads(json_code)
2901                         if 'Post' in json_data:
2902                                 data = json_data['Post']
2903                         else:
2904                                 data = json_data
2905
2906                         upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2907                         video_url = data['media']['url']
2908                         umobj = re.match(self._URL_EXT, video_url)
2909                         if umobj is None:
2910                                 raise ValueError('Can not determine filename extension')
2911                         ext = umobj.group(1)
2912
2913                         self._downloader.increment_downloads()
2914
2915                         info = {
2916                                 'id': data['item_id'],
2917                                 'url': video_url,
2918                                 'uploader': data['display_name'],
2919                                 'upload_date': upload_date,
2920                                 'title': data['title'],
2921                                 'stitle': self._simplify_title(data['title']),
2922                                 'ext': ext,
2923                                 'format': data['media']['mimeType'],
2924                                 'thumbnail': data['thumbnailUrl'],
2925                                 'description': data['description'],
2926                                 'player_url': data['embedUrl']
2927                         }
2928                 except (ValueError,KeyError), err:
2929                         self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2930                         return
2931
2932                 try:
2933                         self._downloader.process_info(info)
2934                 except UnavailableVideoError, err:
2935                         self._downloader.trouble(u'\nERROR: unable to download video')
2936
2937
class PostProcessor(object):
	"""Base class for post processors.

	A PostProcessor is registered on a downloader through its
	add_post_processor() method. After each successful download the
	downloader walks its chain of post processors, calling run() on
	each: the first one receives the download's info dictionary, and
	every later one receives the dictionary returned by its
	predecessor. A None return value stops the chain.

	Like InfoExtractor objects, post processors keep a reference to
	the downloader they are registered with ("mutual registration").
	"""

	# Downloader this post processor is attached to (None until set).
	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor.

		"information" is an InfoExtractor-style dictionary with one
		extra key, "filepath", naming the downloaded file.

		Return None to stop the postprocessing chain, or an info
		dictionary (possibly this one, with some fields changed) to
		pass along to the next post processor. Implementations may
		also raise PostProcessingError, which the calling downloader
		handles.
		"""
		return information # by default, do nothing
2983
class FFmpegExtractAudioPP(PostProcessor):
	"""Post-processor that converts a downloaded video file into an
	audio-only file using the external ffmpeg and ffprobe programs.

	preferredcodec may be 'best' (keep the existing stream losslessly
	when it is aac or mp3, otherwise transcode to mp3), 'aac' or 'mp3'.
	"""

	def __init__(self, downloader=None, preferredcodec=None):
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec

	@staticmethod
	def get_audio_codec(path):
		"""Return the name of the audio codec in the file at path, as
		reported by ffprobe, or None if it cannot be determined."""
		try:
			cmd = ['ffprobe', '-show_streams', '--', path]
			# Open devnull with open() (not the py2-only file() builtin)
			# and make sure the handle is closed again — the original
			# leaked one file descriptor per invocation.
			devnull = open(os.path.devnull, 'w')
			try:
				handle = subprocess.Popen(cmd, stderr=devnull, stdout=subprocess.PIPE)
				output = handle.communicate()[0]
				if handle.wait() != 0:
					return None
			finally:
				devnull.close()
		except (IOError, OSError):
			return None
		audio_codec = None
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				# codec_type follows codec_name in ffprobe's stream dump,
				# so the last codec_name seen belongs to this audio stream.
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Re-encode path into out_path with the given audio codec and
		extra ffmpeg options; return True on success."""
		try:
			cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
			devnull = open(os.path.devnull, 'w')
			try:
				ret = subprocess.call(cmd, stdout=devnull, stderr=subprocess.STDOUT)
			finally:
				devnull.close()
			return (ret == 0)
		except (IOError, OSError):
			return False

	def run(self, information):
		"""Convert information['filepath'] to the preferred audio format.

		On success, deletes the original file, updates 'filepath' in the
		dictionary and returns it.  On any failure a warning is printed
		and None is returned, stopping the postprocessing chain.
		"""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
			if filecodec == 'aac' or filecodec == 'mp3':
				# Lossless if possible: just copy the existing stream.
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					# Raw AAC streams need ADTS framing to be playable.
					more_opts = ['-f', 'adts']
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = ['-ab', '128k']
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = ['-ab', '128k']
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']

		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		status = self.run_ffmpeg(path, new_path, acodec, more_opts)

		if not status:
			self._downloader.to_stderr(u'WARNING: error running ffmpeg')
			return None

		# Remove the original video only after a successful conversion.
		try:
			os.remove(path)
		except (IOError, OSError):
			self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
			return None

		information['filepath'] = new_path
		return information
3065
3066
def updateSelf(downloader, filename):
	"""Overwrite the program file at filename with the latest stable
	version fetched from the project repository.

	The downloader argument is only used for screen output.  Exits the
	process with an error message on any failure.
	"""
	if not os.access(filename, os.W_OK):
		sys.exit('ERROR: no write permissions on %s' % filename)

	downloader.to_screen('Updating to latest stable version...')

	try:
		# LATEST_VERSION names the tag whose youtube-dl file we fetch.
		latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
		latest_version = urllib.urlopen(latest_url).read().strip()
		prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
		newcontent = urllib.urlopen(prog_url).read()
	except (IOError, OSError):
		sys.exit('ERROR: unable to download latest version')

	try:
		stream = open(filename, 'wb')
		# Close the file even when write() fails — the original leaked
		# the handle on a failed write.
		try:
			stream.write(newcontent)
		finally:
			stream.close()
	except (IOError, OSError):
		sys.exit('ERROR: unable to overwrite current version')

	downloader.to_screen('Updated to version %s' % latest_version)
3091
def parseOpts():
	"""Build the optparse command-line parser and parse sys.argv.

	Returns a (parser, opts, args) tuple so that the caller can both
	read the parsed options and report errors via parser.error().
	"""
	# Deferred imports
	# NOTE(review): getpass is imported here but never used inside this
	# function; the password prompt lives in main().
	import getpass
	import optparse

	def _format_option_string(option):
		''' ('-o', '--option') -> -o, --format METAVAR'''

		opts = []

		if option._short_opts: opts.append(option._short_opts[0])
		if option._long_opts: opts.append(option._long_opts[0])
		if len(opts) > 1: opts.insert(1, ', ')

		if option.takes_value(): opts.append(' %s' % option.metavar)

		return "".join(opts)

	def _find_term_columns():
		# Honour an explicit COLUMNS environment variable first.
		columns = os.environ.get('COLUMNS', None)
		if columns:
			return int(columns)

		try:
			# Fall back to asking the terminal via stty; its output is
			# "rows cols", so take the second field.
			sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			out,err = sp.communicate()
			return int(out.split()[1])
		except:
			# Best-effort: no terminal, stty missing, or unparsable output.
			pass
		return None

	max_width = 80
	max_help_position = 80

	# No need to wrap help messages if we're on a wide console
	columns = _find_term_columns()
	if columns: max_width = columns

	fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
	# Replace optparse's default "-o OPT, --opt=OPT" rendering with the
	# terser "-o, --opt OPT" form produced by _format_option_string.
	fmt.format_option_strings = _format_option_string

	kw = {
		'version'   : __version__,
		'formatter' : fmt,
		'usage' : '%prog [options] url...',
		'conflict_handler' : 'resolve',
	}

	parser = optparse.OptionParser(**kw)

	# option groups
	general        = optparse.OptionGroup(parser, 'General Options')
	authentication = optparse.OptionGroup(parser, 'Authentication Options')
	video_format   = optparse.OptionGroup(parser, 'Video Format Options')
	postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
	filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
	verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

	general.add_option('-h', '--help',
			action='help', help='print this help text and exit')
	general.add_option('-v', '--version',
			action='version', help='print program version and exit')
	general.add_option('-U', '--update',
			action='store_true', dest='update_self', help='update this program to latest stable version')
	general.add_option('-i', '--ignore-errors',
			action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
	general.add_option('-r', '--rate-limit',
			dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
	general.add_option('-R', '--retries',
			dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
	general.add_option('--playlist-start',
			dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
	general.add_option('--playlist-end',
			dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
	general.add_option('--dump-user-agent',
			action='store_true', dest='dump_user_agent',
			help='display the current browser identification', default=False)

	authentication.add_option('-u', '--username',
			dest='username', metavar='USERNAME', help='account username')
	authentication.add_option('-p', '--password',
			dest='password', metavar='PASSWORD', help='account password')
	authentication.add_option('-n', '--netrc',
			action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)


	video_format.add_option('-f', '--format',
			action='store', dest='format', metavar='FORMAT', help='video format code')
	video_format.add_option('--all-formats',
			action='store_const', dest='format', help='download all available video formats', const='-1')
	video_format.add_option('--max-quality',
			action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')


	verbosity.add_option('-q', '--quiet',
			action='store_true', dest='quiet', help='activates quiet mode', default=False)
	verbosity.add_option('-s', '--simulate',
			action='store_true', dest='simulate', help='do not download video', default=False)
	verbosity.add_option('-g', '--get-url',
			action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
	verbosity.add_option('-e', '--get-title',
			action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
	verbosity.add_option('--get-thumbnail',
			action='store_true', dest='getthumbnail',
			help='simulate, quiet but print thumbnail URL', default=False)
	verbosity.add_option('--get-description',
			action='store_true', dest='getdescription',
			help='simulate, quiet but print video description', default=False)
	verbosity.add_option('--get-filename',
			action='store_true', dest='getfilename',
			help='simulate, quiet but print output filename', default=False)
	verbosity.add_option('--no-progress',
			action='store_true', dest='noprogress', help='do not print progress bar', default=False)
	verbosity.add_option('--console-title',
			action='store_true', dest='consoletitle',
			help='display progress in console titlebar', default=False)


	filesystem.add_option('-t', '--title',
			action='store_true', dest='usetitle', help='use title in file name', default=False)
	filesystem.add_option('-l', '--literal',
			action='store_true', dest='useliteral', help='use literal title in file name', default=False)
	filesystem.add_option('-A', '--auto-number',
			action='store_true', dest='autonumber',
			help='number downloaded files starting from 00000', default=False)
	filesystem.add_option('-o', '--output',
			dest='outtmpl', metavar='TEMPLATE', help='output filename template')
	filesystem.add_option('-a', '--batch-file',
			dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
	filesystem.add_option('-w', '--no-overwrites',
			action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
	filesystem.add_option('-c', '--continue',
			action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
	filesystem.add_option('--cookies',
			dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
	filesystem.add_option('--no-part',
			action='store_true', dest='nopart', help='do not use .part files', default=False)
	filesystem.add_option('--no-mtime',
			action='store_false', dest='updatetime',
			help='do not use the Last-modified header to set the file modification time', default=True)
	filesystem.add_option('--write-description',
			action='store_true', dest='writedescription',
			help='write video description to a .description file', default=False)
	filesystem.add_option('--write-info-json',
			action='store_true', dest='writeinfojson',
			help='write video metadata to a .info.json file', default=False)


	postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
			help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
	postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
			help='"best", "aac" or "mp3"; best by default')


	# Registration order determines the order of the groups in --help output.
	parser.add_option_group(general)
	parser.add_option_group(filesystem)
	parser.add_option_group(verbosity)
	parser.add_option_group(video_format)
	parser.add_option_group(authentication)
	parser.add_option_group(postproc)

	opts, args = parser.parse_args()

	return parser, opts, args
3256
3257 def main():
3258         parser, opts, args = parseOpts()
3259
3260         # Open appropriate CookieJar
3261         if opts.cookiefile is None:
3262                 jar = cookielib.CookieJar()
3263         else:
3264                 try:
3265                         jar = cookielib.MozillaCookieJar(opts.cookiefile)
3266                         if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3267                                 jar.load()
3268                 except (IOError, OSError), err:
3269                         sys.exit(u'ERROR: unable to open cookie file')
3270
3271         # Dump user agent
3272         if opts.dump_user_agent:
3273                 print std_headers['User-Agent']
3274                 sys.exit(0)
3275
3276         # General configuration
3277         cookie_processor = urllib2.HTTPCookieProcessor(jar)
3278         urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
3279         socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3280
3281         # Batch file verification
3282         batchurls = []
3283         if opts.batchfile is not None:
3284                 try:
3285                         if opts.batchfile == '-':
3286                                 batchfd = sys.stdin
3287                         else:
3288                                 batchfd = open(opts.batchfile, 'r')
3289                         batchurls = batchfd.readlines()
3290                         batchurls = [x.strip() for x in batchurls]
3291                         batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3292                 except IOError:
3293                         sys.exit(u'ERROR: batch file could not be read')
3294         all_urls = batchurls + args
3295
3296         # Conflicting, missing and erroneous options
3297         if opts.usenetrc and (opts.username is not None or opts.password is not None):
3298                 parser.error(u'using .netrc conflicts with giving username/password')
3299         if opts.password is not None and opts.username is None:
3300                 parser.error(u'account username missing')
3301         if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3302                 parser.error(u'using output template conflicts with using title, literal title or auto number')
3303         if opts.usetitle and opts.useliteral:
3304                 parser.error(u'using title conflicts with using literal title')
3305         if opts.username is not None and opts.password is None:
3306                 opts.password = getpass.getpass(u'Type account password and press return:')
3307         if opts.ratelimit is not None:
3308                 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3309                 if numeric_limit is None:
3310                         parser.error(u'invalid rate limit specified')
3311                 opts.ratelimit = numeric_limit
3312         if opts.retries is not None:
3313                 try:
3314                         opts.retries = long(opts.retries)
3315                 except (TypeError, ValueError), err:
3316                         parser.error(u'invalid retry count specified')
3317         try:
3318                 opts.playliststart = int(opts.playliststart)
3319                 if opts.playliststart <= 0:
3320                         raise ValueError(u'Playlist start must be positive')
3321         except (TypeError, ValueError), err:
3322                 parser.error(u'invalid playlist start number specified')
3323         try:
3324                 opts.playlistend = int(opts.playlistend)
3325                 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3326                         raise ValueError(u'Playlist end must be greater than playlist start')
3327         except (TypeError, ValueError), err:
3328                 parser.error(u'invalid playlist end number specified')
3329         if opts.extractaudio:
3330                 if opts.audioformat not in ['best', 'aac', 'mp3']:
3331                         parser.error(u'invalid audio format specified')
3332
3333         # Information extractors
3334         youtube_ie = YoutubeIE()
3335         metacafe_ie = MetacafeIE(youtube_ie)
3336         dailymotion_ie = DailymotionIE()
3337         youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
3338         youtube_user_ie = YoutubeUserIE(youtube_ie)
3339         youtube_search_ie = YoutubeSearchIE(youtube_ie)
3340         google_ie = GoogleIE()
3341         google_search_ie = GoogleSearchIE(google_ie)
3342         photobucket_ie = PhotobucketIE()
3343         yahoo_ie = YahooIE()
3344         yahoo_search_ie = YahooSearchIE(yahoo_ie)
3345         deposit_files_ie = DepositFilesIE()
3346         facebook_ie = FacebookIE()
3347         bliptv_ie = BlipTVIE()
3348         vimeo_ie = VimeoIE()
3349         generic_ie = GenericIE()
3350
3351         # File downloader
3352         fd = FileDownloader({
3353                 'usenetrc': opts.usenetrc,
3354                 'username': opts.username,
3355                 'password': opts.password,
3356                 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3357                 'forceurl': opts.geturl,
3358                 'forcetitle': opts.gettitle,
3359                 'forcethumbnail': opts.getthumbnail,
3360                 'forcedescription': opts.getdescription,
3361                 'forcefilename': opts.getfilename,
3362                 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3363                 'format': opts.format,
3364                 'format_limit': opts.format_limit,
3365                 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3366                         or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3367                         or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3368                         or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3369                         or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3370                         or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3371                         or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3372                         or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3373                         or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3374                         or u'%(id)s.%(ext)s'),
3375                 'ignoreerrors': opts.ignoreerrors,
3376                 'ratelimit': opts.ratelimit,
3377                 'nooverwrites': opts.nooverwrites,
3378                 'retries': opts.retries,
3379                 'continuedl': opts.continue_dl,
3380                 'noprogress': opts.noprogress,
3381                 'playliststart': opts.playliststart,
3382                 'playlistend': opts.playlistend,
3383                 'logtostderr': opts.outtmpl == '-',
3384                 'consoletitle': opts.consoletitle,
3385                 'nopart': opts.nopart,
3386                 'updatetime': opts.updatetime,
3387                 'writedescription': opts.writedescription,
3388                 'writeinfojson': opts.writeinfojson,
3389                 })
3390         fd.add_info_extractor(youtube_search_ie)
3391         fd.add_info_extractor(youtube_pl_ie)
3392         fd.add_info_extractor(youtube_user_ie)
3393         fd.add_info_extractor(metacafe_ie)
3394         fd.add_info_extractor(dailymotion_ie)
3395         fd.add_info_extractor(youtube_ie)
3396         fd.add_info_extractor(google_ie)
3397         fd.add_info_extractor(google_search_ie)
3398         fd.add_info_extractor(photobucket_ie)
3399         fd.add_info_extractor(yahoo_ie)
3400         fd.add_info_extractor(yahoo_search_ie)
3401         fd.add_info_extractor(deposit_files_ie)
3402         fd.add_info_extractor(facebook_ie)
3403         fd.add_info_extractor(bliptv_ie)
3404         fd.add_info_extractor(vimeo_ie)
3405
3406         # This must come last since it's the
3407         # fallback if none of the others work
3408         fd.add_info_extractor(generic_ie)
3409
3410         # PostProcessors
3411         if opts.extractaudio:
3412                 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
3413
3414         # Update version
3415         if opts.update_self:
3416                 updateSelf(fd, sys.argv[0])
3417
3418         # Maybe do nothing
3419         if len(all_urls) < 1:
3420                 if not opts.update_self:
3421                         parser.error(u'you must provide at least one URL')
3422                 else:
3423                         sys.exit()
3424         retcode = fd.download(all_urls)
3425
3426         # Dump cookie jar if requested
3427         if opts.cookiefile is not None:
3428                 try:
3429                         jar.save()
3430                 except (IOError, OSError), err:
3431                         sys.exit(u'ERROR: unable to save cookie jar')
3432
3433         sys.exit(retcode)
3434
3435
# Script entry point: run main() and map the module's exceptions to
# process exit codes / messages.
if __name__ == '__main__':
	try:
		main()
	except DownloadError:
		# The downloader already printed the error; just signal failure.
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')
3445
3446 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: