1b381d7b75307008b3686e036417edf155d896e7
[youtube-dl.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 __authors__  = (
5         'Ricardo Garcia Gonzalez',
6         'Danny Colligan',
7         'Benjamin Johnson',
8         'Vasyl\' Vavrychuk',
9         'Witold Baryluk',
10         'Paweł Paprota',
11         'Gergely Imreh',
12         'Rogério Brito',
13         'Philipp Hagemeister',
14         'Sören Schulze',
15         'Kevin Ngo',
16         'Ori Avtalion',
17         'shizeeg',
18         )
19
20 __license__ = 'Public Domain'
21 __version__ = '2012.02.27'
22
23 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
24
25
26 import cookielib
27 import datetime
28 import getpass
29 import gzip
30 import htmlentitydefs
31 import HTMLParser
32 import httplib
33 import locale
34 import math
35 import netrc
36 import optparse
37 import os
38 import os.path
39 import re
40 import shlex
41 import socket
42 import string
43 import subprocess
44 import sys
45 import time
46 import urllib
47 import urllib2
48 import warnings
49 import zlib
50
51 if os.name == 'nt':
52         import ctypes
53
54 try:
55         import email.utils
56 except ImportError: # Python 2.4
57         import email.Utils
58 try:
59         import cStringIO as StringIO
60 except ImportError:
61         import StringIO
62
63 # parse_qs was moved from the cgi module to the urlparse module recently.
64 try:
65         from urlparse import parse_qs
66 except ImportError:
67         from cgi import parse_qs
68
69 try:
70         import lxml.etree
71 except ImportError:
72         pass # Handled below
73
74 try:
75         import xml.etree.ElementTree
76 except ImportError: # Python<2.5: Not officially supported, but let it slip
77         warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
78
79 std_headers = {
80         'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
81         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
82         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
83         'Accept-Encoding': 'gzip, deflate',
84         'Accept-Language': 'en-us,en;q=0.5',
85 }
86
# Prefer the standard json module; on interpreters that lack it, define a
# minimal stand-in class named `json` that offers only loads().
try:
        import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
        import re
        class json(object):
                @staticmethod
                def loads(s):
                        """Parse the UTF-8 encoded JSON document s and return the decoded value.

                        Raises ValueError on malformed input or on trailing data.
                        Every nested parse* helper takes the index i at which its
                        value starts and returns a tuple (next_index, value).
                        """
                        s = s.decode('UTF-8')
                        def raiseError(msg, i):
                                # Always raises; the message includes the unparsed remainder
                                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
                        def skipSpace(i, expectMore=True):
                                # Advance past whitespace; with expectMore, reaching
                                # the end of the input is an error
                                while i < len(s) and s[i] in ' \t\r\n':
                                        i += 1
                                if expectMore:
                                        if i >= len(s):
                                                raiseError('Premature end', i)
                                return i
                        def decodeEscape(match):
                                # re.sub callback: match.group(1) is the text following
                                # a backslash inside a JSON string
                                esc = match.group(1)
                                _STATIC = {
                                        '"': '"',
                                        '\\': '\\',
                                        '/': '/',
                                        'b': unichr(0x8),
                                        'f': unichr(0xc),
                                        'n': '\n',
                                        'r': '\r',
                                        't': '\t',
                                }
                                if esc in _STATIC:
                                        return _STATIC[esc]
                                if esc[0] == 'u':
                                        # Plain \uXXXX escape
                                        if len(esc) == 1+4:
                                                return unichr(int(esc[1:5], 16))
                                        # Surrogate pair \uD8xx\uDCxx: combine both halves
                                        # into a single code point
                                        if len(esc) == 5+6 and esc[5:7] == '\\u':
                                                hi = int(esc[1:5], 16)
                                                low = int(esc[7:11], 16)
                                                return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                                raise ValueError('Unknown escape ' + str(esc))
                        def parseString(i):
                                # s[i] is the opening quote
                                i += 1
                                e = i
                                # Find the closing quote: a quote preceded by an odd
                                # number of backslashes is escaped and is skipped
                                while True:
                                        e = s.index('"', e)
                                        bslashes = 0
                                        while s[e-bslashes-1] == '\\':
                                                bslashes += 1
                                        if bslashes % 2 == 1:
                                                e += 1
                                                continue
                                        break
                                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                                stri = rexp.sub(decodeEscape, s[i:e])
                                return (e+1,stri)
                        def parseObj(i):
                                # s[i] is the opening curly brace
                                i += 1
                                res = {}
                                i = skipSpace(i)
                                if s[i] == '}': # Empty dictionary
                                        return (i+1,res)
                                while True:
                                        if s[i] != '"':
                                                raiseError('Expected a string object key', i)
                                        i,key = parseString(i)
                                        i = skipSpace(i)
                                        if i >= len(s) or s[i] != ':':
                                                raiseError('Expected a colon', i)
                                        i,val = parse(i+1)
                                        res[key] = val
                                        i = skipSpace(i)
                                        if s[i] == '}':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected comma or closing curly brace', i)
                                        i = skipSpace(i+1)
                        def parseArray(i):
                                # s[i] is the opening bracket
                                res = []
                                i = skipSpace(i+1)
                                if s[i] == ']': # Empty array
                                        return (i+1,res)
                                while True:
                                        i,val = parse(i)
                                        res.append(val)
                                        i = skipSpace(i) # Raise exception if premature end
                                        if s[i] == ']':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected a comma or closing bracket', i)
                                        i = skipSpace(i+1)
                        def parseDiscrete(i):
                                # The literals true, false and null
                                for k,v in {'true': True, 'false': False, 'null': None}.items():
                                        if s.startswith(k, i):
                                                return (i+len(k), v)
                                raiseError('Not a boolean (or null)', i)
                        def parseNumber(i):
                                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                                if mobj is None:
                                        raiseError('Not a number', i)
                                nums = mobj.group(1)
                                # A fraction or an exponent yields a float, otherwise an int
                                if '.' in nums or 'e' in nums or 'E' in nums:
                                        return (i+len(nums), float(nums))
                                return (i+len(nums), int(nums))
                        # Dispatch on the first character of a value; anything not
                        # listed here is attempted as a number
                        CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
                        def parse(i):
                                # Parse one value of any type, skipping surrounding whitespace
                                i = skipSpace(i)
                                i,res = CHARMAP.get(s[i], parseNumber)(i)
                                i = skipSpace(i, False)
                                return (i,res)
                        i,res = parse(0)
                        if i < len(s):
                                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
                        return res
199
def preferredencoding():
        """Get preferred encoding.

        Returns the encoding reported by locale.getpreferredencoding(), or
        'UTF-8' when the locale lookup fails or names an encoding that
        cannot actually be used to encode text.
        """
        try:
                pref = locale.getpreferredencoding()
                # Probe the codec: the locale may report a name Python does not know
                u'TEST'.encode(pref)
        except Exception:
                # Deliberately broad (lookup, type and locale errors all mean
                # "unusable"), but no longer swallows KeyboardInterrupt/SystemExit
                pref = 'UTF-8'
        return pref
215
216
217 def htmlentity_transform(matchobj):
218         """Transforms an HTML entity to a Unicode character.
219
220         This function receives a match object and is intended to be used with
221         the re.sub() function.
222         """
223         entity = matchobj.group(1)
224
225         # Known non-numeric HTML entity
226         if entity in htmlentitydefs.name2codepoint:
227                 return unichr(htmlentitydefs.name2codepoint[entity])
228
229         # Unicode character
230         mobj = re.match(ur'(?u)#(x?\d+)', entity)
231         if mobj is not None:
232                 numstr = mobj.group(1)
233                 if numstr.startswith(u'x'):
234                         base = 16
235                         numstr = u'0%s' % numstr
236                 else:
237                         base = 10
238                 return unichr(long(numstr, base))
239
240         # Unknown entity in name, return its literal representation
241         return (u'&%s;' % entity)
242
243
244 def sanitize_title(utitle):
245         """Sanitizes a video title so it could be used as part of a filename."""
246         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
247         return utitle.replace(unicode(os.sep), u'%')
248
249
def sanitize_open(filename, open_mode):
        """Open filename, retrying once with a tweaked name if that fails.

        The name u'-' selects standard output (switched to binary mode on
        win32). When opening the requested name raises IOError or OSError,
        characters that are forbidden on win32 are replaced with '#' and the
        open is attempted a second time; a failure of that second attempt
        propagates to the caller, like the standard open() function.

        It returns the tuple (stream, definitive_file_name).
        """
        try:
                if filename != u'-':
                        return (open(_encodeFilename(filename), open_mode), filename)
                # Standard output was requested
                if sys.platform == 'win32':
                        import msvcrt
                        msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
                return (sys.stdout, filename)
        except (IOError, OSError):
                # In case of error, try to remove win32 forbidden chars
                filename = re.sub(u'[/<>:"\\|\\?\\*]', u'#', filename)

                # An exception here should be caught in the caller
                return (open(_encodeFilename(filename), open_mode), filename)
275
276
def timeconvert(timestr):
        """Convert an RFC 2822 time string into a system timestamp.

        Returns None when the string cannot be parsed.
        """
        parsed = email.utils.parsedate_tz(timestr)
        if parsed is None:
                return None
        return email.utils.mktime_tz(parsed)
284
285 def _simplify_title(title):
286         expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
287         return expr.sub(u'_', title).strip(u'_')
288
289 def _orderedSet(iterable):
290         """ Remove all duplicates from the input iterable """
291         res = []
292         for el in iterable:
293                 if el not in res:
294                         res.append(el)
295         return res
296
297 def _unescapeHTML(s):
298         """
299         @param s a string (of type unicode)
300         """
301         assert type(s) == type(u'')
302
303         htmlParser = HTMLParser.HTMLParser()
304         return htmlParser.unescape(s)
305
306 def _encodeFilename(s):
307         """
308         @param s The name of the file (of type unicode)
309         """
310
311         assert type(s) == type(u'')
312
313         if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
314                 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
315                 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
316                 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
317                 return s
318         else:
319                 return s.encode(sys.getfilesystemencoding(), 'ignore')
320
class DownloadError(Exception):
        """Raised when a download fails.

        FileDownloader.trouble() raises it, with the error message as its
        argument, unless the downloader was configured to ignore errors.
        """
329
330
class SameFileError(Exception):
        """Raised when several downloads would end up in one file.

        FileDownloader objects throw it when they detect that multiple files
        would have to be downloaded to the same file on disk.
        """
338
339
class PostProcessingError(Exception):
        """Raised when a postprocessing task fails.

        A PostProcessor's .run() method may raise it to indicate an error in
        the postprocessing task.
        """
347
class MaxDownloadsReached(Exception):
        """ Signals that the --max-downloads limit has been reached. """
351
352
class UnavailableVideoError(Exception):
        """Raised when a requested format is unavailable.

        Thrown when a video is requested in a format that is not available
        for that video.
        """
360
361
class ContentTooShortError(Exception):
        """Content Too Short exception.

        This exception may be raised by FileDownloader objects when a file they
        download is too small for what the server announced first, indicating
        the connection was probably interrupted.

        Attributes:
        downloaded: number of bytes actually received.
        expected:   number of bytes the server announced.
        """
        # Both in bytes
        downloaded = None
        expected = None

        def __init__(self, downloaded, expected):
                # Hand both values to Exception so that args, str() and repr()
                # are meaningful and the instance can be pickled or copied
                # (unpickling re-invokes the constructor with args)
                Exception.__init__(self, downloaded, expected)
                self.downloaded = downloaded
                self.expected = expected
376
377
378 class YoutubeDLHandler(urllib2.HTTPHandler):
379         """Handler for HTTP requests and responses.
380
381         This class, when installed with an OpenerDirector, automatically adds
382         the standard headers to every HTTP request and handles gzipped and
383         deflated responses from web servers. If compression is to be avoided in
384         a particular request, the original request in the program code only has
385         to include the HTTP header "Youtubedl-No-Compression", which will be
386         removed before making the real request.
387
388         Part of this code was copied from:
389
390         http://techknack.net/python-urllib2-handlers/
391
392         Andrew Rowls, the author of that code, agreed to release it to the
393         public domain.
394         """
395
396         @staticmethod
397         def deflate(data):
398                 try:
399                         return zlib.decompress(data, -zlib.MAX_WBITS)
400                 except zlib.error:
401                         return zlib.decompress(data)
402
403         @staticmethod
404         def addinfourl_wrapper(stream, headers, url, code):
405                 if hasattr(urllib2.addinfourl, 'getcode'):
406                         return urllib2.addinfourl(stream, headers, url, code)
407                 ret = urllib2.addinfourl(stream, headers, url)
408                 ret.code = code
409                 return ret
410
411         def http_request(self, req):
412                 for h in std_headers:
413                         if h in req.headers:
414                                 del req.headers[h]
415                         req.add_header(h, std_headers[h])
416                 if 'Youtubedl-no-compression' in req.headers:
417                         if 'Accept-encoding' in req.headers:
418                                 del req.headers['Accept-encoding']
419                         del req.headers['Youtubedl-no-compression']
420                 return req
421
422         def http_response(self, req, resp):
423                 old_resp = resp
424                 # gzip
425                 if resp.headers.get('Content-encoding', '') == 'gzip':
426                         gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
427                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
428                         resp.msg = old_resp.msg
429                 # deflate
430                 if resp.headers.get('Content-encoding', '') == 'deflate':
431                         gz = StringIO.StringIO(self.deflate(resp.read()))
432                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
433                         resp.msg = old_resp.msg
434                 return resp
435
436
437 class FileDownloader(object):
438         """File Downloader class.
439
440         File downloader objects are the ones responsible of downloading the
441         actual video file and writing it to disk if the user has requested
442         it, among some other tasks. In most cases there should be one per
443         program. As, given a video URL, the downloader doesn't know how to
444         extract all the needed information, task that InfoExtractors do, it
445         has to pass the URL to one of them.
446
447         For this, file downloader objects have a method that allows
448         InfoExtractors to be registered in a given order. When it is passed
449         a URL, the file downloader handles it to the first InfoExtractor it
450         finds that reports being able to handle it. The InfoExtractor extracts
451         all the information about the video or videos the URL refers to, and
452         asks the FileDownloader to process the video information, possibly
453         downloading the video.
454
455         File downloaders accept a lot of parameters. In order not to saturate
456         the object constructor with arguments, it receives a dictionary of
457         options instead. These options are available through the params
458         attribute for the InfoExtractors to use. The FileDownloader also
459         registers itself as the downloader in charge for the InfoExtractors
460         that are added to it, so this is a "mutual registration".
461
462         Available options:
463
464         username:         Username for authentication purposes.
465         password:         Password for authentication purposes.
466         usenetrc:         Use netrc for authentication instead.
467         quiet:            Do not print messages to stdout.
468         forceurl:         Force printing final URL.
469         forcetitle:       Force printing title.
470         forcethumbnail:   Force printing thumbnail URL.
471         forcedescription: Force printing description.
472         forcefilename:    Force printing final filename.
473         simulate:         Do not download the video files.
474         format:           Video format code.
475         format_limit:     Highest quality format to try.
476         outtmpl:          Template for output names.
477         ignoreerrors:     Do not stop on download errors.
478         ratelimit:        Download speed limit, in bytes/sec.
479         nooverwrites:     Prevent overwriting files.
480         retries:          Number of times to retry for HTTP error 5xx
481         continuedl:       Try to continue downloads if possible.
482         noprogress:       Do not print the progress bar.
483         playliststart:    Playlist item to start at.
484         playlistend:      Playlist item to end at.
485         matchtitle:       Download only matching titles.
486         rejecttitle:      Reject downloads for matching titles.
487         logtostderr:      Log messages to stderr instead of stdout.
488         consoletitle:     Display progress in console window's titlebar.
489         nopart:           Do not use temporary .part files.
490         updatetime:       Use the Last-modified header to set output file timestamps.
491         writedescription: Write the video description to a .description file
492         writeinfojson:    Write the video description to a .info.json file
493         """
494
495         params = None
496         _ies = []
497         _pps = []
498         _download_retcode = None
499         _num_downloads = None
500         _screen_file = None
501
502         def __init__(self, params):
503                 """Create a FileDownloader object with the given options."""
504                 self._ies = []
505                 self._pps = []
506                 self._download_retcode = 0
507                 self._num_downloads = 0
508                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
509                 self.params = params
510
511         @staticmethod
512         def format_bytes(bytes):
513                 if bytes is None:
514                         return 'N/A'
515                 if type(bytes) is str:
516                         bytes = float(bytes)
517                 if bytes == 0.0:
518                         exponent = 0
519                 else:
520                         exponent = long(math.log(bytes, 1024.0))
521                 suffix = 'bkMGTPEZY'[exponent]
522                 converted = float(bytes) / float(1024 ** exponent)
523                 return '%.2f%s' % (converted, suffix)
524
525         @staticmethod
526         def calc_percent(byte_counter, data_len):
527                 if data_len is None:
528                         return '---.-%'
529                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
530
531         @staticmethod
532         def calc_eta(start, now, total, current):
533                 if total is None:
534                         return '--:--'
535                 dif = now - start
536                 if current == 0 or dif < 0.001: # One millisecond
537                         return '--:--'
538                 rate = float(current) / dif
539                 eta = long((float(total) - float(current)) / rate)
540                 (eta_mins, eta_secs) = divmod(eta, 60)
541                 if eta_mins > 99:
542                         return '--:--'
543                 return '%02d:%02d' % (eta_mins, eta_secs)
544
545         @staticmethod
546         def calc_speed(start, now, bytes):
547                 dif = now - start
548                 if bytes == 0 or dif < 0.001: # One millisecond
549                         return '%10s' % '---b/s'
550                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
551
552         @staticmethod
553         def best_block_size(elapsed_time, bytes):
554                 new_min = max(bytes / 2.0, 1.0)
555                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
556                 if elapsed_time < 0.001:
557                         return long(new_max)
558                 rate = bytes / elapsed_time
559                 if rate > new_max:
560                         return long(new_max)
561                 if rate < new_min:
562                         return long(new_min)
563                 return long(rate)
564
565         @staticmethod
566         def parse_bytes(bytestr):
567                 """Parse a string indicating a byte quantity into a long integer."""
568                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
569                 if matchobj is None:
570                         return None
571                 number = float(matchobj.group(1))
572                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
573                 return long(round(number * multiplier))
574
575         def add_info_extractor(self, ie):
576                 """Add an InfoExtractor object to the end of the list."""
577                 self._ies.append(ie)
578                 ie.set_downloader(self)
579
580         def add_post_processor(self, pp):
581                 """Add a PostProcessor object to the end of the chain."""
582                 self._pps.append(pp)
583                 pp.set_downloader(self)
584
585         def to_screen(self, message, skip_eol=False):
586                 """Print message to stdout if not in quiet mode."""
587                 assert type(message) == type(u'')
588                 if not self.params.get('quiet', False):
589                         terminator = [u'\n', u''][skip_eol]
590                         output = message + terminator
591
592                         if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
593                                 output = output.encode(preferredencoding(), 'ignore')
594                         self._screen_file.write(output)
595                         self._screen_file.flush()
596
597         def to_stderr(self, message):
598                 """Print message to stderr."""
599                 print >>sys.stderr, message.encode(preferredencoding())
600
601         def to_cons_title(self, message):
602                 """Set console/terminal window title to message."""
603                 if not self.params.get('consoletitle', False):
604                         return
605                 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
606                         # c_wchar_p() might not be necessary if `message` is
607                         # already of type unicode()
608                         ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
609                 elif 'TERM' in os.environ:
610                         sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
611
612         def fixed_template(self):
613                 """Checks if the output template is fixed."""
614                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
615
616         def trouble(self, message=None):
617                 """Determine action to take when a download problem appears.
618
619                 Depending on if the downloader has been configured to ignore
620                 download errors or not, this method may throw an exception or
621                 not when errors are found, after printing the message.
622                 """
623                 if message is not None:
624                         self.to_stderr(message)
625                 if not self.params.get('ignoreerrors', False):
626                         raise DownloadError(message)
627                 self._download_retcode = 1
628
629         def slow_down(self, start_time, byte_counter):
630                 """Sleep if the download speed is over the rate limit."""
631                 rate_limit = self.params.get('ratelimit', None)
632                 if rate_limit is None or byte_counter == 0:
633                         return
634                 now = time.time()
635                 elapsed = now - start_time
636                 if elapsed <= 0.0:
637                         return
638                 speed = float(byte_counter) / elapsed
639                 if speed > rate_limit:
640                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
641
642         def temp_name(self, filename):
643                 """Returns a temporary filename for the given filename."""
644                 if self.params.get('nopart', False) or filename == u'-' or \
645                                 (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
646                         return filename
647                 return filename + u'.part'
648
649         def undo_temp_name(self, filename):
650                 if filename.endswith(u'.part'):
651                         return filename[:-len(u'.part')]
652                 return filename
653
654         def try_rename(self, old_filename, new_filename):
655                 try:
656                         if old_filename == new_filename:
657                                 return
658                         os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
659                 except (IOError, OSError), err:
660                         self.trouble(u'ERROR: unable to rename file')
661
662         def try_utime(self, filename, last_modified_hdr):
663                 """Try to set the last-modified time of the given file."""
664                 if last_modified_hdr is None:
665                         return
666                 if not os.path.isfile(_encodeFilename(filename)):
667                         return
668                 timestr = last_modified_hdr
669                 if timestr is None:
670                         return
671                 filetime = timeconvert(timestr)
672                 if filetime is None:
673                         return filetime
674                 try:
675                         os.utime(filename, (time.time(), filetime))
676                 except:
677                         pass
678                 return filetime
679
680         def report_writedescription(self, descfn):
681                 """ Report that the description file is being written """
682                 self.to_screen(u'[info] Writing video description to: ' + descfn)
683
684         def report_writeinfojson(self, infofn):
685                 """ Report that the metadata file has been written """
686                 self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
687
688         def report_destination(self, filename):
689                 """Report destination filename."""
690                 self.to_screen(u'[download] Destination: ' + filename)
691
692         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
693                 """Report download progress."""
694                 if self.params.get('noprogress', False):
695                         return
696                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
697                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
698                 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
699                                 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
700
701         def report_resuming_byte(self, resume_len):
702                 """Report attempt to resume at given byte."""
703                 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
704
705         def report_retry(self, count, retries):
706                 """Report retry in case of HTTP error 5xx"""
707                 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
708
709         def report_file_already_downloaded(self, file_name):
710                 """Report file has already been fully downloaded."""
711                 try:
712                         self.to_screen(u'[download] %s has already been downloaded' % file_name)
713                 except (UnicodeEncodeError), err:
714                         self.to_screen(u'[download] The file has already been downloaded')
715
716         def report_unable_to_resume(self):
717                 """Report it was impossible to resume download."""
718                 self.to_screen(u'[download] Unable to resume')
719
720         def report_finish(self):
721                 """Report download finished."""
722                 if self.params.get('noprogress', False):
723                         self.to_screen(u'[download] Download completed')
724                 else:
725                         self.to_screen(u'')
726
727         def increment_downloads(self):
728                 """Increment the ordinal that assigns a number to each file."""
729                 self._num_downloads += 1
730
731         def prepare_filename(self, info_dict):
732                 """Generate the output filename."""
733                 try:
734                         template_dict = dict(info_dict)
735                         template_dict['epoch'] = unicode(long(time.time()))
736                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
737                         filename = self.params['outtmpl'] % template_dict
738                         return filename
739                 except (ValueError, KeyError), err:
740                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
741                         return None
742
743         def _match_entry(self, info_dict):
744                 """ Returns None iff the file should be downloaded """
745
746                 title = info_dict['title']
747                 matchtitle = self.params.get('matchtitle', False)
748                 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
749                         return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
750                 rejecttitle = self.params.get('rejecttitle', False)
751                 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
752                         return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
753                 return None
754
        def process_info(self, info_dict):
                """Process a single dictionary returned by an InfoExtractor.

                Steps, in order: apply the title filters, enforce the
                'max_downloads' limit, build the output filename, perform the
                "forced" printings and, unless simulating, create the target
                directory, write the optional description and JSON metadata
                files, download the video and run the postprocessing chain.

                Most errors are reported through self.trouble() and end the
                processing of this entry. Raises MaxDownloadsReached when the
                download counter exceeds the limit and UnavailableVideoError
                when the download fails with an OSError or IOError.
                """

                # Skip entries rejected by the title filters
                reason = self._match_entry(info_dict)
                if reason is not None:
                        self.to_screen(u'[download] ' + reason)
                        return

                max_downloads = self.params.get('max_downloads')
                if max_downloads is not None:
                        if self._num_downloads > int(max_downloads):
                                raise MaxDownloadsReached()

                # May be None when the output template cannot be applied; the
                # forced printings below still run in that case
                filename = self.prepare_filename(info_dict)

                # Forced printings
                if self.params.get('forcetitle', False):
                        print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
                if self.params.get('forceurl', False):
                        print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
                if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
                        print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
                if self.params.get('forcedescription', False) and 'description' in info_dict:
                        print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
                if self.params.get('forcefilename', False) and filename is not None:
                        print filename.encode(preferredencoding(), 'xmlcharrefreplace')
                if self.params.get('forceformat', False):
                        print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')

                # Do nothing else if in simulate mode
                if self.params.get('simulate', False):
                        return

                if filename is None:
                        return

                # Make sure the directory of the output file exists
                try:
                        dn = os.path.dirname(_encodeFilename(filename))
                        if dn != '' and not os.path.exists(dn): # dn is already encoded
                                os.makedirs(dn)
                except (OSError, IOError), err:
                        self.trouble(u'ERROR: unable to create directory ' + unicode(err))
                        return

                # Optional "<filename>.description" file, written as UTF-8
                if self.params.get('writedescription', False):
                        try:
                                descfn = filename + u'.description'
                                self.report_writedescription(descfn)
                                descfile = open(_encodeFilename(descfn), 'wb')
                                try:
                                        descfile.write(info_dict['description'].encode('utf-8'))
                                finally:
                                        descfile.close()
                        except (OSError, IOError):
                                self.trouble(u'ERROR: Cannot write description file ' + descfn)
                                return

                # Optional "<filename>.info.json" metadata file
                if self.params.get('writeinfojson', False):
                        infofn = filename + u'.info.json'
                        self.report_writeinfojson(infofn)
                        # Probe for a usable json module: the bare attribute access
                        # raises NameError or AttributeError when there is none
                        try:
                                json.dump
                        except (NameError,AttributeError):
                                self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
                                return
                        try:
                                infof = open(_encodeFilename(infofn), 'wb')
                                try:
                                        # 'urlhandle' is left out of the dump
                                        # (presumably because it is not serializable)
                                        json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
                                        json.dump(json_info_dict, infof)
                                finally:
                                        infof.close()
                        except (OSError, IOError):
                                self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
                                return

                if not self.params.get('skip_download', False):
                        if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
                                # An existing file counts as a successful download, so
                                # postprocessing still runs on it
                                success = True
                        else:
                                try:
                                        success = self._do_download(filename, info_dict)
                                except (OSError, IOError), err:
                                        raise UnavailableVideoError
                                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                                        self.trouble(u'ERROR: unable to download video data: %s' % str(err))
                                        return
                                except (ContentTooShortError, ), err:
                                        self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
                                        return

                        if success:
                                try:
                                        self.post_process(filename, info_dict)
                                except (PostProcessingError), err:
                                        self.trouble(u'ERROR: postprocessing: %s' % str(err))
                                        return
852
853         def download(self, url_list):
854                 """Download a given list of URLs."""
855                 if len(url_list) > 1 and self.fixed_template():
856                         raise SameFileError(self.params['outtmpl'])
857
858                 for url in url_list:
859                         suitable_found = False
860                         for ie in self._ies:
861                                 # Go to next InfoExtractor if not suitable
862                                 if not ie.suitable(url):
863                                         continue
864
865                                 # Suitable InfoExtractor found
866                                 suitable_found = True
867
868                                 # Extract information from URL and process it
869                                 ie.extract(url)
870
871                                 # Suitable InfoExtractor had been found; go to next URL
872                                 break
873
874                         if not suitable_found:
875                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
876
877                 return self._download_retcode
878
879         def post_process(self, filename, ie_info):
880                 """Run the postprocessing chain on the given file."""
881                 info = dict(ie_info)
882                 info['filepath'] = filename
883                 for pp in self._pps:
884                         info = pp.run(info)
885                         if info is None:
886                                 break
887
888         def _download_with_rtmpdump(self, filename, url, player_url):
889                 self.report_destination(filename)
890                 tmpfilename = self.temp_name(filename)
891
892                 # Check for rtmpdump first
893                 try:
894                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
895                 except (OSError, IOError):
896                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
897                         return False
898
899                 # Download using rtmpdump. rtmpdump returns exit code 2 when
900                 # the connection was interrumpted and resuming appears to be
901                 # possible. This is part of rtmpdump's normal usage, AFAIK.
902                 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
903                 args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
904                 if self.params.get('verbose', False):
905                         try:
906                                 import pipes
907                                 shell_quote = lambda args: ' '.join(map(pipes.quote, args))
908                         except ImportError:
909                                 shell_quote = repr
910                         self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
911                 retval = subprocess.call(args)
912                 while retval == 2 or retval == 1:
913                         prevsize = os.path.getsize(_encodeFilename(tmpfilename))
914                         self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
915                         time.sleep(5.0) # This seems to be needed
916                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
917                         cursize = os.path.getsize(_encodeFilename(tmpfilename))
918                         if prevsize == cursize and retval == 1:
919                                 break
920                          # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
921                         if prevsize == cursize and retval == 2 and cursize > 1024:
922                                 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
923                                 retval = 0
924                                 break
925                 if retval == 0:
926                         self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
927                         self.try_rename(tmpfilename, filename)
928                         return True
929                 else:
930                         self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
931                         return False
932
        def _do_download(self, filename, info_dict):
                """Download the video described by info_dict to filename.

                info_dict must contain 'url'; 'player_url' and 'urlhandle' are
                optional. URLs starting with 'rtmp' are delegated to
                _download_with_rtmpdump(). Other URLs are fetched with urllib2
                into a temporary file that is renamed on completion, resuming
                a partial file when the 'continuedl' parameter is set.

                Returns True when the file was downloaded (or was already
                complete) and False after reporting trouble (retries exhausted,
                file cannot be opened or written, no data received).

                Raises ContentTooShortError when the number of bytes received
                differs from the announced length. HTTP errors other than 5xx
                and 416 are re-raised.
                """
                url = info_dict['url']
                player_url = info_dict.get('player_url', None)

                # Check file already present
                if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
                        self.report_file_already_downloaded(filename)
                        return True

                # Attempt to download using rtmpdump
                if url.startswith('rtmp'):
                        return self._download_with_rtmpdump(filename, url, player_url)

                tmpfilename = self.temp_name(filename)
                stream = None

                # Do not include the Accept-Encoding header
                headers = {'Youtubedl-no-compression': 'True'}
                # basic_request never gets a Range header; it is used to start
                # over when the server rejects the resume attempt
                basic_request = urllib2.Request(url, None, headers)
                request = urllib2.Request(url, None, headers)

                # Establish possible resume length
                if os.path.isfile(_encodeFilename(tmpfilename)):
                        resume_len = os.path.getsize(_encodeFilename(tmpfilename))
                else:
                        resume_len = 0

                open_mode = 'wb'
                if resume_len != 0:
                        if self.params.get('continuedl', False):
                                self.report_resuming_byte(resume_len)
                                request.add_header('Range','bytes=%d-' % resume_len)
                                open_mode = 'ab'
                        else:
                                resume_len = 0

                count = 0
                retries = self.params.get('retries', 0)
                while count <= retries:
                        # Establish connection
                        try:
                                # NOTE(review): the handle taken from info_dict is
                                # overwritten by the urlopen() call right below, so
                                # 'urlhandle' is effectively never used - confirm
                                # whether that is intended
                                if count == 0 and 'urlhandle' in info_dict:
                                        data = info_dict['urlhandle']
                                data = urllib2.urlopen(request)
                                break
                        except (urllib2.HTTPError, ), err:
                                if (err.code < 500 or err.code >= 600) and err.code != 416:
                                        # Unexpected HTTP error
                                        raise
                                elif err.code == 416:
                                        # Unable to resume (requested range not satisfiable)
                                        try:
                                                # Open the connection again without the range header
                                                data = urllib2.urlopen(basic_request)
                                                content_length = data.info()['Content-Length']
                                        except (urllib2.HTTPError, ), err:
                                                # A 5xx error here falls through to the retry below
                                                if err.code < 500 or err.code >= 600:
                                                        raise
                                        else:
                                                # Examine the reported length
                                                if (content_length is not None and
                                                                (resume_len - 100 < long(content_length) < resume_len + 100)):
                                                        # The file had already been fully downloaded.
                                                        # Explanation to the above condition: in issue #175 it was revealed that
                                                        # YouTube sometimes adds or removes a few bytes from the end of the file,
                                                        # changing the file size slightly and causing problems for some users. So
                                                        # I decided to implement a suggested change and consider the file
                                                        # completely downloaded if the file size differs less than 100 bytes from
                                                        # the one in the hard drive.
                                                        self.report_file_already_downloaded(filename)
                                                        self.try_rename(tmpfilename, filename)
                                                        return True
                                                else:
                                                        # The length does not match, we start the download over
                                                        # NOTE(review): resume_len keeps its old value here
                                                        # although the file is rewritten from scratch, so the
                                                        # progress figures below are offset by it - confirm
                                                        self.report_unable_to_resume()
                                                        open_mode = 'wb'
                                                        break
                        # Retry
                        count += 1
                        if count <= retries:
                                self.report_retry(count, retries)

                if count > retries:
                        self.trouble(u'ERROR: giving up after %s retries' % retries)
                        return False

                # Total expected size: what the server still sends plus what is
                # already on disk
                data_len = data.info().get('Content-length', None)
                if data_len is not None:
                        data_len = long(data_len) + resume_len
                data_len_str = self.format_bytes(data_len)
                byte_counter = 0 + resume_len
                block_size = 1024
                start = time.time()
                while True:
                        # Download and write
                        before = time.time()
                        data_block = data.read(block_size)
                        after = time.time()
                        # An empty read marks the end of the data
                        if len(data_block) == 0:
                                break
                        byte_counter += len(data_block)

                        # Open file just in time
                        if stream is None:
                                try:
                                        # sanitize_open() may return a different name than requested
                                        (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
                                        assert stream is not None
                                        filename = self.undo_temp_name(tmpfilename)
                                        self.report_destination(filename)
                                except (OSError, IOError), err:
                                        self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
                                        return False
                        try:
                                stream.write(data_block)
                        except (IOError, OSError), err:
                                self.trouble(u'\nERROR: unable to write data: %s' % str(err))
                                return False
                        # Choose the next block size from how long this read took
                        block_size = self.best_block_size(after - before, len(data_block))

                        # Progress message
                        speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
                        if data_len is None:
                                self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
                        else:
                                percent_str = self.calc_percent(byte_counter, data_len)
                                eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
                                self.report_progress(percent_str, data_len_str, speed_str, eta_str)

                        # Apply rate limit
                        self.slow_down(start, byte_counter - resume_len)

                # stream is only opened once a block arrives, so None means the
                # server sent no data at all
                if stream is None:
                        self.trouble(u'\nERROR: Did not get any data blocks')
                        return False
                stream.close()
                self.report_finish()
                if data_len is not None and byte_counter != data_len:
                        raise ContentTooShortError(byte_counter, long(data_len))
                self.try_rename(tmpfilename, filename)

                # Update file modification time
                if self.params.get('updatetime', True):
                        info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))

                return True
1078
1079
class InfoExtractor(object):
        """Base class of all information extractors.

        Given a URL, an information extractor obtains information about the
        video (or videos) the URL refers to: the real video URL, the title
        and simplified title, the author and others. That information is
        stored in a dictionary which is passed to the FileDownloader, which
        then processes it, possibly downloading the video to the file
        system, among other possible outcomes. Every dictionary must
        include these fields:

        id:             Video identifier.
        url:            Final video URL.
        uploader:       Nickname of the video uploader.
        title:          Literal title.
        stitle:         Simplified title.
        ext:            Video filename extension.
        format:         Video format.
        player_url:     SWF Player URL (may be None).

        These fields are optional. They mainly exist so that youtube-dl can
        serve as the backend of a video search function, such as the one in
        youtube2mp3, and are only used when their respective forced
        printing functions are called:

        thumbnail:      Full URL to a video thumbnail image.
        description:    One-line video description.

        Subclasses should override _real_initialize() and _real_extract(),
        define a _VALID_URL regexp and, probably, be added to the list of
        extractors.
        """

        # True once _real_initialize() has run for this instance
        _ready = False
        # Downloader this extractor is attached to (may be None)
        _downloader = None

        def __init__(self, downloader=None):
                """Create the extractor, optionally attached to a downloader."""
                self._ready = False
                self.set_downloader(downloader)

        def suitable(self, url):
                """Return True when url matches this extractor's _VALID_URL."""
                matched = re.match(self._VALID_URL, url)
                return matched is not None

        def initialize(self):
                """Run the real initialization (authentication, etc) only once."""
                if self._ready:
                        return
                self._real_initialize()
                self._ready = True

        def extract(self, url):
                """Initialize if needed and return what _real_extract() yields for url."""
                self.initialize()
                result = self._real_extract(url)
                return result

        def set_downloader(self, downloader):
                """Attach this extractor to the given downloader."""
                self._downloader = downloader

        def _real_initialize(self):
                """Actual initialization; meant to be overridden by subclasses."""
                pass

        def _real_extract(self, url):
                """Actual extraction; meant to be overridden by subclasses."""
                pass
1148
1149
1150 class YoutubeIE(InfoExtractor):
1151         """Information extractor for youtube.com."""
1152
1153         _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1154         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1155         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1156         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1157         _NETRC_MACHINE = 'youtube'
1158         # Listed in order of quality
1159         _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1160         _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
1161         _video_extensions = {
1162                 '13': '3gp',
1163                 '17': 'mp4',
1164                 '18': 'mp4',
1165                 '22': 'mp4',
1166                 '37': 'mp4',
1167                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1168                 '43': 'webm',
1169                 '44': 'webm',
1170                 '45': 'webm',
1171         }
1172         _video_dimensions = {
1173                 '5': '240x400',
1174                 '6': '???',
1175                 '13': '???',
1176                 '17': '144x176',
1177                 '18': '360x640',
1178                 '22': '720x1280',
1179                 '34': '360x640',
1180                 '35': '480x854',
1181                 '37': '1080x1920',
1182                 '38': '3072x4096',
1183                 '43': '360x640',
1184                 '44': '480x854',
1185                 '45': '720x1280',
1186         }       
1187         IE_NAME = u'youtube'
1188
1189         def report_lang(self):
1190                 """Report attempt to set language."""
1191                 self._downloader.to_screen(u'[youtube] Setting language')
1192
1193         def report_login(self):
1194                 """Report attempt to log in."""
1195                 self._downloader.to_screen(u'[youtube] Logging in')
1196
1197         def report_age_confirmation(self):
1198                 """Report attempt to confirm age."""
1199                 self._downloader.to_screen(u'[youtube] Confirming age')
1200
1201         def report_video_webpage_download(self, video_id):
1202                 """Report attempt to download video webpage."""
1203                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1204
1205         def report_video_info_webpage_download(self, video_id):
1206                 """Report attempt to download video info webpage."""
1207                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1208
1209         def report_information_extraction(self, video_id):
1210                 """Report attempt to extract video information."""
1211                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1212
1213         def report_unavailable_format(self, video_id, format):
1214                 """Report extracted video URL."""
1215                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1216
1217         def report_rtmp_download(self):
1218                 """Indicate the download will use the RTMP protocol."""
1219                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1220
1221         def _print_formats(self, formats):
1222                 print 'Available formats:'
1223                 for x in formats:
1224                         print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1225
1226         def _real_initialize(self):
1227                 if self._downloader is None:
1228                         return
1229
1230                 username = None
1231                 password = None
1232                 downloader_params = self._downloader.params
1233
1234                 # Attempt to use provided username and password or .netrc data
1235                 if downloader_params.get('username', None) is not None:
1236                         username = downloader_params['username']
1237                         password = downloader_params['password']
1238                 elif downloader_params.get('usenetrc', False):
1239                         try:
1240                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1241                                 if info is not None:
1242                                         username = info[0]
1243                                         password = info[2]
1244                                 else:
1245                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1246                         except (IOError, netrc.NetrcParseError), err:
1247                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1248                                 return
1249
1250                 # Set language
1251                 request = urllib2.Request(self._LANG_URL)
1252                 try:
1253                         self.report_lang()
1254                         urllib2.urlopen(request).read()
1255                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1256                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1257                         return
1258
1259                 # No authentication to be performed
1260                 if username is None:
1261                         return
1262
1263                 # Log in
1264                 login_form = {
1265                                 'current_form': 'loginForm',
1266                                 'next':         '/',
1267                                 'action_login': 'Log In',
1268                                 'username':     username,
1269                                 'password':     password,
1270                                 }
1271                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1272                 try:
1273                         self.report_login()
1274                         login_results = urllib2.urlopen(request).read()
1275                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1276                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1277                                 return
1278                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1279                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1280                         return
1281
1282                 # Confirm age
1283                 age_form = {
1284                                 'next_url':             '/',
1285                                 'action_confirm':       'Confirm',
1286                                 }
1287                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1288                 try:
1289                         self.report_age_confirmation()
1290                         age_results = urllib2.urlopen(request).read()
1291                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1292                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1293                         return
1294
1295         def _real_extract(self, url):
1296                 # Extract video id from URL
1297                 mobj = re.match(self._VALID_URL, url)
1298                 if mobj is None:
1299                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1300                         return
1301                 video_id = mobj.group(2)
1302
1303                 # Get video webpage
1304                 self.report_video_webpage_download(video_id)
1305                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1306                 try:
1307                         video_webpage = urllib2.urlopen(request).read()
1308                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1309                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1310                         return
1311
1312                 # Attempt to extract SWF player URL
1313                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1314                 if mobj is not None:
1315                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1316                 else:
1317                         player_url = None
1318
1319                 # Get video info
1320                 self.report_video_info_webpage_download(video_id)
1321                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1322                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1323                                         % (video_id, el_type))
1324                         request = urllib2.Request(video_info_url)
1325                         try:
1326                                 video_info_webpage = urllib2.urlopen(request).read()
1327                                 video_info = parse_qs(video_info_webpage)
1328                                 if 'token' in video_info:
1329                                         break
1330                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1331                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1332                                 return
1333                 if 'token' not in video_info:
1334                         if 'reason' in video_info:
1335                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1336                         else:
1337                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1338                         return
1339
1340                 # Start extracting information
1341                 self.report_information_extraction(video_id)
1342
1343                 # uploader
1344                 if 'author' not in video_info:
1345                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1346                         return
1347                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1348
1349                 # title
1350                 if 'title' not in video_info:
1351                         self._downloader.trouble(u'ERROR: unable to extract video title')
1352                         return
1353                 video_title = urllib.unquote_plus(video_info['title'][0])
1354                 video_title = video_title.decode('utf-8')
1355                 video_title = sanitize_title(video_title)
1356
1357                 # simplified title
1358                 simple_title = _simplify_title(video_title)
1359
1360                 # thumbnail image
1361                 if 'thumbnail_url' not in video_info:
1362                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1363                         video_thumbnail = ''
1364                 else:   # don't panic if we can't find it
1365                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1366
1367                 # upload date
1368                 upload_date = u'NA'
1369                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1370                 if mobj is not None:
1371                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1372                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1373                         for expression in format_expressions:
1374                                 try:
1375                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1376                                 except:
1377                                         pass
1378
1379                 # description
1380                 try:
1381                         lxml.etree
1382                 except NameError:
1383                         video_description = u'No description available.'
1384                         mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
1385                         if mobj is not None:
1386                                 video_description = mobj.group(1).decode('utf-8')
1387                 else:
1388                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1389                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1390                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1391                         # TODO use another parser
1392
1393                 # token
1394                 video_token = urllib.unquote_plus(video_info['token'][0])
1395
1396                 # Decide which formats to download
1397                 req_format = self._downloader.params.get('format', None)
1398
1399                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1400                         self.report_rtmp_download()
1401                         video_url_list = [(None, video_info['conn'][0])]
1402                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1403                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1404                         url_data = [parse_qs(uds) for uds in url_data_strs]
1405                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1406                         url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1407
1408                         format_limit = self._downloader.params.get('format_limit', None)
1409                         available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1410                         if format_limit is not None and format_limit in available_formats:
1411                                 format_list = available_formats[available_formats.index(format_limit):]
1412                         else:
1413                                 format_list = available_formats
1414                         existing_formats = [x for x in format_list if x in url_map]
1415                         if len(existing_formats) == 0:
1416                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1417                                 return
1418                         if self._downloader.params.get('listformats', None):
1419                                 self._print_formats(existing_formats)
1420                                 return
1421                         if req_format is None or req_format == 'best':
1422                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1423                         elif req_format == 'worst':
1424                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1425                         elif req_format in ('-1', 'all'):
1426                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1427                         else:
1428                                 # Specific formats. We pick the first in a slash-delimeted sequence.
1429                                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1430                                 req_formats = req_format.split('/')
1431                                 video_url_list = None
1432                                 for rf in req_formats:
1433                                         if rf in url_map:
1434                                                 video_url_list = [(rf, url_map[rf])]
1435                                                 break
1436                                 if video_url_list is None:
1437                                         self._downloader.trouble(u'ERROR: requested format not available')
1438                                         return
1439                 else:
1440                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1441                         return
1442
1443                 for format_param, video_real_url in video_url_list:
1444                         # At this point we have a new video
1445                         self._downloader.increment_downloads()
1446
1447                         # Extension
1448                         video_extension = self._video_extensions.get(format_param, 'flv')
1449
1450                         try:
1451                                 # Process video information
1452                                 self._downloader.process_info({
1453                                         'id':           video_id.decode('utf-8'),
1454                                         'url':          video_real_url.decode('utf-8'),
1455                                         'uploader':     video_uploader.decode('utf-8'),
1456                                         'upload_date':  upload_date,
1457                                         'title':        video_title,
1458                                         'stitle':       simple_title,
1459                                         'ext':          video_extension.decode('utf-8'),
1460                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1461                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1462                                         'description':  video_description,
1463                                         'player_url':   player_url,
1464                                 })
1465                         except UnavailableVideoError, err:
1466                                 self._downloader.trouble(u'\nERROR: unable to download video')
1467
1468
1469 class MetacafeIE(InfoExtractor):
1470         """Information Extractor for metacafe.com."""
1471
1472         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1473         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1474         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1475         _youtube_ie = None
1476         IE_NAME = u'metacafe'
1477
1478         def __init__(self, youtube_ie, downloader=None):
1479                 InfoExtractor.__init__(self, downloader)
1480                 self._youtube_ie = youtube_ie
1481
1482         def report_disclaimer(self):
1483                 """Report disclaimer retrieval."""
1484                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1485
1486         def report_age_confirmation(self):
1487                 """Report attempt to confirm age."""
1488                 self._downloader.to_screen(u'[metacafe] Confirming age')
1489
1490         def report_download_webpage(self, video_id):
1491                 """Report webpage download."""
1492                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1493
1494         def report_extraction(self, video_id):
1495                 """Report information extraction."""
1496                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1497
1498         def _real_initialize(self):
1499                 # Retrieve disclaimer
1500                 request = urllib2.Request(self._DISCLAIMER)
1501                 try:
1502                         self.report_disclaimer()
1503                         disclaimer = urllib2.urlopen(request).read()
1504                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1505                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1506                         return
1507
1508                 # Confirm age
1509                 disclaimer_form = {
1510                         'filters': '0',
1511                         'submit': "Continue - I'm over 18",
1512                         }
1513                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1514                 try:
1515                         self.report_age_confirmation()
1516                         disclaimer = urllib2.urlopen(request).read()
1517                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1518                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1519                         return
1520
1521         def _real_extract(self, url):
1522                 # Extract id and simplified title from URL
1523                 mobj = re.match(self._VALID_URL, url)
1524                 if mobj is None:
1525                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1526                         return
1527
1528                 video_id = mobj.group(1)
1529
1530                 # Check if video comes from YouTube
1531                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1532                 if mobj2 is not None:
1533                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1534                         return
1535
1536                 # At this point we have a new video
1537                 self._downloader.increment_downloads()
1538
1539                 simple_title = mobj.group(2).decode('utf-8')
1540
1541                 # Retrieve video webpage to extract further information
1542                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1543                 try:
1544                         self.report_download_webpage(video_id)
1545                         webpage = urllib2.urlopen(request).read()
1546                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1547                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1548                         return
1549
1550                 # Extract URL, uploader and title from webpage
1551                 self.report_extraction(video_id)
1552                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1553                 if mobj is not None:
1554                         mediaURL = urllib.unquote(mobj.group(1))
1555                         video_extension = mediaURL[-3:]
1556
1557                         # Extract gdaKey if available
1558                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1559                         if mobj is None:
1560                                 video_url = mediaURL
1561                         else:
1562                                 gdaKey = mobj.group(1)
1563                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1564                 else:
1565                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1566                         if mobj is None:
1567                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1568                                 return
1569                         vardict = parse_qs(mobj.group(1))
1570                         if 'mediaData' not in vardict:
1571                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1572                                 return
1573                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1574                         if mobj is None:
1575                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1576                                 return
1577                         mediaURL = mobj.group(1).replace('\\/', '/')
1578                         video_extension = mediaURL[-3:]
1579                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1580
1581                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1582                 if mobj is None:
1583                         self._downloader.trouble(u'ERROR: unable to extract title')
1584                         return
1585                 video_title = mobj.group(1).decode('utf-8')
1586                 video_title = sanitize_title(video_title)
1587
1588                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1589                 if mobj is None:
1590                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1591                         return
1592                 video_uploader = mobj.group(1)
1593
1594                 try:
1595                         # Process video information
1596                         self._downloader.process_info({
1597                                 'id':           video_id.decode('utf-8'),
1598                                 'url':          video_url.decode('utf-8'),
1599                                 'uploader':     video_uploader.decode('utf-8'),
1600                                 'upload_date':  u'NA',
1601                                 'title':        video_title,
1602                                 'stitle':       simple_title,
1603                                 'ext':          video_extension.decode('utf-8'),
1604                                 'format':       u'NA',
1605                                 'player_url':   None,
1606                         })
1607                 except UnavailableVideoError:
1608                         self._downloader.trouble(u'\nERROR: unable to download video')
1609
1610
1611 class DailymotionIE(InfoExtractor):
1612         """Information Extractor for Dailymotion"""
1613
1614         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1615         IE_NAME = u'dailymotion'
1616
1617         def __init__(self, downloader=None):
1618                 InfoExtractor.__init__(self, downloader)
1619
1620         def report_download_webpage(self, video_id):
1621                 """Report webpage download."""
1622                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1623
1624         def report_extraction(self, video_id):
1625                 """Report information extraction."""
1626                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1627
1628         def _real_extract(self, url):
1629                 # Extract id and simplified title from URL
1630                 mobj = re.match(self._VALID_URL, url)
1631                 if mobj is None:
1632                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1633                         return
1634
1635                 # At this point we have a new video
1636                 self._downloader.increment_downloads()
1637                 video_id = mobj.group(1)
1638
1639                 video_extension = 'flv'
1640
1641                 # Retrieve video webpage to extract further information
1642                 request = urllib2.Request(url)
1643                 request.add_header('Cookie', 'family_filter=off')
1644                 try:
1645                         self.report_download_webpage(video_id)
1646                         webpage = urllib2.urlopen(request).read()
1647                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1648                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1649                         return
1650
1651                 # Extract URL, uploader and title from webpage
1652                 self.report_extraction(video_id)
1653                 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1654                 if mobj is None:
1655                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1656                         return
1657                 sequence = urllib.unquote(mobj.group(1))
1658                 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1659                 if mobj is None:
1660                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1661                         return
1662                 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1663
1664                 # if needed add http://www.dailymotion.com/ if relative URL
1665
1666                 video_url = mediaURL
1667
1668                 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
1669                 if mobj is None:
1670                         self._downloader.trouble(u'ERROR: unable to extract title')
1671                         return
1672                 video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
1673                 video_title = sanitize_title(video_title)
1674                 simple_title = _simplify_title(video_title)
1675
1676                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1677                 if mobj is None:
1678                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1679                         return
1680                 video_uploader = mobj.group(1)
1681
1682                 try:
1683                         # Process video information
1684                         self._downloader.process_info({
1685                                 'id':           video_id.decode('utf-8'),
1686                                 'url':          video_url.decode('utf-8'),
1687                                 'uploader':     video_uploader.decode('utf-8'),
1688                                 'upload_date':  u'NA',
1689                                 'title':        video_title,
1690                                 'stitle':       simple_title,
1691                                 'ext':          video_extension.decode('utf-8'),
1692                                 'format':       u'NA',
1693                                 'player_url':   None,
1694                         })
1695                 except UnavailableVideoError:
1696                         self._downloader.trouble(u'\nERROR: unable to download video')
1697
1698
1699 class GoogleIE(InfoExtractor):
1700         """Information extractor for video.google.com."""
1701
1702         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1703         IE_NAME = u'video.google'
1704
1705         def __init__(self, downloader=None):
1706                 InfoExtractor.__init__(self, downloader)
1707
1708         def report_download_webpage(self, video_id):
1709                 """Report webpage download."""
1710                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1711
1712         def report_extraction(self, video_id):
1713                 """Report information extraction."""
1714                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1715
1716         def _real_extract(self, url):
1717                 # Extract id from URL
1718                 mobj = re.match(self._VALID_URL, url)
1719                 if mobj is None:
1720                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1721                         return
1722
1723                 # At this point we have a new video
1724                 self._downloader.increment_downloads()
1725                 video_id = mobj.group(1)
1726
1727                 video_extension = 'mp4'
1728
1729                 # Retrieve video webpage to extract further information
1730                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1731                 try:
1732                         self.report_download_webpage(video_id)
1733                         webpage = urllib2.urlopen(request).read()
1734                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1735                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1736                         return
1737
1738                 # Extract URL, uploader, and title from webpage
1739                 self.report_extraction(video_id)
1740                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1741                 if mobj is None:
1742                         video_extension = 'flv'
1743                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1744                 if mobj is None:
1745                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1746                         return
1747                 mediaURL = urllib.unquote(mobj.group(1))
1748                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1749                 mediaURL = mediaURL.replace('\\x26', '\x26')
1750
1751                 video_url = mediaURL
1752
1753                 mobj = re.search(r'<title>(.*)</title>', webpage)
1754                 if mobj is None:
1755                         self._downloader.trouble(u'ERROR: unable to extract title')
1756                         return
1757                 video_title = mobj.group(1).decode('utf-8')
1758                 video_title = sanitize_title(video_title)
1759                 simple_title = _simplify_title(video_title)
1760
1761                 # Extract video description
1762                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1763                 if mobj is None:
1764                         self._downloader.trouble(u'ERROR: unable to extract video description')
1765                         return
1766                 video_description = mobj.group(1).decode('utf-8')
1767                 if not video_description:
1768                         video_description = 'No description available.'
1769
1770                 # Extract video thumbnail
1771                 if self._downloader.params.get('forcethumbnail', False):
1772                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1773                         try:
1774                                 webpage = urllib2.urlopen(request).read()
1775                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1776                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1777                                 return
1778                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1779                         if mobj is None:
1780                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1781                                 return
1782                         video_thumbnail = mobj.group(1)
1783                 else:   # we need something to pass to process_info
1784                         video_thumbnail = ''
1785
1786                 try:
1787                         # Process video information
1788                         self._downloader.process_info({
1789                                 'id':           video_id.decode('utf-8'),
1790                                 'url':          video_url.decode('utf-8'),
1791                                 'uploader':     u'NA',
1792                                 'upload_date':  u'NA',
1793                                 'title':        video_title,
1794                                 'stitle':       simple_title,
1795                                 'ext':          video_extension.decode('utf-8'),
1796                                 'format':       u'NA',
1797                                 'player_url':   None,
1798                         })
1799                 except UnavailableVideoError:
1800                         self._downloader.trouble(u'\nERROR: unable to download video')
1801
1802
1803 class PhotobucketIE(InfoExtractor):
1804         """Information extractor for photobucket.com."""
1805
1806         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1807         IE_NAME = u'photobucket'
1808
1809         def __init__(self, downloader=None):
1810                 InfoExtractor.__init__(self, downloader)
1811
1812         def report_download_webpage(self, video_id):
1813                 """Report webpage download."""
1814                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1815
1816         def report_extraction(self, video_id):
1817                 """Report information extraction."""
1818                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1819
1820         def _real_extract(self, url):
1821                 # Extract id from URL
1822                 mobj = re.match(self._VALID_URL, url)
1823                 if mobj is None:
1824                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1825                         return
1826
1827                 # At this point we have a new video
1828                 self._downloader.increment_downloads()
1829                 video_id = mobj.group(1)
1830
1831                 video_extension = 'flv'
1832
1833                 # Retrieve video webpage to extract further information
1834                 request = urllib2.Request(url)
1835                 try:
1836                         self.report_download_webpage(video_id)
1837                         webpage = urllib2.urlopen(request).read()
1838                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1839                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1840                         return
1841
1842                 # Extract URL, uploader, and title from webpage
1843                 self.report_extraction(video_id)
1844                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1845                 if mobj is None:
1846                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1847                         return
1848                 mediaURL = urllib.unquote(mobj.group(1))
1849
1850                 video_url = mediaURL
1851
1852                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1853                 if mobj is None:
1854                         self._downloader.trouble(u'ERROR: unable to extract title')
1855                         return
1856                 video_title = mobj.group(1).decode('utf-8')
1857                 video_title = sanitize_title(video_title)
1858                 simple_title = _simplify_title(vide_title)
1859
1860                 video_uploader = mobj.group(2).decode('utf-8')
1861
1862                 try:
1863                         # Process video information
1864                         self._downloader.process_info({
1865                                 'id':           video_id.decode('utf-8'),
1866                                 'url':          video_url.decode('utf-8'),
1867                                 'uploader':     video_uploader,
1868                                 'upload_date':  u'NA',
1869                                 'title':        video_title,
1870                                 'stitle':       simple_title,
1871                                 'ext':          video_extension.decode('utf-8'),
1872                                 'format':       u'NA',
1873                                 'player_url':   None,
1874                         })
1875                 except UnavailableVideoError:
1876                         self._downloader.trouble(u'\nERROR: unable to download video')
1877
1878
1879 class YahooIE(InfoExtractor):
1880         """Information extractor for video.yahoo.com."""
1881
1882         # _VALID_URL matches all Yahoo! Video URLs
1883         # _VPAGE_URL matches only the extractable '/watch/' URLs
1884         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1885         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1886         IE_NAME = u'video.yahoo'
1887
1888         def __init__(self, downloader=None):
1889                 InfoExtractor.__init__(self, downloader)
1890
1891         def report_download_webpage(self, video_id):
1892                 """Report webpage download."""
1893                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1894
1895         def report_extraction(self, video_id):
1896                 """Report information extraction."""
1897                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1898
1899         def _real_extract(self, url, new_video=True):
1900                 # Extract ID from URL
1901                 mobj = re.match(self._VALID_URL, url)
1902                 if mobj is None:
1903                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1904                         return
1905
1906                 # At this point we have a new video
1907                 self._downloader.increment_downloads()
1908                 video_id = mobj.group(2)
1909                 video_extension = 'flv'
1910
1911                 # Rewrite valid but non-extractable URLs as
1912                 # extractable English language /watch/ URLs
1913                 if re.match(self._VPAGE_URL, url) is None:
1914                         request = urllib2.Request(url)
1915                         try:
1916                                 webpage = urllib2.urlopen(request).read()
1917                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1918                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1919                                 return
1920
1921                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1922                         if mobj is None:
1923                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1924                                 return
1925                         yahoo_id = mobj.group(1)
1926
1927                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1928                         if mobj is None:
1929                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1930                                 return
1931                         yahoo_vid = mobj.group(1)
1932
1933                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1934                         return self._real_extract(url, new_video=False)
1935
1936                 # Retrieve video webpage to extract further information
1937                 request = urllib2.Request(url)
1938                 try:
1939                         self.report_download_webpage(video_id)
1940                         webpage = urllib2.urlopen(request).read()
1941                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1942                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1943                         return
1944
1945                 # Extract uploader and title from webpage
1946                 self.report_extraction(video_id)
1947                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1948                 if mobj is None:
1949                         self._downloader.trouble(u'ERROR: unable to extract video title')
1950                         return
1951                 video_title = mobj.group(1).decode('utf-8')
1952                 simple_title = _simplify_title(video_title)
1953
1954                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1955                 if mobj is None:
1956                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1957                         return
1958                 video_uploader = mobj.group(1).decode('utf-8')
1959
1960                 # Extract video thumbnail
1961                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1962                 if mobj is None:
1963                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1964                         return
1965                 video_thumbnail = mobj.group(1).decode('utf-8')
1966
1967                 # Extract video description
1968                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1969                 if mobj is None:
1970                         self._downloader.trouble(u'ERROR: unable to extract video description')
1971                         return
1972                 video_description = mobj.group(1).decode('utf-8')
1973                 if not video_description:
1974                         video_description = 'No description available.'
1975
1976                 # Extract video height and width
1977                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1978                 if mobj is None:
1979                         self._downloader.trouble(u'ERROR: unable to extract video height')
1980                         return
1981                 yv_video_height = mobj.group(1)
1982
1983                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1984                 if mobj is None:
1985                         self._downloader.trouble(u'ERROR: unable to extract video width')
1986                         return
1987                 yv_video_width = mobj.group(1)
1988
1989                 # Retrieve video playlist to extract media URL
1990                 # I'm not completely sure what all these options are, but we
1991                 # seem to need most of them, otherwise the server sends a 401.
1992                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1993                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1994                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1995                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1996                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1997                 try:
1998                         self.report_download_webpage(video_id)
1999                         webpage = urllib2.urlopen(request).read()
2000                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2001                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2002                         return
2003
2004                 # Extract media URL from playlist XML
2005                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
2006                 if mobj is None:
2007                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
2008                         return
2009                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
2010                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
2011
2012                 try:
2013                         # Process video information
2014                         self._downloader.process_info({
2015                                 'id':           video_id.decode('utf-8'),
2016                                 'url':          video_url,
2017                                 'uploader':     video_uploader,
2018                                 'upload_date':  u'NA',
2019                                 'title':        video_title,
2020                                 'stitle':       simple_title,
2021                                 'ext':          video_extension.decode('utf-8'),
2022                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2023                                 'description':  video_description,
2024                                 'thumbnail':    video_thumbnail,
2025                                 'player_url':   None,
2026                         })
2027                 except UnavailableVideoError:
2028                         self._downloader.trouble(u'\nERROR: unable to download video')
2029
2030
2031 class VimeoIE(InfoExtractor):
2032         """Information extractor for vimeo.com."""
2033
2034         # _VALID_URL matches Vimeo URLs
2035         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2036         IE_NAME = u'vimeo'
2037
2038         def __init__(self, downloader=None):
2039                 InfoExtractor.__init__(self, downloader)
2040
2041         def report_download_webpage(self, video_id):
2042                 """Report webpage download."""
2043                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2044
2045         def report_extraction(self, video_id):
2046                 """Report information extraction."""
2047                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
2048
2049         def _real_extract(self, url, new_video=True):
2050                 # Extract ID from URL
2051                 mobj = re.match(self._VALID_URL, url)
2052                 if mobj is None:
2053                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2054                         return
2055
2056                 # At this point we have a new video
2057                 self._downloader.increment_downloads()
2058                 video_id = mobj.group(1)
2059
2060                 # Retrieve video webpage to extract further information
2061                 request = urllib2.Request(url, None, std_headers)
2062                 try:
2063                         self.report_download_webpage(video_id)
2064                         webpage = urllib2.urlopen(request).read()
2065                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2066                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2067                         return
2068
2069                 # Now we begin extracting as much information as we can from what we
2070                 # retrieved. First we extract the information common to all extractors,
2071                 # and latter we extract those that are Vimeo specific.
2072                 self.report_extraction(video_id)
2073
2074                 # Extract the config JSON
2075                 config = webpage.split(' = {config:')[1].split(',assets:')[0]
2076                 try:
2077                         config = json.loads(config)
2078                 except:
2079                         self._downloader.trouble(u'ERROR: unable to extract info section')
2080                         return
2081                 
2082                 # Extract title
2083                 video_title = config["video"]["title"]
2084                 simple_title = _simplify_title(video_title)
2085
2086                 # Extract uploader
2087                 video_uploader = config["video"]["owner"]["name"]
2088
2089                 # Extract video thumbnail
2090                 video_thumbnail = config["video"]["thumbnail"]
2091
2092                 # Extract video description
2093                 try:
2094                         lxml.etree
2095                 except NameError:
2096                         video_description = u'No description available.'
2097                         mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
2098                         if mobj is not None:
2099                                 video_description = mobj.group(1)
2100                 else:
2101                         html_parser = lxml.etree.HTMLParser()
2102                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
2103                         video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
2104                         # TODO use another parser
2105
2106                 # Extract upload date
2107                 video_upload_date = u'NA'
2108                 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
2109                 if mobj is not None:
2110                         video_upload_date = mobj.group(1)
2111
2112                 # Vimeo specific: extract request signature and timestamp
2113                 sig = config['request']['signature']
2114                 timestamp = config['request']['timestamp']
2115
2116                 # Vimeo specific: extract video codec and quality information
2117                 # TODO bind to format param
2118                 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
2119                 for codec in codecs:
2120                         if codec[0] in config["video"]["files"]:
2121                                 video_codec = codec[0]
2122                                 video_extension = codec[1]
2123                                 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
2124                                 else: quality = 'sd'
2125                                 break
2126                 else:
2127                         self._downloader.trouble(u'ERROR: no known codec found')
2128                         return
2129
2130                 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
2131                                         %(video_id, sig, timestamp, quality, video_codec.upper())
2132
2133                 try:
2134                         # Process video information
2135                         self._downloader.process_info({
2136                                 'id':           video_id,
2137                                 'url':          video_url,
2138                                 'uploader':     video_uploader,
2139                                 'upload_date':  video_upload_date,
2140                                 'title':        video_title,
2141                                 'stitle':       simple_title,
2142                                 'ext':          video_extension,
2143                                 'thumbnail':    video_thumbnail,
2144                                 'description':  video_description,
2145                                 'player_url':   None,
2146                         })
2147                 except UnavailableVideoError:
2148                         self._downloader.trouble(u'ERROR: unable to download video')
2149
2150
2151 class GenericIE(InfoExtractor):
2152         """Generic last-resort information extractor."""
2153
2154         _VALID_URL = r'.*'
2155         IE_NAME = u'generic'
2156
2157         def __init__(self, downloader=None):
2158                 InfoExtractor.__init__(self, downloader)
2159
2160         def report_download_webpage(self, video_id):
2161                 """Report webpage download."""
2162                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2163                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2164
2165         def report_extraction(self, video_id):
2166                 """Report information extraction."""
2167                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2168
2169         def _real_extract(self, url):
2170                 # At this point we have a new video
2171                 self._downloader.increment_downloads()
2172
2173                 video_id = url.split('/')[-1]
2174                 request = urllib2.Request(url)
2175                 try:
2176                         self.report_download_webpage(video_id)
2177                         webpage = urllib2.urlopen(request).read()
2178                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2179                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2180                         return
2181                 except ValueError, err:
2182                         # since this is the last-resort InfoExtractor, if
2183                         # this error is thrown, it'll be thrown here
2184                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2185                         return
2186
2187                 self.report_extraction(video_id)
2188                 # Start with something easy: JW Player in SWFObject
2189                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2190                 if mobj is None:
2191                         # Broaden the search a little bit
2192                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2193                 if mobj is None:
2194                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2195                         return
2196
2197                 # It's possible that one of the regexes
2198                 # matched, but returned an empty group:
2199                 if mobj.group(1) is None:
2200                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2201                         return
2202
2203                 video_url = urllib.unquote(mobj.group(1))
2204                 video_id = os.path.basename(video_url)
2205
2206                 # here's a fun little line of code for you:
2207                 video_extension = os.path.splitext(video_id)[1][1:]
2208                 video_id = os.path.splitext(video_id)[0]
2209
2210                 # it's tempting to parse this further, but you would
2211                 # have to take into account all the variations like
2212                 #   Video Title - Site Name
2213                 #   Site Name | Video Title
2214                 #   Video Title - Tagline | Site Name
2215                 # and so on and so forth; it's just not practical
2216                 mobj = re.search(r'<title>(.*)</title>', webpage)
2217                 if mobj is None:
2218                         self._downloader.trouble(u'ERROR: unable to extract title')
2219                         return
2220                 video_title = mobj.group(1).decode('utf-8')
2221                 video_title = sanitize_title(video_title)
2222                 simple_title = _simplify_title(video_title)
2223
2224                 # video uploader is domain name
2225                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2226                 if mobj is None:
2227                         self._downloader.trouble(u'ERROR: unable to extract title')
2228                         return
2229                 video_uploader = mobj.group(1).decode('utf-8')
2230
2231                 try:
2232                         # Process video information
2233                         self._downloader.process_info({
2234                                 'id':           video_id.decode('utf-8'),
2235                                 'url':          video_url.decode('utf-8'),
2236                                 'uploader':     video_uploader,
2237                                 'upload_date':  u'NA',
2238                                 'title':        video_title,
2239                                 'stitle':       simple_title,
2240                                 'ext':          video_extension.decode('utf-8'),
2241                                 'format':       u'NA',
2242                                 'player_url':   None,
2243                         })
2244                 except UnavailableVideoError, err:
2245                         self._downloader.trouble(u'\nERROR: unable to download video')
2246
2247
2248 class YoutubeSearchIE(InfoExtractor):
2249         """Information Extractor for YouTube search queries."""
2250         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2251         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2252         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2253         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2254         _youtube_ie = None
2255         _max_youtube_results = 1000
2256         IE_NAME = u'youtube:search'
2257
2258         def __init__(self, youtube_ie, downloader=None):
2259                 InfoExtractor.__init__(self, downloader)
2260                 self._youtube_ie = youtube_ie
2261
2262         def report_download_page(self, query, pagenum):
2263                 """Report attempt to download playlist page with given number."""
2264                 query = query.decode(preferredencoding())
2265                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2266
2267         def _real_initialize(self):
2268                 self._youtube_ie.initialize()
2269
2270         def _real_extract(self, query):
2271                 mobj = re.match(self._VALID_URL, query)
2272                 if mobj is None:
2273                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2274                         return
2275
2276                 prefix, query = query.split(':')
2277                 prefix = prefix[8:]
2278                 query = query.encode('utf-8')
2279                 if prefix == '':
2280                         self._download_n_results(query, 1)
2281                         return
2282                 elif prefix == 'all':
2283                         self._download_n_results(query, self._max_youtube_results)
2284                         return
2285                 else:
2286                         try:
2287                                 n = long(prefix)
2288                                 if n <= 0:
2289                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2290                                         return
2291                                 elif n > self._max_youtube_results:
2292                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2293                                         n = self._max_youtube_results
2294                                 self._download_n_results(query, n)
2295                                 return
2296                         except ValueError: # parsing prefix as integer fails
2297                                 self._download_n_results(query, 1)
2298                                 return
2299
2300         def _download_n_results(self, query, n):
2301                 """Downloads a specified number of results for a query"""
2302
2303                 video_ids = []
2304                 already_seen = set()
2305                 pagenum = 1
2306
2307                 while True:
2308                         self.report_download_page(query, pagenum)
2309                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2310                         request = urllib2.Request(result_url)
2311                         try:
2312                                 page = urllib2.urlopen(request).read()
2313                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2314                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2315                                 return
2316
2317                         # Extract video identifiers
2318                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2319                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2320                                 if video_id not in already_seen:
2321                                         video_ids.append(video_id)
2322                                         already_seen.add(video_id)
2323                                         if len(video_ids) == n:
2324                                                 # Specified n videos reached
2325                                                 for id in video_ids:
2326                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2327                                                 return
2328
2329                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2330                                 for id in video_ids:
2331                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2332                                 return
2333
2334                         pagenum = pagenum + 1
2335
2336
2337 class GoogleSearchIE(InfoExtractor):
2338         """Information Extractor for Google Video search queries."""
2339         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2340         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2341         _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
2342         _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
2343         _google_ie = None
2344         _max_google_results = 1000
2345         IE_NAME = u'video.google:search'
2346
2347         def __init__(self, google_ie, downloader=None):
2348                 InfoExtractor.__init__(self, downloader)
2349                 self._google_ie = google_ie
2350
2351         def report_download_page(self, query, pagenum):
2352                 """Report attempt to download playlist page with given number."""
2353                 query = query.decode(preferredencoding())
2354                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2355
2356         def _real_initialize(self):
2357                 self._google_ie.initialize()
2358
2359         def _real_extract(self, query):
2360                 mobj = re.match(self._VALID_URL, query)
2361                 if mobj is None:
2362                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2363                         return
2364
2365                 prefix, query = query.split(':')
2366                 prefix = prefix[8:]
2367                 query = query.encode('utf-8')
2368                 if prefix == '':
2369                         self._download_n_results(query, 1)
2370                         return
2371                 elif prefix == 'all':
2372                         self._download_n_results(query, self._max_google_results)
2373                         return
2374                 else:
2375                         try:
2376                                 n = long(prefix)
2377                                 if n <= 0:
2378                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2379                                         return
2380                                 elif n > self._max_google_results:
2381                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2382                                         n = self._max_google_results
2383                                 self._download_n_results(query, n)
2384                                 return
2385                         except ValueError: # parsing prefix as integer fails
2386                                 self._download_n_results(query, 1)
2387                                 return
2388
2389         def _download_n_results(self, query, n):
2390                 """Downloads a specified number of results for a query"""
2391
2392                 video_ids = []
2393                 pagenum = 0
2394
2395                 while True:
2396                         self.report_download_page(query, pagenum)
2397                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
2398                         request = urllib2.Request(result_url)
2399                         try:
2400                                 page = urllib2.urlopen(request).read()
2401                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2402                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2403                                 return
2404
2405                         # Extract video identifiers
2406                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2407                                 video_id = mobj.group(1)
2408                                 if video_id not in video_ids:
2409                                         video_ids.append(video_id)
2410                                         if len(video_ids) == n:
2411                                                 # Specified n videos reached
2412                                                 for id in video_ids:
2413                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2414                                                 return
2415
2416                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2417                                 for id in video_ids:
2418                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2419                                 return
2420
2421                         pagenum = pagenum + 1
2422
2423
2424 class YahooSearchIE(InfoExtractor):
2425         """Information Extractor for Yahoo! Video search queries."""
2426         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2427         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2428         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2429         _MORE_PAGES_INDICATOR = r'\s*Next'
2430         _yahoo_ie = None
2431         _max_yahoo_results = 1000
2432         IE_NAME = u'video.yahoo:search'
2433
2434         def __init__(self, yahoo_ie, downloader=None):
2435                 InfoExtractor.__init__(self, downloader)
2436                 self._yahoo_ie = yahoo_ie
2437
2438         def report_download_page(self, query, pagenum):
2439                 """Report attempt to download playlist page with given number."""
2440                 query = query.decode(preferredencoding())
2441                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2442
2443         def _real_initialize(self):
2444                 self._yahoo_ie.initialize()
2445
2446         def _real_extract(self, query):
2447                 mobj = re.match(self._VALID_URL, query)
2448                 if mobj is None:
2449                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2450                         return
2451
2452                 prefix, query = query.split(':')
2453                 prefix = prefix[8:]
2454                 query = query.encode('utf-8')
2455                 if prefix == '':
2456                         self._download_n_results(query, 1)
2457                         return
2458                 elif prefix == 'all':
2459                         self._download_n_results(query, self._max_yahoo_results)
2460                         return
2461                 else:
2462                         try:
2463                                 n = long(prefix)
2464                                 if n <= 0:
2465                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2466                                         return
2467                                 elif n > self._max_yahoo_results:
2468                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2469                                         n = self._max_yahoo_results
2470                                 self._download_n_results(query, n)
2471                                 return
2472                         except ValueError: # parsing prefix as integer fails
2473                                 self._download_n_results(query, 1)
2474                                 return
2475
2476         def _download_n_results(self, query, n):
2477                 """Downloads a specified number of results for a query"""
2478
2479                 video_ids = []
2480                 already_seen = set()
2481                 pagenum = 1
2482
2483                 while True:
2484                         self.report_download_page(query, pagenum)
2485                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2486                         request = urllib2.Request(result_url)
2487                         try:
2488                                 page = urllib2.urlopen(request).read()
2489                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2490                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2491                                 return
2492
2493                         # Extract video identifiers
2494                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2495                                 video_id = mobj.group(1)
2496                                 if video_id not in already_seen:
2497                                         video_ids.append(video_id)
2498                                         already_seen.add(video_id)
2499                                         if len(video_ids) == n:
2500                                                 # Specified n videos reached
2501                                                 for id in video_ids:
2502                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2503                                                 return
2504
2505                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2506                                 for id in video_ids:
2507                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2508                                 return
2509
2510                         pagenum = pagenum + 1
2511
2512
2513 class YoutubePlaylistIE(InfoExtractor):
2514         """Information Extractor for YouTube playlists."""
2515
2516         _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2517         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2518         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2519         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2520         _youtube_ie = None
2521         IE_NAME = u'youtube:playlist'
2522
2523         def __init__(self, youtube_ie, downloader=None):
2524                 InfoExtractor.__init__(self, downloader)
2525                 self._youtube_ie = youtube_ie
2526
2527         def report_download_page(self, playlist_id, pagenum):
2528                 """Report attempt to download playlist page with given number."""
2529                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2530
2531         def _real_initialize(self):
2532                 self._youtube_ie.initialize()
2533
2534         def _real_extract(self, url):
2535                 # Extract playlist id
2536                 mobj = re.match(self._VALID_URL, url)
2537                 if mobj is None:
2538                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2539                         return
2540
2541                 # Single video case
2542                 if mobj.group(3) is not None:
2543                         self._youtube_ie.extract(mobj.group(3))
2544                         return
2545
2546                 # Download playlist pages
2547                 # prefix is 'p' as default for playlists but there are other types that need extra care
2548                 playlist_prefix = mobj.group(1)
2549                 if playlist_prefix == 'a':
2550                         playlist_access = 'artist'
2551                 else:
2552                         playlist_prefix = 'p'
2553                         playlist_access = 'view_play_list'
2554                 playlist_id = mobj.group(2)
2555                 video_ids = []
2556                 pagenum = 1
2557
2558                 while True:
2559                         self.report_download_page(playlist_id, pagenum)
2560                         url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2561                         request = urllib2.Request(url)
2562                         try:
2563                                 page = urllib2.urlopen(request).read()
2564                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2565                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2566                                 return
2567
2568                         # Extract video identifiers
2569                         ids_in_page = []
2570                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2571                                 if mobj.group(1) not in ids_in_page:
2572                                         ids_in_page.append(mobj.group(1))
2573                         video_ids.extend(ids_in_page)
2574
2575                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2576                                 break
2577                         pagenum = pagenum + 1
2578
2579                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2580                 playlistend = self._downloader.params.get('playlistend', -1)
2581                 video_ids = video_ids[playliststart:playlistend]
2582
2583                 for id in video_ids:
2584                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2585                 return
2586
2587
2588 class YoutubeUserIE(InfoExtractor):
2589         """Information Extractor for YouTube users."""
2590
2591         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2592         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2593         _GDATA_PAGE_SIZE = 50
2594         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2595         _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2596         _youtube_ie = None
2597         IE_NAME = u'youtube:user'
2598
2599         def __init__(self, youtube_ie, downloader=None):
2600                 InfoExtractor.__init__(self, downloader)
2601                 self._youtube_ie = youtube_ie
2602
2603         def report_download_page(self, username, start_index):
2604                 """Report attempt to download user page."""
2605                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2606                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2607
2608         def _real_initialize(self):
2609                 self._youtube_ie.initialize()
2610
2611         def _real_extract(self, url):
2612                 # Extract username
2613                 mobj = re.match(self._VALID_URL, url)
2614                 if mobj is None:
2615                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2616                         return
2617
2618                 username = mobj.group(1)
2619
2620                 # Download video ids using YouTube Data API. Result size per
2621                 # query is limited (currently to 50 videos) so we need to query
2622                 # page by page until there are no video ids - it means we got
2623                 # all of them.
2624
2625                 video_ids = []
2626                 pagenum = 0
2627
2628                 while True:
2629                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2630                         self.report_download_page(username, start_index)
2631
2632                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2633
2634                         try:
2635                                 page = urllib2.urlopen(request).read()
2636                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2637                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2638                                 return
2639
2640                         # Extract video identifiers
2641                         ids_in_page = []
2642
2643                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2644                                 if mobj.group(1) not in ids_in_page:
2645                                         ids_in_page.append(mobj.group(1))
2646
2647                         video_ids.extend(ids_in_page)
2648
2649                         # A little optimization - if current page is not
2650                         # "full", ie. does not contain PAGE_SIZE video ids then
2651                         # we can assume that this page is the last one - there
2652                         # are no more ids on further pages - no need to query
2653                         # again.
2654
2655                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2656                                 break
2657
2658                         pagenum += 1
2659
2660                 all_ids_count = len(video_ids)
2661                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2662                 playlistend = self._downloader.params.get('playlistend', -1)
2663
2664                 if playlistend == -1:
2665                         video_ids = video_ids[playliststart:]
2666                 else:
2667                         video_ids = video_ids[playliststart:playlistend]
2668
2669                 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2670                                 (username, all_ids_count, len(video_ids)))
2671
2672                 for video_id in video_ids:
2673                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2674
2675
2676 class DepositFilesIE(InfoExtractor):
2677         """Information extractor for depositfiles.com"""
2678
2679         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2680         IE_NAME = u'DepositFiles'
2681
2682         def __init__(self, downloader=None):
2683                 InfoExtractor.__init__(self, downloader)
2684
2685         def report_download_webpage(self, file_id):
2686                 """Report webpage download."""
2687                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2688
2689         def report_extraction(self, file_id):
2690                 """Report information extraction."""
2691                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2692
2693         def _real_extract(self, url):
2694                 # At this point we have a new file
2695                 self._downloader.increment_downloads()
2696
2697                 file_id = url.split('/')[-1]
2698                 # Rebuild url in english locale
2699                 url = 'http://depositfiles.com/en/files/' + file_id
2700
2701                 # Retrieve file webpage with 'Free download' button pressed
2702                 free_download_indication = { 'gateway_result' : '1' }
2703                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2704                 try:
2705                         self.report_download_webpage(file_id)
2706                         webpage = urllib2.urlopen(request).read()
2707                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2708                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2709                         return
2710
2711                 # Search for the real file URL
2712                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2713                 if (mobj is None) or (mobj.group(1) is None):
2714                         # Try to figure out reason of the error.
2715                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2716                         if (mobj is not None) and (mobj.group(1) is not None):
2717                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2718                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2719                         else:
2720                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2721                         return
2722
2723                 file_url = mobj.group(1)
2724                 file_extension = os.path.splitext(file_url)[1][1:]
2725
2726                 # Search for file title
2727                 mobj = re.search(r'<b title="(.*?)">', webpage)
2728                 if mobj is None:
2729                         self._downloader.trouble(u'ERROR: unable to extract title')
2730                         return
2731                 file_title = mobj.group(1).decode('utf-8')
2732
2733                 try:
2734                         # Process file information
2735                         self._downloader.process_info({
2736                                 'id':           file_id.decode('utf-8'),
2737                                 'url':          file_url.decode('utf-8'),
2738                                 'uploader':     u'NA',
2739                                 'upload_date':  u'NA',
2740                                 'title':        file_title,
2741                                 'stitle':       file_title,
2742                                 'ext':          file_extension.decode('utf-8'),
2743                                 'format':       u'NA',
2744                                 'player_url':   None,
2745                         })
2746                 except UnavailableVideoError, err:
2747                         self._downloader.trouble(u'ERROR: unable to download file')
2748
2749
2750 class FacebookIE(InfoExtractor):
2751         """Information Extractor for Facebook"""
2752
2753         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2754         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2755         _NETRC_MACHINE = 'facebook'
2756         _available_formats = ['video', 'highqual', 'lowqual']
2757         _video_extensions = {
2758                 'video': 'mp4',
2759                 'highqual': 'mp4',
2760                 'lowqual': 'mp4',
2761         }
2762         IE_NAME = u'facebook'
2763
2764         def __init__(self, downloader=None):
2765                 InfoExtractor.__init__(self, downloader)
2766
2767         def _reporter(self, message):
2768                 """Add header and report message."""
2769                 self._downloader.to_screen(u'[facebook] %s' % message)
2770
2771         def report_login(self):
2772                 """Report attempt to log in."""
2773                 self._reporter(u'Logging in')
2774
2775         def report_video_webpage_download(self, video_id):
2776                 """Report attempt to download video webpage."""
2777                 self._reporter(u'%s: Downloading video webpage' % video_id)
2778
2779         def report_information_extraction(self, video_id):
2780                 """Report attempt to extract video information."""
2781                 self._reporter(u'%s: Extracting video information' % video_id)
2782
2783         def _parse_page(self, video_webpage):
2784                 """Extract video information from page"""
2785                 # General data
2786                 data = {'title': r'\("video_title", "(.*?)"\)',
2787                         'description': r'<div class="datawrap">(.*?)</div>',
2788                         'owner': r'\("video_owner_name", "(.*?)"\)',
2789                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2790                         }
2791                 video_info = {}
2792                 for piece in data.keys():
2793                         mobj = re.search(data[piece], video_webpage)
2794                         if mobj is not None:
2795                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2796
2797                 # Video urls
2798                 video_urls = {}
2799                 for fmt in self._available_formats:
2800                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2801                         if mobj is not None:
2802                                 # URL is in a Javascript segment inside an escaped Unicode format within
2803                                 # the generally utf-8 page
2804                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2805                 video_info['video_urls'] = video_urls
2806
2807                 return video_info
2808
2809         def _real_initialize(self):
2810                 if self._downloader is None:
2811                         return
2812
2813                 useremail = None
2814                 password = None
2815                 downloader_params = self._downloader.params
2816
2817                 # Attempt to use provided username and password or .netrc data
2818                 if downloader_params.get('username', None) is not None:
2819                         useremail = downloader_params['username']
2820                         password = downloader_params['password']
2821                 elif downloader_params.get('usenetrc', False):
2822                         try:
2823                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2824                                 if info is not None:
2825                                         useremail = info[0]
2826                                         password = info[2]
2827                                 else:
2828                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2829                         except (IOError, netrc.NetrcParseError), err:
2830                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2831                                 return
2832
2833                 if useremail is None:
2834                         return
2835
2836                 # Log in
2837                 login_form = {
2838                         'email': useremail,
2839                         'pass': password,
2840                         'login': 'Log+In'
2841                         }
2842                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2843                 try:
2844                         self.report_login()
2845                         login_results = urllib2.urlopen(request).read()
2846                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2847                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2848                                 return
2849                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2850                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2851                         return
2852
2853         def _real_extract(self, url):
2854                 mobj = re.match(self._VALID_URL, url)
2855                 if mobj is None:
2856                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2857                         return
2858                 video_id = mobj.group('ID')
2859
2860                 # Get video webpage
2861                 self.report_video_webpage_download(video_id)
2862                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2863                 try:
2864                         page = urllib2.urlopen(request)
2865                         video_webpage = page.read()
2866                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2867                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2868                         return
2869
2870                 # Start extracting information
2871                 self.report_information_extraction(video_id)
2872
2873                 # Extract information
2874                 video_info = self._parse_page(video_webpage)
2875
2876                 # uploader
2877                 if 'owner' not in video_info:
2878                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2879                         return
2880                 video_uploader = video_info['owner']
2881
2882                 # title
2883                 if 'title' not in video_info:
2884                         self._downloader.trouble(u'ERROR: unable to extract video title')
2885                         return
2886                 video_title = video_info['title']
2887                 video_title = video_title.decode('utf-8')
2888                 video_title = sanitize_title(video_title)
2889
2890                 simple_title = _simplify_title(video_title)
2891
2892                 # thumbnail image
2893                 if 'thumbnail' not in video_info:
2894                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2895                         video_thumbnail = ''
2896                 else:
2897                         video_thumbnail = video_info['thumbnail']
2898
2899                 # upload date
2900                 upload_date = u'NA'
2901                 if 'upload_date' in video_info:
2902                         upload_time = video_info['upload_date']
2903                         timetuple = email.utils.parsedate_tz(upload_time)
2904                         if timetuple is not None:
2905                                 try:
2906                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2907                                 except:
2908                                         pass
2909
2910                 # description
2911                 video_description = video_info.get('description', 'No description available.')
2912
2913                 url_map = video_info['video_urls']
2914                 if len(url_map.keys()) > 0:
2915                         # Decide which formats to download
2916                         req_format = self._downloader.params.get('format', None)
2917                         format_limit = self._downloader.params.get('format_limit', None)
2918
2919                         if format_limit is not None and format_limit in self._available_formats:
2920                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2921                         else:
2922                                 format_list = self._available_formats
2923                         existing_formats = [x for x in format_list if x in url_map]
2924                         if len(existing_formats) == 0:
2925                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2926                                 return
2927                         if req_format is None:
2928                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2929                         elif req_format == 'worst':
2930                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2931                         elif req_format == '-1':
2932                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2933                         else:
2934                                 # Specific format
2935                                 if req_format not in url_map:
2936                                         self._downloader.trouble(u'ERROR: requested format not available')
2937                                         return
2938                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2939
2940                 for format_param, video_real_url in video_url_list:
2941
2942                         # At this point we have a new video
2943                         self._downloader.increment_downloads()
2944
2945                         # Extension
2946                         video_extension = self._video_extensions.get(format_param, 'mp4')
2947
2948                         try:
2949                                 # Process video information
2950                                 self._downloader.process_info({
2951                                         'id':           video_id.decode('utf-8'),
2952                                         'url':          video_real_url.decode('utf-8'),
2953                                         'uploader':     video_uploader.decode('utf-8'),
2954                                         'upload_date':  upload_date,
2955                                         'title':        video_title,
2956                                         'stitle':       simple_title,
2957                                         'ext':          video_extension.decode('utf-8'),
2958                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2959                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2960                                         'description':  video_description.decode('utf-8'),
2961                                         'player_url':   None,
2962                                 })
2963                         except UnavailableVideoError, err:
2964                                 self._downloader.trouble(u'\nERROR: unable to download video')
2965
2966 class BlipTVIE(InfoExtractor):
2967         """Information extractor for blip.tv"""
2968
2969         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2970         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2971         IE_NAME = u'blip.tv'
2972
2973         def report_extraction(self, file_id):
2974                 """Report information extraction."""
2975                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2976
2977         def report_direct_download(self, title):
2978                 """Report information extraction."""
2979                 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2980
2981         def _real_extract(self, url):
2982                 mobj = re.match(self._VALID_URL, url)
2983                 if mobj is None:
2984                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2985                         return
2986
2987                 if '?' in url:
2988                         cchar = '&'
2989                 else:
2990                         cchar = '?'
2991                 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2992                 request = urllib2.Request(json_url)
2993                 self.report_extraction(mobj.group(1))
2994                 info = None
2995                 try:
2996                         urlh = urllib2.urlopen(request)
2997                         if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2998                                 basename = url.split('/')[-1]
2999                                 title,ext = os.path.splitext(basename)
3000                                 title = title.decode('UTF-8')
3001                                 ext = ext.replace('.', '')
3002                                 self.report_direct_download(title)
3003                                 info = {
3004                                         'id': title,
3005                                         'url': url,
3006                                         'title': title,
3007                                         'stitle': _simplify_title(title),
3008                                         'ext': ext,
3009                                         'urlhandle': urlh
3010                                 }
3011                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3012                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
3013                         return
3014                 if info is None: # Regular URL
3015                         try:
3016                                 json_code = urlh.read()
3017                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3018                                 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
3019                                 return
3020
3021                         try:
3022                                 json_data = json.loads(json_code)
3023                                 if 'Post' in json_data:
3024                                         data = json_data['Post']
3025                                 else:
3026                                         data = json_data
3027         
3028                                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
3029                                 video_url = data['media']['url']
3030                                 umobj = re.match(self._URL_EXT, video_url)
3031                                 if umobj is None:
3032                                         raise ValueError('Can not determine filename extension')
3033                                 ext = umobj.group(1)
3034         
3035                                 info = {
3036                                         'id': data['item_id'],
3037                                         'url': video_url,
3038                                         'uploader': data['display_name'],
3039                                         'upload_date': upload_date,
3040                                         'title': data['title'],
3041                                         'stitle': _simplify_title(data['title']),
3042                                         'ext': ext,
3043                                         'format': data['media']['mimeType'],
3044                                         'thumbnail': data['thumbnailUrl'],
3045                                         'description': data['description'],
3046                                         'player_url': data['embedUrl']
3047                                 }
3048                         except (ValueError,KeyError), err:
3049                                 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
3050                                 return
3051
3052                 self._downloader.increment_downloads()
3053
3054                 try:
3055                         self._downloader.process_info(info)
3056                 except UnavailableVideoError, err:
3057                         self._downloader.trouble(u'\nERROR: unable to download video')
3058
3059
3060 class MyVideoIE(InfoExtractor):
3061         """Information Extractor for myvideo.de."""
3062
3063         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3064         IE_NAME = u'myvideo'
3065
3066         def __init__(self, downloader=None):
3067                 InfoExtractor.__init__(self, downloader)
3068         
3069         def report_download_webpage(self, video_id):
3070                 """Report webpage download."""
3071                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3072
3073         def report_extraction(self, video_id):
3074                 """Report information extraction."""
3075                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3076
3077         def _real_extract(self,url):
3078                 mobj = re.match(self._VALID_URL, url)
3079                 if mobj is None:
3080                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
3081                         return
3082
3083                 video_id = mobj.group(1)
3084
3085                 # Get video webpage
3086                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3087                 try:
3088                         self.report_download_webpage(video_id)
3089                         webpage = urllib2.urlopen(request).read()
3090                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3091                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3092                         return
3093
3094                 self.report_extraction(video_id)
3095                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3096                                  webpage)
3097                 if mobj is None:
3098                         self._downloader.trouble(u'ERROR: unable to extract media URL')
3099                         return
3100                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3101
3102                 mobj = re.search('<title>([^<]+)</title>', webpage)
3103                 if mobj is None:
3104                         self._downloader.trouble(u'ERROR: unable to extract title')
3105                         return
3106
3107                 video_title = mobj.group(1)
3108                 video_title = sanitize_title(video_title)
3109
3110                 simple_title = _simplify_title(video_title)
3111
3112                 try:
3113                         self._downloader.process_info({
3114                                 'id':           video_id,
3115                                 'url':          video_url,
3116                                 'uploader':     u'NA',
3117                                 'upload_date':  u'NA',
3118                                 'title':        video_title,
3119                                 'stitle':       simple_title,
3120                                 'ext':          u'flv',
3121                                 'format':       u'NA',
3122                                 'player_url':   None,
3123                         })
3124                 except UnavailableVideoError:
3125                         self._downloader.trouble(u'\nERROR: Unable to download video')
3126
3127 class ComedyCentralIE(InfoExtractor):
3128         """Information extractor for The Daily Show and Colbert Report """
3129
3130         _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3131         IE_NAME = u'comedycentral'
3132
3133         def report_extraction(self, episode_id):
3134                 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3135         
3136         def report_config_download(self, episode_id):
3137                 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3138
3139         def report_index_download(self, episode_id):
3140                 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3141
3142         def report_player_url(self, episode_id):
3143                 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3144
3145         def _real_extract(self, url):
3146                 mobj = re.match(self._VALID_URL, url)
3147                 if mobj is None:
3148                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3149                         return
3150
3151                 if mobj.group('shortname'):
3152                         if mobj.group('shortname') in ('tds', 'thedailyshow'):
3153                                 url = u'http://www.thedailyshow.com/full-episodes/'
3154                         else:
3155                                 url = u'http://www.colbertnation.com/full-episodes/'
3156                         mobj = re.match(self._VALID_URL, url)
3157                         assert mobj is not None
3158
3159                 dlNewest = not mobj.group('episode')
3160                 if dlNewest:
3161                         epTitle = mobj.group('showname')
3162                 else:
3163                         epTitle = mobj.group('episode')
3164
3165                 req = urllib2.Request(url)
3166                 self.report_extraction(epTitle)
3167                 try:
3168                         htmlHandle = urllib2.urlopen(req)
3169                         html = htmlHandle.read()
3170                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3171                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3172                         return
3173                 if dlNewest:
3174                         url = htmlHandle.geturl()
3175                         mobj = re.match(self._VALID_URL, url)
3176                         if mobj is None:
3177                                 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3178                                 return
3179                         if mobj.group('episode') == '':
3180                                 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3181                                 return
3182                         epTitle = mobj.group('episode')
3183
3184                 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
3185                 if len(mMovieParams) == 0:
3186                         self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3187                         return
3188
3189                 playerUrl_raw = mMovieParams[0][0]
3190                 self.report_player_url(epTitle)
3191                 try:
3192                         urlHandle = urllib2.urlopen(playerUrl_raw)
3193                         playerUrl = urlHandle.geturl()
3194                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3195                         self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
3196                         return
3197
3198                 uri = mMovieParams[0][1]
3199                 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3200                 self.report_index_download(epTitle)
3201                 try:
3202                         indexXml = urllib2.urlopen(indexUrl).read()
3203                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3204                         self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
3205                         return
3206
3207                 idoc = xml.etree.ElementTree.fromstring(indexXml)
3208                 itemEls = idoc.findall('.//item')
3209                 for itemEl in itemEls:
3210                         mediaId = itemEl.findall('./guid')[0].text
3211                         shortMediaId = mediaId.split(':')[-1]
3212                         showId = mediaId.split(':')[-2].replace('.com', '')
3213                         officialTitle = itemEl.findall('./title')[0].text
3214                         officialDate = itemEl.findall('./pubDate')[0].text
3215
3216                         configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3217                                                 urllib.urlencode({'uri': mediaId}))
3218                         configReq = urllib2.Request(configUrl)
3219                         self.report_config_download(epTitle)
3220                         try:
3221                                 configXml = urllib2.urlopen(configReq).read()
3222                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3223                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3224                                 return
3225
3226                         cdoc = xml.etree.ElementTree.fromstring(configXml)
3227                         turls = []
3228                         for rendition in cdoc.findall('.//rendition'):
3229                                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3230                                 turls.append(finfo)
3231
3232                         if len(turls) == 0:
3233                                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3234                                 continue
3235
3236                         # For now, just pick the highest bitrate
3237                         format,video_url = turls[-1]
3238
3239                         self._downloader.increment_downloads()
3240
3241                         effTitle = showId + u'-' + epTitle
3242                         info = {
3243                                 'id': shortMediaId,
3244                                 'url': video_url,
3245                                 'uploader': showId,
3246                                 'upload_date': officialDate,
3247                                 'title': effTitle,
3248                                 'stitle': _simplify_title(effTitle),
3249                                 'ext': 'mp4',
3250                                 'format': format,
3251                                 'thumbnail': None,
3252                                 'description': officialTitle,
3253                                 'player_url': playerUrl
3254                         }
3255
3256                         try:
3257                                 self._downloader.process_info(info)
3258                         except UnavailableVideoError, err:
3259                                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
3260                                 continue
3261
3262
3263 class EscapistIE(InfoExtractor):
3264         """Information extractor for The Escapist """
3265
3266         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3267         IE_NAME = u'escapist'
3268
3269         def report_extraction(self, showName):
3270                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3271
3272         def report_config_download(self, showName):
3273                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3274
3275         def _real_extract(self, url):
3276                 htmlParser = HTMLParser.HTMLParser()
3277
3278                 mobj = re.match(self._VALID_URL, url)
3279                 if mobj is None:
3280                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3281                         return
3282                 showName = mobj.group('showname')
3283                 videoId = mobj.group('episode')
3284
3285                 self.report_extraction(showName)
3286                 try:
3287                         webPage = urllib2.urlopen(url).read()
3288                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3289                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3290                         return
3291
3292                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3293                 description = htmlParser.unescape(descMatch.group(1))
3294                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3295                 imgUrl = htmlParser.unescape(imgMatch.group(1))
3296                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3297                 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3298                 configUrlMatch = re.search('config=(.*)$', playerUrl)
3299                 configUrl = urllib2.unquote(configUrlMatch.group(1))
3300
3301                 self.report_config_download(showName)
3302                 try:
3303                         configJSON = urllib2.urlopen(configUrl).read()
3304                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3305                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3306                         return
3307
3308                 # Technically, it's JavaScript, not JSON
3309                 configJSON = configJSON.replace("'", '"')
3310
3311                 try:
3312                         config = json.loads(configJSON)
3313                 except (ValueError,), err:
3314                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3315                         return
3316
3317                 playlist = config['playlist']
3318                 videoUrl = playlist[1]['url']
3319
3320                 self._downloader.increment_downloads()
3321                 info = {
3322                         'id': videoId,
3323                         'url': videoUrl,
3324                         'uploader': showName,
3325                         'upload_date': None,
3326                         'title': showName,
3327                         'stitle': _simplify_title(showName),
3328                         'ext': 'flv',
3329                         'format': 'flv',
3330                         'thumbnail': imgUrl,
3331                         'description': description,
3332                         'player_url': playerUrl,
3333                 }
3334
3335                 try:
3336                         self._downloader.process_info(info)
3337                 except UnavailableVideoError, err:
3338                         self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3339
3340
3341 class CollegeHumorIE(InfoExtractor):
3342         """Information extractor for collegehumor.com"""
3343
3344         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3345         IE_NAME = u'collegehumor'
3346
3347         def report_webpage(self, video_id):
3348                 """Report information extraction."""
3349                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3350
3351         def report_extraction(self, video_id):
3352                 """Report information extraction."""
3353                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3354
3355         def _real_extract(self, url):
3356                 htmlParser = HTMLParser.HTMLParser()
3357
3358                 mobj = re.match(self._VALID_URL, url)
3359                 if mobj is None:
3360                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3361                         return
3362                 video_id = mobj.group('videoid')
3363
3364                 self.report_webpage(video_id)
3365                 request = urllib2.Request(url)
3366                 try:
3367                         webpage = urllib2.urlopen(request).read()
3368                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3369                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3370                         return
3371
3372                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3373                 if m is None:
3374                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3375                         return
3376                 internal_video_id = m.group('internalvideoid')
3377
3378                 info = {
3379                         'id': video_id,
3380                         'internal_id': internal_video_id,
3381                 }
3382
3383                 self.report_extraction(video_id)
3384                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3385                 try:
3386                         metaXml = urllib2.urlopen(xmlUrl).read()
3387                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3388                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3389                         return
3390
3391                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3392                 try:
3393                         videoNode = mdoc.findall('./video')[0]
3394                         info['description'] = videoNode.findall('./description')[0].text
3395                         info['title'] = videoNode.findall('./caption')[0].text
3396                         info['stitle'] = _simplify_title(info['title'])
3397                         info['url'] = videoNode.findall('./file')[0].text
3398                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3399                         info['ext'] = info['url'].rpartition('.')[2]
3400                         info['format'] = info['ext']
3401                 except IndexError:
3402                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3403                         return
3404
3405                 self._downloader.increment_downloads()
3406
3407                 try:
3408                         self._downloader.process_info(info)
3409                 except UnavailableVideoError, err:
3410                         self._downloader.trouble(u'\nERROR: unable to download video')
3411
3412
3413 class XVideosIE(InfoExtractor):
3414         """Information extractor for xvideos.com"""
3415
3416         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3417         IE_NAME = u'xvideos'
3418
3419         def report_webpage(self, video_id):
3420                 """Report information extraction."""
3421                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3422
3423         def report_extraction(self, video_id):
3424                 """Report information extraction."""
3425                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3426
3427         def _real_extract(self, url):
3428                 htmlParser = HTMLParser.HTMLParser()
3429
3430                 mobj = re.match(self._VALID_URL, url)
3431                 if mobj is None:
3432                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3433                         return
3434                 video_id = mobj.group(1).decode('utf-8')
3435
3436                 self.report_webpage(video_id)
3437
3438                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3439                 try:
3440                         webpage = urllib2.urlopen(request).read()
3441                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3442                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3443                         return
3444
3445                 self.report_extraction(video_id)
3446
3447
3448                 # Extract video URL
3449                 mobj = re.search(r'flv_url=(.+?)&', webpage)
3450                 if mobj is None:
3451                         self._downloader.trouble(u'ERROR: unable to extract video url')
3452                         return
3453                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3454
3455
3456                 # Extract title
3457                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3458                 if mobj is None:
3459                         self._downloader.trouble(u'ERROR: unable to extract video title')
3460                         return
3461                 video_title = mobj.group(1).decode('utf-8')
3462
3463
3464                 # Extract video thumbnail
3465                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3466                 if mobj is None:
3467                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3468                         return
3469                 video_thumbnail = mobj.group(1).decode('utf-8')
3470
3471
3472
3473                 self._downloader.increment_downloads()
3474                 info = {
3475                         'id': video_id,
3476                         'url': video_url,
3477                         'uploader': None,
3478                         'upload_date': None,
3479                         'title': video_title,
3480                         'stitle': _simplify_title(video_title),
3481                         'ext': 'flv',
3482                         'format': 'flv',
3483                         'thumbnail': video_thumbnail,
3484                         'description': None,
3485                         'player_url': None,
3486                 }
3487
3488                 try:
3489                         self._downloader.process_info(info)
3490                 except UnavailableVideoError, err:
3491                         self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3492
3493
3494 class SoundcloudIE(InfoExtractor):
3495         """Information extractor for soundcloud.com
3496            To access the media, the uid of the song and a stream token
3497            must be extracted from the page source and the script must make
3498            a request to media.soundcloud.com/crossdomain.xml. Then
3499            the media can be grabbed by requesting from an url composed
3500            of the stream token and uid
3501          """
3502
3503         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3504         IE_NAME = u'soundcloud'
3505
3506         def __init__(self, downloader=None):
3507                 InfoExtractor.__init__(self, downloader)
3508
3509         def report_webpage(self, video_id):
3510                 """Report information extraction."""
3511                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3512
3513         def report_extraction(self, video_id):
3514                 """Report information extraction."""
3515                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3516
3517         def _real_extract(self, url):
3518                 htmlParser = HTMLParser.HTMLParser()
3519
3520                 mobj = re.match(self._VALID_URL, url)
3521                 if mobj is None:
3522                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3523                         return
3524
3525                 # extract uploader (which is in the url)
3526                 uploader = mobj.group(1).decode('utf-8')
3527                 # extract simple title (uploader + slug of song title)
3528                 slug_title =  mobj.group(2).decode('utf-8')
3529                 simple_title = uploader + '-' + slug_title
3530
3531                 self.report_webpage('%s/%s' % (uploader, slug_title))
3532
3533                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3534                 try:
3535                         webpage = urllib2.urlopen(request).read()
3536                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3537                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3538                         return
3539
3540                 self.report_extraction('%s/%s' % (uploader, slug_title))
3541
3542                 # extract uid and stream token that soundcloud hands out for access
3543                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3544                 if mobj:
3545                         video_id = mobj.group(1)
3546                         stream_token = mobj.group(2)
3547
3548                 # extract unsimplified title
3549                 mobj = re.search('"title":"(.*?)",', webpage)
3550                 if mobj:
3551                         title = mobj.group(1)
3552
3553                 # construct media url (with uid/token)
3554                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3555                 mediaURL = mediaURL % (video_id, stream_token)
3556
3557                 # description
3558                 description = u'No description available'
3559                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3560                 if mobj:
3561                         description = mobj.group(1)
3562                 
3563                 # upload date
3564                 upload_date = None
3565                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3566                 if mobj:
3567                         try:
3568                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3569                         except Exception, e:
3570                                 print str(e)
3571
3572                 # for soundcloud, a request to a cross domain is required for cookies
3573                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3574
3575                 try:
3576                         self._downloader.process_info({
3577                                 'id':           video_id.decode('utf-8'),
3578                                 'url':          mediaURL,
3579                                 'uploader':     uploader.decode('utf-8'),
3580                                 'upload_date':  upload_date,
3581                                 'title':        simple_title.decode('utf-8'),
3582                                 'stitle':       simple_title.decode('utf-8'),
3583                                 'ext':          u'mp3',
3584                                 'format':       u'NA',
3585                                 'player_url':   None,
3586                                 'description': description.decode('utf-8')
3587                         })
3588                 except UnavailableVideoError:
3589                         self._downloader.trouble(u'\nERROR: unable to download video')
3590
3591
3592 class InfoQIE(InfoExtractor):
3593         """Information extractor for infoq.com"""
3594
3595         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3596         IE_NAME = u'infoq'
3597
3598         def report_webpage(self, video_id):
3599                 """Report information extraction."""
3600                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3601
3602         def report_extraction(self, video_id):
3603                 """Report information extraction."""
3604                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3605
3606         def _real_extract(self, url):
3607                 htmlParser = HTMLParser.HTMLParser()
3608
3609                 mobj = re.match(self._VALID_URL, url)
3610                 if mobj is None:
3611                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3612                         return
3613
3614                 self.report_webpage(url)
3615
3616                 request = urllib2.Request(url)
3617                 try:
3618                         webpage = urllib2.urlopen(request).read()
3619                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3620                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3621                         return
3622
3623                 self.report_extraction(url)
3624
3625
3626                 # Extract video URL
3627                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3628                 if mobj is None:
3629                         self._downloader.trouble(u'ERROR: unable to extract video url')
3630                         return
3631                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3632
3633
3634                 # Extract title
3635                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3636                 if mobj is None:
3637                         self._downloader.trouble(u'ERROR: unable to extract video title')
3638                         return
3639                 video_title = mobj.group(1).decode('utf-8')
3640
3641                 # Extract description
3642                 video_description = u'No description available.'
3643                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3644                 if mobj is not None:
3645                         video_description = mobj.group(1).decode('utf-8')
3646
3647                 video_filename = video_url.split('/')[-1]
3648                 video_id, extension = video_filename.split('.')
3649
3650                 self._downloader.increment_downloads()
3651                 info = {
3652                         'id': video_id,
3653                         'url': video_url,
3654                         'uploader': None,
3655                         'upload_date': None,
3656                         'title': video_title,
3657                         'stitle': _simplify_title(video_title),
3658                         'ext': extension,
3659                         'format': extension, # Extension is always(?) mp4, but seems to be flv
3660                         'thumbnail': None,
3661                         'description': video_description,
3662                         'player_url': None,
3663                 }
3664
3665                 try:
3666                         self._downloader.process_info(info)
3667                 except UnavailableVideoError, err:
3668                         self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3669
3670 class MixcloudIE(InfoExtractor):
3671         """Information extractor for www.mixcloud.com"""
3672         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3673         IE_NAME = u'mixcloud'
3674
3675         def __init__(self, downloader=None):
3676                 InfoExtractor.__init__(self, downloader)
3677
3678         def report_download_json(self, file_id):
3679                 """Report JSON download."""
3680                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3681
3682         def report_extraction(self, file_id):
3683                 """Report information extraction."""
3684                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3685
3686         def get_urls(self, jsonData, fmt, bitrate='best'):
3687                 """Get urls from 'audio_formats' section in json"""
3688                 file_url = None
3689                 try:
3690                         bitrate_list = jsonData[fmt]
3691                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3692                                 bitrate = max(bitrate_list) # select highest
3693
3694                         url_list = jsonData[fmt][bitrate]
3695                 except TypeError: # we have no bitrate info.
3696                         url_list = jsonData[fmt]
3697                                 
3698                 return url_list
3699
3700         def check_urls(self, url_list):
3701                 """Returns 1st active url from list"""
3702                 for url in url_list:
3703                         try:
3704                                 urllib2.urlopen(url)
3705                                 return url
3706                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3707                                 url = None
3708
3709                 return None
3710
3711         def _print_formats(self, formats):
3712                 print 'Available formats:'
3713                 for fmt in formats.keys():
3714                         for b in formats[fmt]:
3715                                 try:
3716                                         ext = formats[fmt][b][0]
3717                                         print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3718                                 except TypeError: # we have no bitrate info
3719                                         ext = formats[fmt][0]
3720                                         print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3721                                         break
3722
3723         def _real_extract(self, url):
3724                 mobj = re.match(self._VALID_URL, url)
3725                 if mobj is None:
3726                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3727                         return
3728                 # extract uploader & filename from url
3729                 uploader = mobj.group(1).decode('utf-8')
3730                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3731
3732                 # construct API request
3733                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3734                 # retrieve .json file with links to files
3735                 request = urllib2.Request(file_url)
3736                 try:
3737                         self.report_download_json(file_url)
3738                         jsonData = urllib2.urlopen(request).read()
3739                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3740                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3741                         return
3742
3743                 # parse JSON
3744                 json_data = json.loads(jsonData)
3745                 player_url = json_data['player_swf_url']
3746                 formats = dict(json_data['audio_formats'])
3747
3748                 req_format = self._downloader.params.get('format', None)
3749                 bitrate = None
3750
3751                 if self._downloader.params.get('listformats', None):
3752                         self._print_formats(formats)
3753                         return
3754
3755                 if req_format is None or req_format == 'best':
3756                         for format_param in formats.keys():
3757                                 url_list = self.get_urls(formats, format_param)
3758                                 # check urls
3759                                 file_url = self.check_urls(url_list)
3760                                 if file_url is not None:
3761                                         break # got it!
3762                 else:
3763                         if req_format not in formats.keys():
3764                                 self._downloader.trouble(u'ERROR: format is not available')
3765                                 return
3766
3767                         url_list = self.get_urls(formats, req_format)
3768                         file_url = self.check_urls(url_list)
3769                         format_param = req_format
3770
3771                 # We have audio
3772                 self._downloader.increment_downloads()
3773                 try:
3774                         # Process file information
3775                         self._downloader.process_info({
3776                                 'id': file_id.decode('utf-8'),
3777                                 'url': file_url.decode('utf-8'),
3778                                 'uploader':     uploader.decode('utf-8'),
3779                                 'upload_date': u'NA',
3780                                 'title': json_data['name'],
3781                                 'stitle': _simplify_title(json_data['name']),
3782                                 'ext': file_url.split('.')[-1].decode('utf-8'),
3783                                 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3784                                 'thumbnail': json_data['thumbnail_url'],
3785                                 'description': json_data['description'],
3786                                 'player_url': player_url.decode('utf-8'),
3787                         })
3788                 except UnavailableVideoError, err:
3789                         self._downloader.trouble(u'ERROR: unable to download file')
3790
3791 class StanfordOpenClassroomIE(InfoExtractor):
3792         """Information extractor for Stanford's Open ClassRoom"""
3793
3794         _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3795         IE_NAME = u'stanfordoc'
3796
3797         def report_download_webpage(self, objid):
3798                 """Report information extraction."""
3799                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3800
3801         def report_extraction(self, video_id):
3802                 """Report information extraction."""
3803                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3804
3805         def _real_extract(self, url):
3806                 mobj = re.match(self._VALID_URL, url)
3807                 if mobj is None:
3808                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3809                         return
3810
3811                 if mobj.group('course') and mobj.group('video'): # A specific video
3812                         course = mobj.group('course')
3813                         video = mobj.group('video')
3814                         info = {
3815                                 'id': _simplify_title(course + '_' + video),
3816                         }
3817         
3818                         self.report_extraction(info['id'])
3819                         baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3820                         xmlUrl = baseUrl + video + '.xml'
3821                         try:
3822                                 metaXml = urllib2.urlopen(xmlUrl).read()
3823                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3824                                 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
3825                                 return
3826                         mdoc = xml.etree.ElementTree.fromstring(metaXml)
3827                         try:
3828                                 info['title'] = mdoc.findall('./title')[0].text
3829                                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3830                         except IndexError:
3831                                 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3832                                 return
3833                         info['stitle'] = _simplify_title(info['title'])
3834                         info['ext'] = info['url'].rpartition('.')[2]
3835                         info['format'] = info['ext']
3836                         self._downloader.increment_downloads()
3837                         try:
3838                                 self._downloader.process_info(info)
3839                         except UnavailableVideoError, err:
3840                                 self._downloader.trouble(u'\nERROR: unable to download video')
3841                 elif mobj.group('course'): # A course page
3842                         unescapeHTML = HTMLParser.HTMLParser().unescape
3843
3844                         course = mobj.group('course')
3845                         info = {
3846                                 'id': _simplify_title(course),
3847                                 'type': 'playlist',
3848                         }
3849
3850                         self.report_download_webpage(info['id'])
3851                         try:
3852                                 coursepage = urllib2.urlopen(url).read()
3853                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3854                                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3855                                 return
3856
3857                         m = re.search('<h1>([^<]+)</h1>', coursepage)
3858                         if m:
3859                                 info['title'] = unescapeHTML(m.group(1))
3860                         else:
3861                                 info['title'] = info['id']
3862                         info['stitle'] = _simplify_title(info['title'])
3863
3864                         m = re.search('<description>([^<]+)</description>', coursepage)
3865                         if m:
3866                                 info['description'] = unescapeHTML(m.group(1))
3867
3868                         links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3869                         info['list'] = [
3870                                 {
3871                                         'type': 'reference',
3872                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3873                                 }
3874                                         for vpage in links]
3875
3876                         for entry in info['list']:
3877                                 assert entry['type'] == 'reference'
3878                                 self.extract(entry['url'])
3879                 else: # Root page
3880                         unescapeHTML = HTMLParser.HTMLParser().unescape
3881
3882                         info = {
3883                                 'id': 'Stanford OpenClassroom',
3884                                 'type': 'playlist',
3885                         }
3886
3887                         self.report_download_webpage(info['id'])
3888                         rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3889                         try:
3890                                 rootpage = urllib2.urlopen(rootURL).read()
3891                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3892                                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3893                                 return
3894
3895                         info['title'] = info['id']
3896                         info['stitle'] = _simplify_title(info['title'])
3897
3898                         links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3899                         info['list'] = [
3900                                 {
3901                                         'type': 'reference',
3902                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3903                                 }
3904                                         for cpage in links]
3905
3906                         for entry in info['list']:
3907                                 assert entry['type'] == 'reference'
3908                                 self.extract(entry['url'])
3909
3910 class MTVIE(InfoExtractor):
3911         """Information extractor for MTV.com"""
3912
3913         _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3914         IE_NAME = u'mtv'
3915
3916         def report_webpage(self, video_id):
3917                 """Report information extraction."""
3918                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3919
3920         def report_extraction(self, video_id):
3921                 """Report information extraction."""
3922                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3923
3924         def _real_extract(self, url):
3925                 mobj = re.match(self._VALID_URL, url)
3926                 if mobj is None:
3927                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3928                         return
3929                 if not mobj.group('proto'):
3930                         url = 'http://' + url
3931                 video_id = mobj.group('videoid')
3932                 self.report_webpage(video_id)
3933
3934                 request = urllib2.Request(url)
3935                 try:
3936                         webpage = urllib2.urlopen(request).read()
3937                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3938                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3939                         return
3940
3941                 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3942                 if mobj is None:
3943                         self._downloader.trouble(u'ERROR: unable to extract song name')
3944                         return
3945                 song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3946                 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3947                 if mobj is None:
3948                         self._downloader.trouble(u'ERROR: unable to extract performer')
3949                         return
3950                 performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3951                 video_title = performer + ' - ' + song_name 
3952
3953                 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3954                 if mobj is None:
3955                         self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3956                         return
3957                 mtvn_uri = mobj.group(1)
3958
3959                 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3960                 if mobj is None:
3961                         self._downloader.trouble(u'ERROR: unable to extract content id')
3962                         return
3963                 content_id = mobj.group(1)
3964
3965                 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3966                 self.report_extraction(video_id)
3967                 request = urllib2.Request(videogen_url)
3968                 try:
3969                         metadataXml = urllib2.urlopen(request).read()
3970                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3971                         self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
3972                         return
3973
3974                 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3975                 renditions = mdoc.findall('.//rendition')
3976
3977                 # For now, always pick the highest quality.
3978                 rendition = renditions[-1]
3979
3980                 try:
3981                         _,_,ext = rendition.attrib['type'].partition('/')
3982                         format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3983                         video_url = rendition.find('./src').text
3984                 except KeyError:
3985                         self._downloader.trouble('Invalid rendition field.')
3986                         return
3987
3988                 self._downloader.increment_downloads()
3989                 info = {
3990                         'id': video_id,
3991                         'url': video_url,
3992                         'uploader': performer,
3993                         'title': video_title,
3994                         'stitle': _simplify_title(video_title),
3995                         'ext': ext,
3996                         'format': format,
3997                 }
3998
3999                 try:
4000                         self._downloader.process_info(info)
4001                 except UnavailableVideoError, err:
4002                         self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
4003
4004
class PostProcessor(object):
        """Base class for post processors.

        A downloader keeps a chain of PostProcessor objects, registered
        through its add_post_processor() method. After a successful download
        it calls run() on each of them in turn: the first one receives the
        initial information dictionary, every following one receives whatever
        the previous run() returned.

        The chain ends when the last PostProcessor has run or as soon as one
        of them returns None.

        Like InfoExtractor objects, PostProcessor objects and their downloader
        register with each other ("mutual registration").
        """

        _downloader = None

        def __init__(self, downloader=None):
                self._downloader = downloader

        def set_downloader(self, downloader):
                """Attach this PP to the given downloader."""
                self._downloader = downloader

        def run(self, information):
                """Process one downloaded file.

                "information" is a dictionary of the kind built by
                InfoExtractors, with one additional field, "filepath", holding
                the location of the downloaded file.

                Return an information dictionary (possibly the received one
                with some fields changed) to pass it on to the next
                PostProcessor of the chain, or None to stop the chain.

                A PostProcessingError may be raised as well; the calling
                downloader takes it into account.
                """
                # The base implementation passes the dictionary on untouched.
                return information
4050
class AudioConversionError(Exception):
        """Raised when converting the downloaded file to audio fails.

        Derives from Exception (not directly from BaseException) so that it is
        handled like any other application error. The text is kept in the
        "message" attribute and is also passed to the base class, so str() on
        the exception yields it too.
        """

        def __init__(self, message):
                Exception.__init__(self, message)
                self.message = message
4054
class FFmpegExtractAudioPP(PostProcessor):
        """Post processor that extracts the audio track of a file with ffmpeg.

        preferredcodec selects the target codec ('best' when None),
        preferredquality is passed to ffmpeg as the -ab value when the audio
        is re-encoded, and keepvideo controls whether the original file is
        kept after the conversion.
        """

        def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
                PostProcessor.__init__(self, downloader)
                if preferredcodec is None:
                        preferredcodec = 'best'
                self._preferredcodec = preferredcodec
                self._preferredquality = preferredquality
                self._keepvideo = keepvideo

        @staticmethod
        def get_audio_codec(path):
                """Return the audio codec name ffprobe reports for path.

                None is returned when ffprobe cannot be run, exits with a
                non-zero status or reports no audio stream.
                """
                try:
                        cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
                        devnull = open(os.path.devnull, 'w')
                        try:
                                handle = subprocess.Popen(cmd, stderr=devnull, stdout=subprocess.PIPE)
                                output = handle.communicate()[0]
                        finally:
                                # The child has its own copy of the descriptor
                                devnull.close()
                        if handle.returncode != 0:
                                return None
                except (IOError, OSError):
                        return None
                # Remember the last codec_name seen and return it as soon as
                # a codec_type=audio line follows.
                audio_codec = None
                for line in output.split('\n'):
                        if line.startswith('codec_name='):
                                audio_codec = line.split('=')[1].strip()
                        elif line.strip() == 'codec_type=audio' and audio_codec is not None:
                                return audio_codec
                return None

        @staticmethod
        def run_ffmpeg(path, out_path, codec, more_opts):
                """Run ffmpeg to write the audio of path to out_path.

                codec is passed as -acodec unless it is None; more_opts is a
                list of additional ffmpeg arguments.

                Raises AudioConversionError if ffmpeg is missing or exits with
                a non-zero status (the message is then the last line ffmpeg
                wrote to stderr). Other IOError/OSError are re-raised.
                """
                if codec is None:
                        acodec_opts = []
                else:
                        acodec_opts = ['-acodec', codec]
                cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
                try:
                        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                        stdout,stderr = p.communicate()
                except (IOError, OSError):
                        e = sys.exc_info()[1]
                        if isinstance(e, OSError) and e.errno == 2:
                                raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
                        else:
                                raise # keeps the original traceback
                if p.returncode != 0:
                        msg = stderr.strip().split('\n')[-1]
                        raise AudioConversionError(msg)

        def run(self, information):
                """Convert information['filepath'] to audio.

                Returns the information dictionary with 'filepath' pointing to
                the audio file, or None (which stops the postprocessing chain)
                when the codec cannot be determined, the conversion fails or
                the original file cannot be removed.
                """
                path = information['filepath']

                filecodec = self.get_audio_codec(path)
                if filecodec is None:
                        self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
                        return None

                more_opts = []
                if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
                        if self._preferredcodec == 'm4a' and filecodec == 'aac':
                                # Lossless, but in another container
                                acodec = 'copy'
                                extension = self._preferredcodec
                                more_opts = ['-absf', 'aac_adtstoasc']
                        elif filecodec in ['aac', 'mp3', 'vorbis']:
                                # Lossless if possible
                                acodec = 'copy'
                                extension = filecodec
                                if filecodec == 'aac':
                                        more_opts = ['-f', 'adts']
                                if filecodec == 'vorbis':
                                        extension = 'ogg'
                        else:
                                # MP3 otherwise.
                                acodec = 'libmp3lame'
                                extension = 'mp3'
                                more_opts = []
                                if self._preferredquality is not None:
                                        more_opts += ['-ab', self._preferredquality]
                else:
                        # We convert the audio (lossy)
                        acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
                        extension = self._preferredcodec
                        more_opts = []
                        if self._preferredquality is not None:
                                more_opts += ['-ab', self._preferredquality]
                        if self._preferredcodec == 'aac':
                                more_opts += ['-f', 'adts']
                        if self._preferredcodec == 'm4a':
                                more_opts += ['-absf', 'aac_adtstoasc']
                        if self._preferredcodec == 'vorbis':
                                extension = 'ogg'
                        if self._preferredcodec == 'wav':
                                extension = 'wav'
                                more_opts += ['-f', 'wav']

                prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
                new_path = prefix + sep + extension
                self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
                # AudioConversionError is named explicitly and everything else
                # is limited to Exception, so that KeyboardInterrupt and
                # SystemExit are no longer reported as ffmpeg failures.
                try:
                        self.run_ffmpeg(path, new_path, acodec, more_opts)
                except AudioConversionError:
                        e = sys.exc_info()[1]
                        self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
                        return None
                except Exception:
                        self._downloader.to_stderr(u'ERROR: error running ffmpeg')
                        return None

                # Try to update the date time for extracted audio file
                # (access time: now, modification time: the recorded filetime).
                if information.get('filetime') is not None:
                        try:
                                os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
                        except Exception:
                                self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

                if not self._keepvideo:
                        try:
                                os.remove(_encodeFilename(path))
                        except (IOError, OSError):
                                self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
                                return None

                information['filepath'] = new_path
                return information
4179
4180
def updateSelf(downloader, filename):
        ''' Update the program file with the latest version from the repository

        The content of UPDATE_URL is downloaded and written over filename,
        unless it carries the same __version__ as the running program.  The
        process exits with an error message when filename is not writable,
        the download fails, the downloaded content has no version marker or
        the file cannot be overwritten.
        '''
        # Note: downloader only used for options
        if not os.access(filename, os.W_OK):
                sys.exit('ERROR: no write permissions on %s' % filename)

        downloader.to_screen(u'Updating to latest version...')

        try:
                # Open outside the try/finally: if urlopen() itself fails there
                # is no handle to close, and closing an unbound name would hide
                # the original error behind a NameError.
                urlh = urllib.urlopen(UPDATE_URL)
                try:
                        newcontent = urlh.read()
                finally:
                        urlh.close()
        except (IOError, OSError):
                sys.exit('ERROR: unable to download latest version')

        vmatch = re.search("__version__ = '([^']+)'", newcontent)
        if vmatch is None:
                # Not a youtube-dl script (e.g. an HTTP error page): do not
                # overwrite the working program with it.
                sys.exit('ERROR: downloaded file does not look like youtube-dl, not updating')
        if vmatch.group(1) == __version__:
                downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
                return

        try:
                outf = open(filename, 'wb')
                try:
                        outf.write(newcontent)
                finally:
                        outf.close()
        except (IOError, OSError):
                sys.exit('ERROR: unable to overwrite current version')

        downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
4213
def parseOpts():
        """Build the command line parser and parse all option sources.

        Options are read, in this order, from /etc/youtube-dl.conf, from the
        per-user youtube-dl.conf (under $XDG_CONFIG_HOME if set, otherwise under
        ~/.config) and finally from sys.argv[1:].

        Returns a (parser, opts, args) tuple: the optparse.OptionParser instance
        followed by the two values returned by its parse_args() method.
        """
        def _readOptions(filename_bytes):
                """Return the shell-like tokens of a configuration file as a list.

                A file that cannot be opened yields an empty list. Comments are
                stripped by shlex.
                """
                try:
                        optionf = open(filename_bytes)
                except IOError:
                        return [] # silently skip if file is not present
                try:
                        res = []
                        for l in optionf:
                                res += shlex.split(l, comments=True)
                finally:
                        optionf.close()
                return res

        def _format_option_string(option):
                """Render an option for the help output, e.g. "-o, --output TEMPLATE".

                Only the first short and the first long spelling are shown; the
                metavar is appended when the option takes a value.
                """

                opts = []

                if option._short_opts: opts.append(option._short_opts[0])
                if option._long_opts: opts.append(option._long_opts[0])
                if len(opts) > 1: opts.insert(1, ', ')

                if option.takes_value(): opts.append(' %s' % option.metavar)

                return "".join(opts)

        def _find_term_columns():
                """Return the terminal width in columns, or None if it is unknown.

                The COLUMNS environment variable is tried first, then the output
                of "stty size".
                """
                columns = os.environ.get('COLUMNS', None)
                if columns:
                        try:
                                return int(columns)
                        except ValueError:
                                pass # malformed COLUMNS value: fall back to asking stty

                try:
                        sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                        out,err = sp.communicate()
                        return int(out.split()[1])
                except (EnvironmentError, ValueError, IndexError):
                        # stty missing or not runnable, or its output is not "rows cols"
                        pass
                return None

        max_width = 80
        max_help_position = 80

        # No need to wrap help messages if we're on a wide console
        columns = _find_term_columns()
        if columns: max_width = columns

        fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
        fmt.format_option_strings = _format_option_string

        kw = {
                'version'   : __version__,
                'formatter' : fmt,
                'usage' : '%prog [options] url [url...]',
                'conflict_handler' : 'resolve',
        }

        parser = optparse.OptionParser(**kw)

        # option groups
        general        = optparse.OptionGroup(parser, 'General Options')
        selection      = optparse.OptionGroup(parser, 'Video Selection')
        authentication = optparse.OptionGroup(parser, 'Authentication Options')
        video_format   = optparse.OptionGroup(parser, 'Video Format Options')
        postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
        filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
        verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

        general.add_option('-h', '--help',
                        action='help', help='print this help text and exit')
        # NOTE: '-v' is declared again for --verbose further down; the parser is
        # created with conflict_handler='resolve', so the two declarations do
        # not raise an option conflict error.
        general.add_option('-v', '--version',
                        action='version', help='print program version and exit')
        general.add_option('-U', '--update',
                        action='store_true', dest='update_self', help='update this program to latest version')
        general.add_option('-i', '--ignore-errors',
                        action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
        general.add_option('-r', '--rate-limit',
                        dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
        general.add_option('-R', '--retries',
                        dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
        general.add_option('--dump-user-agent',
                        action='store_true', dest='dump_user_agent',
                        help='display the current browser identification', default=False)
        general.add_option('--list-extractors',
                        action='store_true', dest='list_extractors',
                        help='List all supported extractors and the URLs they would handle', default=False)

        selection.add_option('--playlist-start',
                        dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
        selection.add_option('--playlist-end',
                        dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
        selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
        selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
        selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)

        authentication.add_option('-u', '--username',
                        dest='username', metavar='USERNAME', help='account username')
        authentication.add_option('-p', '--password',
                        dest='password', metavar='PASSWORD', help='account password')
        authentication.add_option('-n', '--netrc',
                        action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)


        video_format.add_option('-f', '--format',
                        action='store', dest='format', metavar='FORMAT', help='video format code')
        video_format.add_option('--all-formats',
                        action='store_const', dest='format', help='download all available video formats', const='all')
        video_format.add_option('--prefer-free-formats',
                        action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
        video_format.add_option('--max-quality',
                        action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
        video_format.add_option('-F', '--list-formats',
                        action='store_true', dest='listformats', help='list all available formats (currently youtube only)')


        verbosity.add_option('-q', '--quiet',
                        action='store_true', dest='quiet', help='activates quiet mode', default=False)
        verbosity.add_option('-s', '--simulate',
                        action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
        verbosity.add_option('--skip-download',
                        action='store_true', dest='skip_download', help='do not download the video', default=False)
        verbosity.add_option('-g', '--get-url',
                        action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
        verbosity.add_option('-e', '--get-title',
                        action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
        verbosity.add_option('--get-thumbnail',
                        action='store_true', dest='getthumbnail',
                        help='simulate, quiet but print thumbnail URL', default=False)
        verbosity.add_option('--get-description',
                        action='store_true', dest='getdescription',
                        help='simulate, quiet but print video description', default=False)
        verbosity.add_option('--get-filename',
                        action='store_true', dest='getfilename',
                        help='simulate, quiet but print output filename', default=False)
        verbosity.add_option('--get-format',
                        action='store_true', dest='getformat',
                        help='simulate, quiet but print output format', default=False)
        verbosity.add_option('--no-progress',
                        action='store_true', dest='noprogress', help='do not print progress bar', default=False)
        verbosity.add_option('--console-title',
                        action='store_true', dest='consoletitle',
                        help='display progress in console titlebar', default=False)
        verbosity.add_option('-v', '--verbose',
                        action='store_true', dest='verbose', help='print various debugging information', default=False)


        filesystem.add_option('-t', '--title',
                        action='store_true', dest='usetitle', help='use title in file name', default=False)
        filesystem.add_option('-l', '--literal',
                        action='store_true', dest='useliteral', help='use literal title in file name', default=False)
        filesystem.add_option('-A', '--auto-number',
                        action='store_true', dest='autonumber',
                        help='number downloaded files starting from 00000', default=False)
        filesystem.add_option('-o', '--output',
                        dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
        filesystem.add_option('-a', '--batch-file',
                        dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
        filesystem.add_option('-w', '--no-overwrites',
                        action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
        filesystem.add_option('-c', '--continue',
                        action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
        filesystem.add_option('--no-continue',
                        action='store_false', dest='continue_dl',
                        help='do not resume partially downloaded files (restart from beginning)')
        filesystem.add_option('--cookies',
                        dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
        filesystem.add_option('--no-part',
                        action='store_true', dest='nopart', help='do not use .part files', default=False)
        filesystem.add_option('--no-mtime',
                        action='store_false', dest='updatetime',
                        help='do not use the Last-modified header to set the file modification time', default=True)
        filesystem.add_option('--write-description',
                        action='store_true', dest='writedescription',
                        help='write video description to a .description file', default=False)
        filesystem.add_option('--write-info-json',
                        action='store_true', dest='writeinfojson',
                        help='write video metadata to a .info.json file', default=False)


        postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
                        help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
        postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
                        help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
        postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
                        help='ffmpeg audio bitrate specification, 128k by default')
        postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
                        help='keeps the video file on disk after the post-processing; the video is erased by default')


        # The order of these calls is the order of the groups in the help output
        parser.add_option_group(general)
        parser.add_option_group(selection)
        parser.add_option_group(filesystem)
        parser.add_option_group(verbosity)
        parser.add_option_group(video_format)
        parser.add_option_group(authentication)
        parser.add_option_group(postproc)

        xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
        if xdg_config_home:
                userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
        else:
                userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
        # Configuration files come first so that later sources can override them
        argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
        opts, args = parser.parse_args(argv)

        return parser, opts, args
4420
def gen_extractors():
        """Build a fresh list holding one instance of each supported extractor.

        List order is significant: the first extractor that matches a URL is the
        one that handles it. GenericIE is placed last.
        """
        # Instances that are also handed to other extractors' constructors
        youtube_ie = YoutubeIE()
        google_ie = GoogleIE()
        yahoo_ie = YahooIE()

        extractors = []

        # YouTube family (playlist, user and search extractors before the plain one)
        extractors.append(YoutubePlaylistIE(youtube_ie))
        extractors.append(YoutubeUserIE(youtube_ie))
        extractors.append(YoutubeSearchIE(youtube_ie))
        extractors.append(youtube_ie)
        extractors.append(MetacafeIE(youtube_ie))

        extractors.append(DailymotionIE())

        # Google and Yahoo, each followed by its search counterpart
        extractors.append(google_ie)
        extractors.append(GoogleSearchIE(google_ie))
        extractors.append(PhotobucketIE())
        extractors.append(yahoo_ie)
        extractors.append(YahooSearchIE(yahoo_ie))

        # Remaining site-specific extractors, constructed in list order
        for ie_class in (
                DepositFilesIE,
                FacebookIE,
                BlipTVIE,
                VimeoIE,
                MyVideoIE,
                ComedyCentralIE,
                EscapistIE,
                CollegeHumorIE,
                XVideosIE,
                SoundcloudIE,
                InfoQIE,
                MixcloudIE,
                StanfordOpenClassroomIE,
                MTVIE,
        ):
                extractors.append(ie_class())

        extractors.append(GenericIE())
        return extractors
4457
def _real_main():
        """Run the command line program: parse options, set up and download.

        Always terminates through sys.exit(): with the value returned by
        FileDownloader.download(), with 101 when MaxDownloadsReached is raised,
        with 0 after --dump-user-agent or --list-extractors, or with an error
        message when an option or file is invalid. Exceptions raised by the
        download itself are not handled here.
        """
        parser, opts, args = parseOpts()

        # Open appropriate CookieJar
        if opts.cookiefile is None:
                jar = cookielib.CookieJar()
        else:
                try:
                        jar = cookielib.MozillaCookieJar(opts.cookiefile)
                        if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
                                jar.load()
                except (IOError, OSError):
                        sys.exit(u'ERROR: unable to open cookie file')

        # Dump user agent
        if opts.dump_user_agent:
                print(std_headers['User-Agent'])
                sys.exit(0)

        # Batch file verification
        batchurls = []
        if opts.batchfile is not None:
                try:
                        if opts.batchfile == '-':
                                batchfd = sys.stdin
                        else:
                                batchfd = open(opts.batchfile, 'r')
                        try:
                                batchurls = batchfd.readlines()
                        finally:
                                # Close the file we opened ourselves; leave stdin alone
                                if batchfd is not sys.stdin:
                                        batchfd.close()
                        batchurls = [x.strip() for x in batchurls]
                        # Drop empty lines and lines starting with '#', '/' or ';'
                        batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
                except IOError:
                        sys.exit(u'ERROR: batch file could not be read')
        all_urls = batchurls + args

        # General configuration
        cookie_processor = urllib2.HTTPCookieProcessor(jar)
        proxy_handler = urllib2.ProxyHandler()
        opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
        urllib2.install_opener(opener)
        socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

        if opts.verbose:
                print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))

        extractors = gen_extractors()

        if opts.list_extractors:
                for ie in extractors:
                        print(ie.IE_NAME)
                        # Each URL is listed under the first extractor that accepts it
                        matchedUrls = [url for url in all_urls if ie.suitable(url)]
                        all_urls = [url for url in all_urls if url not in matchedUrls]
                        for mu in matchedUrls:
                                print(u'  ' + mu)
                sys.exit(0)

        # Conflicting, missing and erroneous options
        if opts.usenetrc and (opts.username is not None or opts.password is not None):
                parser.error(u'using .netrc conflicts with giving username/password')
        if opts.password is not None and opts.username is None:
                parser.error(u'account username missing')
        if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
                parser.error(u'using output template conflicts with using title, literal title or auto number')
        if opts.usetitle and opts.useliteral:
                parser.error(u'using title conflicts with using literal title')
        if opts.username is not None and opts.password is None:
                opts.password = getpass.getpass(u'Type account password and press return:')
        if opts.ratelimit is not None:
                numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
                if numeric_limit is None:
                        parser.error(u'invalid rate limit specified')
                opts.ratelimit = numeric_limit
        if opts.retries is not None:
                try:
                        opts.retries = long(opts.retries)
                except (TypeError, ValueError):
                        parser.error(u'invalid retry count specified')
        try:
                opts.playliststart = int(opts.playliststart)
                if opts.playliststart <= 0:
                        raise ValueError(u'Playlist start must be positive')
        except (TypeError, ValueError):
                parser.error(u'invalid playlist start number specified')
        try:
                opts.playlistend = int(opts.playlistend)
                # -1 is the option default and is exempt from the range check
                if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
                        raise ValueError(u'Playlist end must be greater than playlist start')
        except (TypeError, ValueError):
                parser.error(u'invalid playlist end number specified')
        if opts.extractaudio:
                if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
                        parser.error(u'invalid audio format specified')

        # File downloader
        fd = FileDownloader({
                'usenetrc': opts.usenetrc,
                'username': opts.username,
                'password': opts.password,
                'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
                'forceurl': opts.geturl,
                'forcetitle': opts.gettitle,
                'forcethumbnail': opts.getthumbnail,
                'forcedescription': opts.getdescription,
                'forcefilename': opts.getfilename,
                'forceformat': opts.getformat,
                'simulate': opts.simulate,
                'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
                'format': opts.format,
                'format_limit': opts.format_limit,
                'listformats': opts.listformats,
                # The first truthy alternative wins: an explicit (non-empty)
                # template, then the templates implied by the title, literal and
                # auto-number switches, and finally the plain id-based default.
                'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
                        or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
                        or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
                        or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
                        or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
                        or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
                        or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
                        or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
                        or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
                        or u'%(id)s.%(ext)s'),
                'ignoreerrors': opts.ignoreerrors,
                'ratelimit': opts.ratelimit,
                'nooverwrites': opts.nooverwrites,
                'retries': opts.retries,
                'continuedl': opts.continue_dl,
                'noprogress': opts.noprogress,
                'playliststart': opts.playliststart,
                'playlistend': opts.playlistend,
                'logtostderr': opts.outtmpl == '-',
                'consoletitle': opts.consoletitle,
                'nopart': opts.nopart,
                'updatetime': opts.updatetime,
                'writedescription': opts.writedescription,
                'writeinfojson': opts.writeinfojson,
                'matchtitle': opts.matchtitle,
                'rejecttitle': opts.rejecttitle,
                'max_downloads': opts.max_downloads,
                'prefer_free_formats': opts.prefer_free_formats,
                'verbose': opts.verbose,
                })
        for extractor in extractors:
                fd.add_info_extractor(extractor)

        # PostProcessors
        if opts.extractaudio:
                fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))

        # Update version
        if opts.update_self:
                updateSelf(fd, sys.argv[0])

        # Maybe do nothing
        if len(all_urls) < 1:
                if not opts.update_self:
                        parser.error(u'you must provide at least one URL')
                else:
                        sys.exit()

        try:
                retcode = fd.download(all_urls)
        except MaxDownloadsReached:
                fd.to_screen(u'--max-download limit reached, aborting.')
                retcode = 101

        # Dump cookie jar if requested
        if opts.cookiefile is not None:
                try:
                        jar.save()
                except (IOError, OSError):
                        sys.exit(u'ERROR: unable to save cookie jar')

        sys.exit(retcode)
4629
def main():
        """Entry point: run _real_main() and turn known failures into exit values.

        DownloadError exits with status 1; SameFileError and KeyboardInterrupt
        exit with an error message. Anything else raised by _real_main(),
        including its own SystemExit, propagates unchanged.
        """
        try:
                _real_main()
                return
        except DownloadError:
                exit_arg = 1
        except SameFileError:
                exit_arg = u'ERROR: fixed output name but more than one file to download'
        except KeyboardInterrupt:
                exit_arg = u'\nERROR: Interrupted by user'
        sys.exit(exit_arg)
4639
# Run the command line program only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
        main()
4642
4643 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: