added youtube closed captions .srt support (see #90)
[youtube-dl.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 __authors__  = (
5         'Ricardo Garcia Gonzalez',
6         'Danny Colligan',
7         'Benjamin Johnson',
8         'Vasyl\' Vavrychuk',
9         'Witold Baryluk',
10         'Paweł Paprota',
11         'Gergely Imreh',
12         'Rogério Brito',
13         'Philipp Hagemeister',
14         'Sören Schulze',
15         'Kevin Ngo',
16         'Ori Avtalion',
17         'shizeeg',
18         )
19
20 __license__ = 'Public Domain'
21 __version__ = '2012.02.27'
22
23 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
24
25
26 import cookielib
27 import datetime
28 import getpass
29 import gzip
30 import htmlentitydefs
31 import HTMLParser
32 import httplib
33 import locale
34 import math
35 import netrc
36 import optparse
37 import os
38 import os.path
39 import re
40 import shlex
41 import socket
42 import string
43 import subprocess
44 import sys
45 import time
46 import urllib
47 import urllib2
48 import warnings
49 import zlib
50
51 if os.name == 'nt':
52         import ctypes
53
54 try:
55         import email.utils
56 except ImportError: # Python 2.4
57         import email.Utils
58 try:
59         import cStringIO as StringIO
60 except ImportError:
61         import StringIO
62
63 # parse_qs was moved from the cgi module to the urlparse module recently.
64 try:
65         from urlparse import parse_qs
66 except ImportError:
67         from cgi import parse_qs
68
69 try:
70         import lxml.etree
71 except ImportError:
72         pass # Handled below
73
74 try:
75         import xml.etree.ElementTree
76 except ImportError: # Python<2.5: Not officially supported, but let it slip
77         warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
78
# Default HTTP headers attached to every outgoing request (see
# YoutubeDLHandler.http_request), imitating a desktop Firefox browser so
# that sites serve the same pages they would to a regular user.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-us,en;q=0.5',
}
86
try:
        import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
        import re
        # Minimal drop-in replacement for the stdlib json module: only the
        # loads() entry point is provided, implemented as a hand-written
        # recursive-descent parser over a unicode string.
        class json(object):
                @staticmethod
                def loads(s):
                        """Parse a UTF-8 encoded JSON document and return the value it encodes.

                        Raises ValueError (with position information) on malformed input.
                        """
                        s = s.decode('UTF-8')
                        def raiseError(msg, i):
                                # Uniform error reporting: position plus the remaining input.
                                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
                        def skipSpace(i, expectMore=True):
                                # Advance i past JSON whitespace; if expectMore, fail when
                                # the input ends before any non-whitespace character.
                                while i < len(s) and s[i] in ' \t\r\n':
                                        i += 1
                                if expectMore:
                                        if i >= len(s):
                                                raiseError('Premature end', i)
                                return i
                        def decodeEscape(match):
                                # Translate one backslash escape (including \uXXXX and
                                # UTF-16 surrogate pairs) into the character it denotes.
                                esc = match.group(1)
                                _STATIC = {
                                        '"': '"',
                                        '\\': '\\',
                                        '/': '/',
                                        'b': unichr(0x8),
                                        'f': unichr(0xc),
                                        'n': '\n',
                                        'r': '\r',
                                        't': '\t',
                                }
                                if esc in _STATIC:
                                        return _STATIC[esc]
                                if esc[0] == 'u':
                                        if len(esc) == 1+4:
                                                return unichr(int(esc[1:5], 16))
                                        if len(esc) == 5+6 and esc[5:7] == '\\u':
                                                # Surrogate pair: combine the high and low halves.
                                                hi = int(esc[1:5], 16)
                                                low = int(esc[7:11], 16)
                                                return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                                raise ValueError('Unknown escape ' + str(esc))
                        def parseString(i):
                                # i points at the opening quote; find the first unescaped
                                # closing quote, then decode the escapes in between.
                                i += 1
                                e = i
                                while True:
                                        e = s.index('"', e)
                                        bslashes = 0
                                        # A quote is escaped iff it is preceded by an odd
                                        # number of backslashes.
                                        while s[e-bslashes-1] == '\\':
                                                bslashes += 1
                                        if bslashes % 2 == 1:
                                                e += 1
                                                continue
                                        break
                                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                                stri = rexp.sub(decodeEscape, s[i:e])
                                return (e+1,stri)
                        def parseObj(i):
                                # i points at '{'; returns (next_index, dict).
                                i += 1
                                res = {}
                                i = skipSpace(i)
                                if s[i] == '}': # Empty dictionary
                                        return (i+1,res)
                                while True:
                                        if s[i] != '"':
                                                raiseError('Expected a string object key', i)
                                        i,key = parseString(i)
                                        i = skipSpace(i)
                                        if i >= len(s) or s[i] != ':':
                                                raiseError('Expected a colon', i)
                                        i,val = parse(i+1)
                                        res[key] = val
                                        i = skipSpace(i)
                                        if s[i] == '}':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected comma or closing curly brace', i)
                                        i = skipSpace(i+1)
                        def parseArray(i):
                                # i points at '['; returns (next_index, list).
                                res = []
                                i = skipSpace(i+1)
                                if s[i] == ']': # Empty array
                                        return (i+1,res)
                                while True:
                                        i,val = parse(i)
                                        res.append(val)
                                        i = skipSpace(i) # Raise exception if premature end
                                        if s[i] == ']':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected a comma or closing bracket', i)
                                        i = skipSpace(i+1)
                        def parseDiscrete(i):
                                # The literals true / false / null.
                                for k,v in {'true': True, 'false': False, 'null': None}.items():
                                        if s.startswith(k, i):
                                                return (i+len(k), v)
                                raiseError('Not a boolean (or null)', i)
                        def parseNumber(i):
                                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                                if mobj is None:
                                        raiseError('Not a number', i)
                                nums = mobj.group(1)
                                # A fraction or an exponent makes it a float; otherwise int.
                                if '.' in nums or 'e' in nums or 'E' in nums:
                                        return (i+len(nums), float(nums))
                                return (i+len(nums), int(nums))
                        CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
                        def parse(i):
                                # Dispatch on the first significant character; numbers are
                                # the fallback case.
                                i = skipSpace(i)
                                i,res = CHARMAP.get(s[i], parseNumber)(i)
                                i = skipSpace(i, False)
                                return (i,res)
                        i,res = parse(0)
                        if i < len(s):
                                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
                        return res
199
def preferredencoding():
        """Get preferred encoding.

        Returns the best encoding scheme for the system, based on
        locale.getpreferredencoding() and some further tweaks: if the
        reported encoding cannot actually encode text, fall back to UTF-8.
        """
        try:
                pref = locale.getpreferredencoding()
                # Sanity-check the returned codec name; a broken locale can
                # report an encoding Python does not know about.
                u'TEST'.encode(pref)
        except Exception:
                pref = 'UTF-8'
        return pref
215
216
def htmlentity_transform(matchobj):
        """Transforms an HTML entity to a Unicode character.

        This function receives a match object and is intended to be used with
        the re.sub() function.
        """
        entity = matchobj.group(1)

        # Known non-numeric HTML entity
        if entity in htmlentitydefs.name2codepoint:
                return unichr(htmlentitydefs.name2codepoint[entity])

        # Numeric character reference: decimal (&#65;) or hexadecimal (&#x41;).
        # The hexadecimal branch must accept the digits a-f; the previous
        # pattern (x?\d+) matched decimal digits only and therefore failed on
        # entities such as &#x2F;.
        mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
        if mobj is not None:
                numstr = mobj.group(1)
                if numstr.startswith(u'x'):
                        base = 16
                        # int()/long() accept the '0x...' prefix for base 16
                        numstr = u'0%s' % numstr
                else:
                        base = 10
                return unichr(long(numstr, base))

        # Unknown entity in name, return its literal representation
        return (u'&%s;' % entity)
242
243
def sanitize_title(utitle):
        """Sanitizes a video title so it could be used as part of a filename."""
        # Resolve HTML entities first, then neutralize path separators.
        decoded = re.sub(u'(?u)&(.+?);', htmlentity_transform, utitle)
        return decoded.replace(unicode(os.sep), u'%')
248
249
def sanitize_open(filename, open_mode):
        """Try to open the given filename, and slightly tweak it if this fails.

        Attempts to open the given filename. If this fails, it tries to change
        the filename slightly, step by step, until it's either able to open it
        or it fails and raises a final exception, like the standard open()
        function.

        It returns the tuple (stream, definitive_file_name).
        """
        try:
                if filename == u'-':
                        if sys.platform == 'win32':
                                # Switch stdout to binary mode so written video data
                                # is not mangled by newline translation
                                import msvcrt
                                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
                        return (sys.stdout, filename)
                stream = open(_encodeFilename(filename), open_mode)
                return (stream, filename)
        except (IOError, OSError), err:
                # In case of error, try to remove win32 forbidden chars
                filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

                # An exception here should be caught in the caller
                stream = open(_encodeFilename(filename), open_mode)
                return (stream, filename)
275
276
def timeconvert(timestr):
        """Convert RFC 2822 defined time string into system timestamp.

        Returns None when the string cannot be parsed.
        """
        timetuple = email.utils.parsedate_tz(timestr)
        if timetuple is None:
                return None
        return email.utils.mktime_tz(timetuple)
284
def _simplify_title(title):
        """Reduce a title to a filename-safe form: keep word characters and
        hyphens, collapse everything else into single underscores."""
        unsafe = re.compile(u'[^\\w\\d_\\-]+', flags=re.UNICODE)
        return unsafe.sub(u'_', title).strip(u'_')
288
def _orderedSet(iterable):
        """ Remove all duplicates from the input iterable """
        # Preserve first-seen order; a list scan keeps elements hashable-free.
        result = []
        for element in iterable:
                if element in result:
                        continue
                result.append(element)
        return result
296
def _unescapeHTML(s):
        """
        @param s a string (of type unicode)
        """
        assert type(s) == type(u'')

        # Delegate entity resolution to the stdlib HTML parser.
        return HTMLParser.HTMLParser().unescape(s)
305
def _encodeFilename(s):
        """
        @param s The name of the file (of type unicode)
        """

        assert type(s) == type(u'')

        if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
                # Pass u'' directly to use Unicode APIs on Windows 2000 and up
                # (Detecting Windows NT 4 is tricky because 'major >= 4' would
                # match Windows 9x series as well. Besides, NT 4 is obsolete.)
                return s
        return s.encode(sys.getfilesystemencoding(), 'ignore')
320
class DownloadError(Exception):
        """Download Error exception.

        Raised by FileDownloader objects when a download problem occurs and
        the downloader has not been configured to continue on errors; it
        carries the appropriate error message.
        """
        pass
329
330
class SameFileError(Exception):
        """Same File exception.

        Raised by FileDownloader objects when they detect that multiple
        files would have to be downloaded to the same file on disk.
        """
        pass
338
339
class PostProcessingError(Exception):
        """Post Processing exception.

        Raised by a PostProcessor's .run() method to indicate an error in
        the postprocessing task.
        """
        pass
347
class MaxDownloadsReached(Exception):
        """Raised once the --max-downloads limit has been reached."""
        pass
351
352
class UnavailableVideoError(Exception):
        """Unavailable Format exception.

        Raised when a video is requested in a format that is not available
        for that video.
        """
        pass
360
361
class ContentTooShortError(Exception):
        """Content Too Short exception.

        Raised by FileDownloader objects when a downloaded file is smaller
        than what the server announced, indicating the connection was
        probably interrupted.
        """
        # Both byte counts
        downloaded = None
        expected = None

        def __init__(self, downloaded, expected):
                self.downloaded, self.expected = downloaded, expected
376
377
class YoutubeDLHandler(urllib2.HTTPHandler):
        """Handler for HTTP requests and responses.

        This class, when installed with an OpenerDirector, automatically adds
        the standard headers to every HTTP request and handles gzipped and
        deflated responses from web servers. If compression is to be avoided in
        a particular request, the original request in the program code only has
        to include the HTTP header "Youtubedl-No-Compression", which will be
        removed before making the real request.

        Part of this code was copied from:

        http://techknack.net/python-urllib2-handlers/

        Andrew Rowls, the author of that code, agreed to release it to the
        public domain.
        """

        @staticmethod
        def deflate(data):
                # Servers send either a raw DEFLATE stream or a zlib-wrapped
                # one; try the raw form first and fall back to the wrapped one.
                try:
                        return zlib.decompress(data, -zlib.MAX_WBITS)
                except zlib.error:
                        return zlib.decompress(data)

        @staticmethod
        def addinfourl_wrapper(stream, headers, url, code):
                # Older Pythons' addinfourl has no code/getcode(); emulate it
                # by setting the attribute by hand when needed.
                if hasattr(urllib2.addinfourl, 'getcode'):
                        return urllib2.addinfourl(stream, headers, url, code)
                ret = urllib2.addinfourl(stream, headers, url)
                ret.code = code
                return ret

        def http_request(self, req):
                # Force our std_headers values, replacing any set by the caller.
                for h in std_headers:
                        if h in req.headers:
                                del req.headers[h]
                        req.add_header(h, std_headers[h])
                # urllib2 capitalize()s header names, hence the lowercase
                # '...-no-compression' spelling used for the lookup here.
                if 'Youtubedl-no-compression' in req.headers:
                        if 'Accept-encoding' in req.headers:
                                del req.headers['Accept-encoding']
                        del req.headers['Youtubedl-no-compression']
                return req

        def http_response(self, req, resp):
                old_resp = resp
                # gzip
                if resp.headers.get('Content-encoding', '') == 'gzip':
                        gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
                        resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
                        resp.msg = old_resp.msg
                # deflate
                if resp.headers.get('Content-encoding', '') == 'deflate':
                        gz = StringIO.StringIO(self.deflate(resp.read()))
                        resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
                        resp.msg = old_resp.msg
                return resp
435
436
437 class FileDownloader(object):
438         """File Downloader class.
439
440         File downloader objects are the ones responsible of downloading the
441         actual video file and writing it to disk if the user has requested
442         it, among some other tasks. In most cases there should be one per
443         program. As, given a video URL, the downloader doesn't know how to
444         extract all the needed information, task that InfoExtractors do, it
445         has to pass the URL to one of them.
446
447         For this, file downloader objects have a method that allows
448         InfoExtractors to be registered in a given order. When it is passed
449         a URL, the file downloader handles it to the first InfoExtractor it
450         finds that reports being able to handle it. The InfoExtractor extracts
451         all the information about the video or videos the URL refers to, and
452         asks the FileDownloader to process the video information, possibly
453         downloading the video.
454
455         File downloaders accept a lot of parameters. In order not to saturate
456         the object constructor with arguments, it receives a dictionary of
457         options instead. These options are available through the params
458         attribute for the InfoExtractors to use. The FileDownloader also
459         registers itself as the downloader in charge for the InfoExtractors
460         that are added to it, so this is a "mutual registration".
461
462         Available options:
463
464         username:         Username for authentication purposes.
465         password:         Password for authentication purposes.
466         usenetrc:         Use netrc for authentication instead.
467         quiet:            Do not print messages to stdout.
468         forceurl:         Force printing final URL.
469         forcetitle:       Force printing title.
470         forcethumbnail:   Force printing thumbnail URL.
471         forcedescription: Force printing description.
472         forcefilename:    Force printing final filename.
473         simulate:         Do not download the video files.
474         format:           Video format code.
475         format_limit:     Highest quality format to try.
476         outtmpl:          Template for output names.
477         ignoreerrors:     Do not stop on download errors.
478         ratelimit:        Download speed limit, in bytes/sec.
479         nooverwrites:     Prevent overwriting files.
480         retries:          Number of times to retry for HTTP error 5xx
481         continuedl:       Try to continue downloads if possible.
482         noprogress:       Do not print the progress bar.
483         playliststart:    Playlist item to start at.
484         playlistend:      Playlist item to end at.
485         matchtitle:       Download only matching titles.
486         rejecttitle:      Reject downloads for matching titles.
487         logtostderr:      Log messages to stderr instead of stdout.
488         consoletitle:     Display progress in console window's titlebar.
489         nopart:           Do not use temporary .part files.
490         updatetime:       Use the Last-modified header to set output file timestamps.
491         writedescription: Write the video description to a .description file
492         writeinfojson:    Write the video description to a .info.json file
493         writesubtitles:   Write the video subtitles to a .srt file
494         """
495
        # Class-level defaults; __init__ replaces all of these with
        # per-instance values.
        params = None               # option dictionary (see class docstring)
        _ies = []                   # registered InfoExtractors, in order
        _pps = []                   # registered PostProcessors, in order
        _download_retcode = None    # process exit code accumulated so far
        _num_downloads = None       # number of files downloaded this run
        _screen_file = None         # stream used by to_screen()
502
503         def __init__(self, params):
504                 """Create a FileDownloader object with the given options."""
505                 self._ies = []
506                 self._pps = []
507                 self._download_retcode = 0
508                 self._num_downloads = 0
509                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
510                 self.params = params
511
512         @staticmethod
513         def format_bytes(bytes):
514                 if bytes is None:
515                         return 'N/A'
516                 if type(bytes) is str:
517                         bytes = float(bytes)
518                 if bytes == 0.0:
519                         exponent = 0
520                 else:
521                         exponent = long(math.log(bytes, 1024.0))
522                 suffix = 'bkMGTPEZY'[exponent]
523                 converted = float(bytes) / float(1024 ** exponent)
524                 return '%.2f%s' % (converted, suffix)
525
526         @staticmethod
527         def calc_percent(byte_counter, data_len):
528                 if data_len is None:
529                         return '---.-%'
530                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
531
532         @staticmethod
533         def calc_eta(start, now, total, current):
534                 if total is None:
535                         return '--:--'
536                 dif = now - start
537                 if current == 0 or dif < 0.001: # One millisecond
538                         return '--:--'
539                 rate = float(current) / dif
540                 eta = long((float(total) - float(current)) / rate)
541                 (eta_mins, eta_secs) = divmod(eta, 60)
542                 if eta_mins > 99:
543                         return '--:--'
544                 return '%02d:%02d' % (eta_mins, eta_secs)
545
546         @staticmethod
547         def calc_speed(start, now, bytes):
548                 dif = now - start
549                 if bytes == 0 or dif < 0.001: # One millisecond
550                         return '%10s' % '---b/s'
551                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
552
553         @staticmethod
554         def best_block_size(elapsed_time, bytes):
555                 new_min = max(bytes / 2.0, 1.0)
556                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
557                 if elapsed_time < 0.001:
558                         return long(new_max)
559                 rate = bytes / elapsed_time
560                 if rate > new_max:
561                         return long(new_max)
562                 if rate < new_min:
563                         return long(new_min)
564                 return long(rate)
565
566         @staticmethod
567         def parse_bytes(bytestr):
568                 """Parse a string indicating a byte quantity into a long integer."""
569                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
570                 if matchobj is None:
571                         return None
572                 number = float(matchobj.group(1))
573                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
574                 return long(round(number * multiplier))
575
576         def add_info_extractor(self, ie):
577                 """Add an InfoExtractor object to the end of the list."""
578                 self._ies.append(ie)
579                 ie.set_downloader(self)
580
581         def add_post_processor(self, pp):
582                 """Add a PostProcessor object to the end of the chain."""
583                 self._pps.append(pp)
584                 pp.set_downloader(self)
585
586         def to_screen(self, message, skip_eol=False):
587                 """Print message to stdout if not in quiet mode."""
588                 assert type(message) == type(u'')
589                 if not self.params.get('quiet', False):
590                         terminator = [u'\n', u''][skip_eol]
591                         output = message + terminator
592
593                         if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
594                                 output = output.encode(preferredencoding(), 'ignore')
595                         self._screen_file.write(output)
596                         self._screen_file.flush()
597
        def to_stderr(self, message):
                """Print message to stderr, encoded for the current locale."""
                print >>sys.stderr, message.encode(preferredencoding())
601
        def to_cons_title(self, message):
                """Set console/terminal window title to message."""
                if not self.params.get('consoletitle', False):
                        return
                if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
                        # c_wchar_p() might not be necessary if `message` is
                        # already of type unicode()
                        ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
                elif 'TERM' in os.environ:
                        # xterm-compatible escape: OSC 0 sets icon name and title
                        sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
612
613         def fixed_template(self):
614                 """Checks if the output template is fixed."""
615                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
616
        def trouble(self, message=None):
                """Determine action to take when a download problem appears.

                Depending on if the downloader has been configured to ignore
                download errors or not, this method may throw an exception or
                not when errors are found, after printing the message.
                """
                if message is not None:
                        self.to_stderr(message)
                if not self.params.get('ignoreerrors', False):
                        raise DownloadError(message)
                # Only reached when errors are ignored: record the failure in
                # the process return code instead of aborting.
                self._download_retcode = 1
629
630         def slow_down(self, start_time, byte_counter):
631                 """Sleep if the download speed is over the rate limit."""
632                 rate_limit = self.params.get('ratelimit', None)
633                 if rate_limit is None or byte_counter == 0:
634                         return
635                 now = time.time()
636                 elapsed = now - start_time
637                 if elapsed <= 0.0:
638                         return
639                 speed = float(byte_counter) / elapsed
640                 if speed > rate_limit:
641                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
642
643         def temp_name(self, filename):
644                 """Returns a temporary filename for the given filename."""
645                 if self.params.get('nopart', False) or filename == u'-' or \
646                                 (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
647                         return filename
648                 return filename + u'.part'
649
650         def undo_temp_name(self, filename):
651                 if filename.endswith(u'.part'):
652                         return filename[:-len(u'.part')]
653                 return filename
654
        def try_rename(self, old_filename, new_filename):
                """Rename old_filename to new_filename, reporting (but not
                re-raising) any OS-level failure via trouble()."""
                try:
                        if old_filename == new_filename:
                                return
                        os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
                except (IOError, OSError), err:
                        self.trouble(u'ERROR: unable to rename file')
662
663         def try_utime(self, filename, last_modified_hdr):
664                 """Try to set the last-modified time of the given file."""
665                 if last_modified_hdr is None:
666                         return
667                 if not os.path.isfile(_encodeFilename(filename)):
668                         return
669                 timestr = last_modified_hdr
670                 if timestr is None:
671                         return
672                 filetime = timeconvert(timestr)
673                 if filetime is None:
674                         return filetime
675                 try:
676                         os.utime(filename, (time.time(), filetime))
677                 except:
678                         pass
679                 return filetime
680
681         def report_writedescription(self, descfn):
682                 """ Report that the description file is being written """
683                 self.to_screen(u'[info] Writing video description to: ' + descfn)
684
685         def report_writesubtitles(self, srtfn):
686                 """ Report that the subtitles file is being written """
687                 self.to_screen(u'[info] Writing video subtitles to: ' + srtfn)
688
689         def report_writeinfojson(self, infofn):
690                 """ Report that the metadata file has been written """
691                 self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
692
693         def report_destination(self, filename):
694                 """Report destination filename."""
695                 self.to_screen(u'[download] Destination: ' + filename)
696
697         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
698                 """Report download progress."""
699                 if self.params.get('noprogress', False):
700                         return
701                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
702                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
703                 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
704                                 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
705
706         def report_resuming_byte(self, resume_len):
707                 """Report attempt to resume at given byte."""
708                 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
709
710         def report_retry(self, count, retries):
711                 """Report retry in case of HTTP error 5xx"""
712                 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
713
714         def report_file_already_downloaded(self, file_name):
715                 """Report file has already been fully downloaded."""
716                 try:
717                         self.to_screen(u'[download] %s has already been downloaded' % file_name)
718                 except (UnicodeEncodeError), err:
719                         self.to_screen(u'[download] The file has already been downloaded')
720
721         def report_unable_to_resume(self):
722                 """Report it was impossible to resume download."""
723                 self.to_screen(u'[download] Unable to resume')
724
725         def report_finish(self):
726                 """Report download finished."""
727                 if self.params.get('noprogress', False):
728                         self.to_screen(u'[download] Download completed')
729                 else:
730                         self.to_screen(u'')
731
732         def increment_downloads(self):
733                 """Increment the ordinal that assigns a number to each file."""
734                 self._num_downloads += 1
735
736         def prepare_filename(self, info_dict):
737                 """Generate the output filename."""
738                 try:
739                         template_dict = dict(info_dict)
740                         template_dict['epoch'] = unicode(long(time.time()))
741                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
742                         filename = self.params['outtmpl'] % template_dict
743                         return filename
744                 except (ValueError, KeyError), err:
745                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
746                         return None
747
748         def _match_entry(self, info_dict):
749                 """ Returns None iff the file should be downloaded """
750
751                 title = info_dict['title']
752                 matchtitle = self.params.get('matchtitle', False)
753                 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
754                         return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
755                 rejecttitle = self.params.get('rejecttitle', False)
756                 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
757                         return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
758                 return None
759
760         def process_info(self, info_dict):
761                 """Process a single dictionary returned by an InfoExtractor."""
762
763                 reason = self._match_entry(info_dict)
764                 if reason is not None:
765                         self.to_screen(u'[download] ' + reason)
766                         return
767
768                 max_downloads = self.params.get('max_downloads')
769                 if max_downloads is not None:
770                         if self._num_downloads > int(max_downloads):
771                                 raise MaxDownloadsReached()
772
773                 filename = self.prepare_filename(info_dict)
774                 
775                 # Forced printings
776                 if self.params.get('forcetitle', False):
777                         print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
778                 if self.params.get('forceurl', False):
779                         print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
780                 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
781                         print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
782                 if self.params.get('forcedescription', False) and 'description' in info_dict:
783                         print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
784                 if self.params.get('forcefilename', False) and filename is not None:
785                         print filename.encode(preferredencoding(), 'xmlcharrefreplace')
786                 if self.params.get('forceformat', False):
787                         print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
788
789                 # Do nothing else if in simulate mode
790                 if self.params.get('simulate', False):
791                         return
792
793                 if filename is None:
794                         return
795
796                 try:
797                         dn = os.path.dirname(_encodeFilename(filename))
798                         if dn != '' and not os.path.exists(dn): # dn is already encoded
799                                 os.makedirs(dn)
800                 except (OSError, IOError), err:
801                         self.trouble(u'ERROR: unable to create directory ' + unicode(err))
802                         return
803
804                 if self.params.get('writedescription', False):
805                         try:
806                                 descfn = filename + u'.description'
807                                 self.report_writedescription(descfn)
808                                 descfile = open(_encodeFilename(descfn), 'wb')
809                                 try:
810                                         descfile.write(info_dict['description'].encode('utf-8'))
811                                 finally:
812                                         descfile.close()
813                         except (OSError, IOError):
814                                 self.trouble(u'ERROR: Cannot write description file ' + descfn)
815                                 return
816                                 
817                 if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']:
818                         # subtitles download errors are already managed as troubles in relevant IE
819                         # that way it will silently go on when used with unsupporting IE 
820                         try:
821                                 srtfn = filename.rsplit('.', 1)[0] + u'.srt'
822                                 self.report_writesubtitles(srtfn)
823                                 srtfile = open(_encodeFilename(srtfn), 'wb')
824                                 try:
825                                         srtfile.write(info_dict['subtitles'].encode('utf-8'))
826                                 finally:
827                                         srtfile.close()
828                         except (OSError, IOError):
829                                 self.trouble(u'ERROR: Cannot write subtitles file ' + descfn)
830                                 return
831
832                 if self.params.get('writeinfojson', False):
833                         infofn = filename + u'.info.json'
834                         self.report_writeinfojson(infofn)
835                         try:
836                                 json.dump
837                         except (NameError,AttributeError):
838                                 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
839                                 return
840                         try:
841                                 infof = open(_encodeFilename(infofn), 'wb')
842                                 try:
843                                         json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
844                                         json.dump(json_info_dict, infof)
845                                 finally:
846                                         infof.close()
847                         except (OSError, IOError):
848                                 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
849                                 return
850
851                 if not self.params.get('skip_download', False):
852                         if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
853                                 success = True
854                         else:
855                                 try:
856                                         success = self._do_download(filename, info_dict)
857                                 except (OSError, IOError), err:
858                                         raise UnavailableVideoError
859                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
860                                         self.trouble(u'ERROR: unable to download video data: %s' % str(err))
861                                         return
862                                 except (ContentTooShortError, ), err:
863                                         self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
864                                         return
865         
866                         if success:
867                                 try:
868                                         self.post_process(filename, info_dict)
869                                 except (PostProcessingError), err:
870                                         self.trouble(u'ERROR: postprocessing: %s' % str(err))
871                                         return
872
873         def download(self, url_list):
874                 """Download a given list of URLs."""
875                 if len(url_list) > 1 and self.fixed_template():
876                         raise SameFileError(self.params['outtmpl'])
877
878                 for url in url_list:
879                         suitable_found = False
880                         for ie in self._ies:
881                                 # Go to next InfoExtractor if not suitable
882                                 if not ie.suitable(url):
883                                         continue
884
885                                 # Suitable InfoExtractor found
886                                 suitable_found = True
887
888                                 # Extract information from URL and process it
889                                 ie.extract(url)
890
891                                 # Suitable InfoExtractor had been found; go to next URL
892                                 break
893
894                         if not suitable_found:
895                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
896
897                 return self._download_retcode
898
899         def post_process(self, filename, ie_info):
900                 """Run the postprocessing chain on the given file."""
901                 info = dict(ie_info)
902                 info['filepath'] = filename
903                 for pp in self._pps:
904                         info = pp.run(info)
905                         if info is None:
906                                 break
907
	def _download_with_rtmpdump(self, filename, url, player_url):
		"""Download an RTMP stream by shelling out to the external
		rtmpdump tool, resuming in a loop until it succeeds or stalls.
		Returns True on success, False on failure."""
		self.report_destination(filename)
		tmpfilename = self.temp_name(filename)

		# Check for rtmpdump first
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrupted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		# The [[], [...]][bool] idiom selects the extra arguments only
		# when the condition is true.
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
		args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
		if self.params.get('verbose', False):
			try:
				import pipes
				shell_quote = lambda args: ' '.join(map(pipes.quote, args))
			except ImportError:
				shell_quote = repr
			self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
		retval = subprocess.call(args)
		# Keep resuming (-e) while rtmpdump reports an incomplete
		# transfer; stop when the file size no longer grows.
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(_encodeFilename(tmpfilename))
			self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(_encodeFilename(tmpfilename))
			if prevsize == cursize and retval == 1:
				break
			# Some rtmp streams seem to abort after ~ 99.8%. Don't complain for those
			if prevsize == cursize and retval == 2 and cursize > 1024:
				self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
				retval = 0
				break
		if retval == 0:
			self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
			self.try_rename(tmpfilename, filename)
			return True
		else:
			self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
			return False
952
	def _do_download(self, filename, info_dict):
		"""Download info_dict['url'] over HTTP into the given filename.

		Handles resuming via Range requests, retries on 5xx errors,
		adaptive block sizing, progress reporting, rate limiting and
		setting the file modification time.  RTMP URLs are delegated to
		_download_with_rtmpdump().  Returns True on success, False on
		failure; raises ContentTooShortError when fewer bytes than the
		announced Content-Length were received.
		"""
		url = info_dict['url']
		player_url = info_dict.get('player_url', None)

		# Check file already present
		if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
			self.report_file_already_downloaded(filename)
			return True

		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url, player_url)

		tmpfilename = self.temp_name(filename)
		stream = None

		# Do not include the Accept-Encoding header
		headers = {'Youtubedl-no-compression': 'True'}
		# basic_request is kept without the Range header so we can retry
		# from scratch if the server rejects the requested range.
		basic_request = urllib2.Request(url, None, headers)
		request = urllib2.Request(url, None, headers)

		# Establish possible resume length
		if os.path.isfile(_encodeFilename(tmpfilename)):
			resume_len = os.path.getsize(_encodeFilename(tmpfilename))
		else:
			resume_len = 0

		open_mode = 'wb'
		if resume_len != 0:
			if self.params.get('continuedl', False):
				self.report_resuming_byte(resume_len)
				request.add_header('Range','bytes=%d-' % resume_len)
				open_mode = 'ab'
			else:
				resume_len = 0

		count = 0
		retries = self.params.get('retries', 0)
		while count <= retries:
			# Establish connection
			try:
				if count == 0 and 'urlhandle' in info_dict:
					data = info_dict['urlhandle']
				# NOTE(review): the assignment below unconditionally
				# overwrites the 'urlhandle' value assigned above, so a
				# pre-opened handle is never actually used -- looks like a
				# bug; confirm intent before changing.
				data = urllib2.urlopen(request)
				break
			except (urllib2.HTTPError, ), err:
				if (err.code < 500 or err.code >= 600) and err.code != 416:
					# Unexpected HTTP error
					raise
				elif err.code == 416:
					# Unable to resume (requested range not satisfiable)
					try:
						# Open the connection again without the range header
						data = urllib2.urlopen(basic_request)
						content_length = data.info()['Content-Length']
					except (urllib2.HTTPError, ), err:
						if err.code < 500 or err.code >= 600:
							raise
					else:
						# Examine the reported length
						if (content_length is not None and
								(resume_len - 100 < long(content_length) < resume_len + 100)):
							# The file had already been fully downloaded.
							# Explanation to the above condition: in issue #175 it was revealed that
							# YouTube sometimes adds or removes a few bytes from the end of the file,
							# changing the file size slightly and causing problems for some users. So
							# I decided to implement a suggested change and consider the file
							# completely downloaded if the file size differs less than 100 bytes from
							# the one in the hard drive.
							self.report_file_already_downloaded(filename)
							self.try_rename(tmpfilename, filename)
							return True
						else:
							# The length does not match, we start the download over
							self.report_unable_to_resume()
							open_mode = 'wb'
							break
			# Retry
			count += 1
			if count <= retries:
				self.report_retry(count, retries)

		if count > retries:
			self.trouble(u'ERROR: giving up after %s retries' % retries)
			return False

		data_len = data.info().get('Content-length', None)
		if data_len is not None:
			data_len = long(data_len) + resume_len
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0 + resume_len
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			if len(data_block) == 0:
				break
			byte_counter += len(data_block)

			# Open file just in time
			if stream is None:
				try:
					(stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
					assert stream is not None
					filename = self.undo_temp_name(tmpfilename)
					self.report_destination(filename)
				except (OSError, IOError), err:
					self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
					return False
			try:
				stream.write(data_block)
			except (IOError, OSError), err:
				self.trouble(u'\nERROR: unable to write data: %s' % str(err))
				return False
			# Adapt the block size to the measured throughput.
			block_size = self.best_block_size(after - before, len(data_block))

			# Progress message
			speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
			if data_len is None:
				self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
			else:
				percent_str = self.calc_percent(byte_counter, data_len)
				eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
				self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter - resume_len)

		if stream is None:
			self.trouble(u'\nERROR: Did not get any data blocks')
			return False
		stream.close()
		self.report_finish()
		if data_len is not None and byte_counter != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		self.try_rename(tmpfilename, filename)

		# Update file modification time
		if self.params.get('updatetime', True):
			info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))

		return True
1098
1099
class InfoExtractor(object):
	"""Base class for all information extractors.

	An information extractor receives a URL, extracts information about
	the video(s) it refers to (real video URL, title, simplified title,
	uploader, ...) and hands a dictionary over to the FileDownloader,
	which may then download the video to the file system, among other
	possible outcomes.  The dictionaries must include these fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	The following fields are optional; their primary purpose is to allow
	youtube-dl to serve as the backend for a video search function, such
	as the one in youtube2mp3.  They are only used when their respective
	forced printing functions are called:

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Subclasses should re-define _real_initialize() and _real_extract()
	and define a _VALID_URL regexp; they should probably also be added
	to the list of extractors.
	"""

	# Whether _real_initialize() has run for this instance.
	_ready = False
	# The FileDownloader this IE reports to (may stay None).
	_downloader = None

	def __init__(self, downloader=None):
		"""Create the extractor; a downloader may be attached right away."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Return True if this IE can handle the given URL."""
		match = re.match(self._VALID_URL, url)
		return match is not None

	def initialize(self):
		"""Run one-time initialization (authentication, etc) if needed."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extract URL information and return it as a list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Attach the FileDownloader this IE should report to."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
1168
1169
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Matches bare video ids, youtu.be short links, /v/, /embed/ and /e/
	# URLs, and watch pages, while excluding playlist/artist pages; the
	# video id itself is captured by the second group.
	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
	# URL used to force the site language to English (hl=en) so the
	# scraping below sees predictable page text.
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	# Machine name used to look up credentials in ~/.netrc.
	_NETRC_MACHINE = 'youtube'
	# Format codes, listed in order of quality (best first)
	_available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
	# Same codes, but preferring the WebM formats over same-quality others.
	_available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
	# Format code -> container/filename extension.
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'44': 'webm',
		'45': 'webm',
	}
	# Format code -> video dimensions as 'height x width'.
	_video_dimensions = {
		'5': '240x400',
		'6': '???',
		'13': '???',
		'17': '144x176',
		'18': '360x640',
		'22': '720x1280',
		'34': '360x640',
		'35': '480x854',
		'37': '1080x1920',
		'38': '3072x4096',
		'43': '360x640',
		'44': '480x854',
		'45': '720x1280',
	}
	IE_NAME = u'youtube'
1208
1209         def report_lang(self):
1210                 """Report attempt to set language."""
1211                 self._downloader.to_screen(u'[youtube] Setting language')
1212
1213         def report_login(self):
1214                 """Report attempt to log in."""
1215                 self._downloader.to_screen(u'[youtube] Logging in')
1216
1217         def report_age_confirmation(self):
1218                 """Report attempt to confirm age."""
1219                 self._downloader.to_screen(u'[youtube] Confirming age')
1220
1221         def report_video_webpage_download(self, video_id):
1222                 """Report attempt to download video webpage."""
1223                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1224
1225         def report_video_info_webpage_download(self, video_id):
1226                 """Report attempt to download video info webpage."""
1227                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1228
	def report_video_subtitles_download(self, video_id):
		"""Report attempt to download video subtitles."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
1232
1233         def report_information_extraction(self, video_id):
1234                 """Report attempt to extract video information."""
1235                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1236
	def report_unavailable_format(self, video_id, format):
		"""Report that the requested video format is not available."""
		self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1240
1241         def report_rtmp_download(self):
1242                 """Indicate the download will use the RTMP protocol."""
1243                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1244
1245         def _closed_captions_xml_to_srt(self, xml_string):
1246                 srt = ''
1247                 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
1248                 # TODO parse xml instead of regex
1249                 for n, (start, dur_tag, dur, caption) in enumerate(texts):
1250                         if not dur: dur = '4'
1251                         start = float(start)
1252                         end = start + float(dur)
1253                         start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
1254                         end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
1255                         caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
1256                         caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, inentional
1257                         srt += str(n) + '\n'
1258                         srt += start + ' --> ' + end + '\n'
1259                         srt += caption + '\n\n'
1260                 return srt
1261
1262         def _print_formats(self, formats):
1263                 print 'Available formats:'
1264                 for x in formats:
1265                         print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1266
	def _real_initialize(self):
		"""Prepare the extractor: set YouTube's language preference and, when
		credentials are available, log in and confirm age.

		Credentials come from downloader params (--username/--password) or,
		with 'usenetrc', from the ~/.netrc entry for this machine.  Any
		failure is reported and aborts the remaining initialization steps.
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			# Response body is discarded; the request matters only for its
			# server-side effect (presumably a language cookie — confirm).
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':		'/',
				'action_login':	'Log In',
				'username':	username,
				'password':	password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the login form is still present in the response, the
			# credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':		'/',
				'action_confirm':	'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return
1335
1336         def _real_extract(self, url):
1337                 # Extract video id from URL
1338                 mobj = re.match(self._VALID_URL, url)
1339                 if mobj is None:
1340                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1341                         return
1342                 video_id = mobj.group(2)
1343
1344                 # Get video webpage
1345                 self.report_video_webpage_download(video_id)
1346                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1347                 try:
1348                         video_webpage = urllib2.urlopen(request).read()
1349                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1350                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1351                         return
1352
1353                 # Attempt to extract SWF player URL
1354                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1355                 if mobj is not None:
1356                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1357                 else:
1358                         player_url = None
1359
1360                 # Get video info
1361                 self.report_video_info_webpage_download(video_id)
1362                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1363                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1364                                         % (video_id, el_type))
1365                         request = urllib2.Request(video_info_url)
1366                         try:
1367                                 video_info_webpage = urllib2.urlopen(request).read()
1368                                 video_info = parse_qs(video_info_webpage)
1369                                 if 'token' in video_info:
1370                                         break
1371                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1372                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1373                                 return
1374                 if 'token' not in video_info:
1375                         if 'reason' in video_info:
1376                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1377                         else:
1378                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1379                         return
1380
1381                 # Start extracting information
1382                 self.report_information_extraction(video_id)
1383
1384                 # uploader
1385                 if 'author' not in video_info:
1386                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1387                         return
1388                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1389
1390                 # title
1391                 if 'title' not in video_info:
1392                         self._downloader.trouble(u'ERROR: unable to extract video title')
1393                         return
1394                 video_title = urllib.unquote_plus(video_info['title'][0])
1395                 video_title = video_title.decode('utf-8')
1396                 video_title = sanitize_title(video_title)
1397
1398                 # simplified title
1399                 simple_title = _simplify_title(video_title)
1400
1401                 # thumbnail image
1402                 if 'thumbnail_url' not in video_info:
1403                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1404                         video_thumbnail = ''
1405                 else:   # don't panic if we can't find it
1406                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1407
1408                 # upload date
1409                 upload_date = u'NA'
1410                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1411                 if mobj is not None:
1412                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1413                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1414                         for expression in format_expressions:
1415                                 try:
1416                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1417                                 except:
1418                                         pass
1419
1420                 # description
1421                 try:
1422                         lxml.etree
1423                 except NameError:
1424                         video_description = u'No description available.'
1425                         mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
1426                         if mobj is not None:
1427                                 video_description = mobj.group(1).decode('utf-8')
1428                 else:
1429                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1430                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1431                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1432                         # TODO use another parser
1433                         
1434                 # closed captions
1435                 video_subtitles = None
1436                 if self._downloader.params.get('writesubtitles', False):
1437                         self.report_video_subtitles_download(video_id)
1438                         request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
1439                         try:
1440                                 srt_list = urllib2.urlopen(request).read()
1441                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1442                                 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1443                         else:
1444                                 srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
1445                                 if srt_lang_list:
1446                                         if 'en' in srt_lang_list: srt_lang = 'en'
1447                                         else: srt_lang = srt_lang_list[0] # TODO choose better and provide an override
1448                                         request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
1449                                         try:
1450                                                 srt_xml = urllib2.urlopen(request).read()
1451                                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1452                                                 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1453                                         else:
1454                                                 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
1455                                 else:
1456                                         self._downloader.trouble(u'WARNING: video has no subtitles')
1457
1458                 # token
1459                 video_token = urllib.unquote_plus(video_info['token'][0])
1460
1461                 # Decide which formats to download
1462                 req_format = self._downloader.params.get('format', None)
1463
1464                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1465                         self.report_rtmp_download()
1466                         video_url_list = [(None, video_info['conn'][0])]
1467                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1468                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1469                         url_data = [parse_qs(uds) for uds in url_data_strs]
1470                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1471                         url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1472
1473                         format_limit = self._downloader.params.get('format_limit', None)
1474                         available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1475                         if format_limit is not None and format_limit in available_formats:
1476                                 format_list = available_formats[available_formats.index(format_limit):]
1477                         else:
1478                                 format_list = available_formats
1479                         existing_formats = [x for x in format_list if x in url_map]
1480                         if len(existing_formats) == 0:
1481                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1482                                 return
1483                         if self._downloader.params.get('listformats', None):
1484                                 self._print_formats(existing_formats)
1485                                 return
1486                         if req_format is None or req_format == 'best':
1487                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1488                         elif req_format == 'worst':
1489                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1490                         elif req_format in ('-1', 'all'):
1491                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1492                         else:
1493                                 # Specific formats. We pick the first in a slash-delimeted sequence.
1494                                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1495                                 req_formats = req_format.split('/')
1496                                 video_url_list = None
1497                                 for rf in req_formats:
1498                                         if rf in url_map:
1499                                                 video_url_list = [(rf, url_map[rf])]
1500                                                 break
1501                                 if video_url_list is None:
1502                                         self._downloader.trouble(u'ERROR: requested format not available')
1503                                         return
1504                 else:
1505                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1506                         return
1507
1508                 for format_param, video_real_url in video_url_list:
1509                         # At this point we have a new video
1510                         self._downloader.increment_downloads()
1511
1512                         # Extension
1513                         video_extension = self._video_extensions.get(format_param, 'flv')
1514
1515                         try:
1516                                 # Process video information
1517                                 self._downloader.process_info({
1518                                         'id':           video_id.decode('utf-8'),
1519                                         'url':          video_real_url.decode('utf-8'),
1520                                         'uploader':     video_uploader.decode('utf-8'),
1521                                         'upload_date':  upload_date,
1522                                         'title':        video_title,
1523                                         'stitle':       simple_title,
1524                                         'ext':          video_extension.decode('utf-8'),
1525                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1526                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1527                                         'description':  video_description,
1528                                         'player_url':   player_url,
1529                                         'subtitles':    video_subtitles
1530                                 })
1531                         except UnavailableVideoError, err:
1532                                 self._downloader.trouble(u'\nERROR: unable to download video')
1533
1534
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com."""

	# Group 1 is the video id, group 2 a URL-embedded simplified title.
	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
	_youtube_ie = None
	IE_NAME = u'metacafe'

	def __init__(self, youtube_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		# YouTube extractor to which "yt-<id>" metacafe videos are delegated.
		self._youtube_ie = youtube_ie

	def report_disclaimer(self):
		"""Report disclaimer retrieval."""
		self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[metacafe] Confirming age')

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

	def _real_initialize(self):
		"""Fetch the disclaimer page and POST the family-filter form.

		Both response bodies are discarded — the requests are made for their
		server-side effect only (presumably session cookies; verify).
		"""
		# Retrieve disclaimer
		request = urllib2.Request(self._DISCLAIMER)
		try:
			self.report_disclaimer()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
			return

		# Confirm age
		disclaimer_form = {
			'filters': '0',
			'submit': "Continue - I'm over 18",
			}
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
		try:
			self.report_age_confirmation()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract media URL, title and uploader from a metacafe watch page
		and hand the result to the downloader."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		# Check if video comes from YouTube
		mobj2 = re.match(r'^yt-(.*)$', video_id)
		if mobj2 is not None:
			# Delegate YouTube-hosted videos to the YouTube extractor.
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
			return

		# At this point we have a new video
		self._downloader.increment_downloads()

		simple_title = mobj.group(2).decode('utf-8')

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
		if mobj is not None:
			mediaURL = urllib.unquote(mobj.group(1))
			# Extension taken from the last three characters of the URL.
			video_extension = mediaURL[-3:]

			# Extract gdaKey if available
			mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
			if mobj is None:
				video_url = mediaURL
			else:
				gdaKey = mobj.group(1)
				video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
		else:
			# Fallback: pull mediaURL and key out of the flashvars blob.
			mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			vardict = parse_qs(mobj.group(1))
			if 'mediaData' not in vardict:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			mediaURL = mobj.group(1).replace('\\/', '/')
			video_extension = mediaURL[-3:]
			video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

		mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1675
1676
class DailymotionIE(InfoExtractor):
	"""Information Extractor for Dailymotion"""

	# Group 1 is the video id (before the first underscore of the slug).
	_VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
	IE_NAME = u'dailymotion'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

	def _real_extract(self, url):
		"""Scrape a Dailymotion page for the SD media URL, title and uploader
		and hand the result to the downloader."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		video_extension = 'flv'

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url)
		# Disable the family filter so restricted videos are still served.
		request.add_header('Cookie', 'family_filter=off')
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		sequence = urllib.unquote(mobj.group(1))
		# The "sdURL" entry inside the sequence blob holds the SD media URL.
		mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

		# if needed add http://www.dailymotion.com/ if relative URL

		video_url = mediaURL

		mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
		video_title = sanitize_title(video_title)
		simple_title = _simplify_title(video_title)

		mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1763
1764
1765 class GoogleIE(InfoExtractor):
1766         """Information extractor for video.google.com."""
1767
1768         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1769         IE_NAME = u'video.google'
1770
	def __init__(self, downloader=None):
		# All initialization is handled by the InfoExtractor base class
		# (defined elsewhere in this file).
		InfoExtractor.__init__(self, downloader)
1773
1774         def report_download_webpage(self, video_id):
1775                 """Report webpage download."""
1776                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1777
1778         def report_extraction(self, video_id):
1779                 """Report information extraction."""
1780                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1781
1782         def _real_extract(self, url):
1783                 # Extract id from URL
1784                 mobj = re.match(self._VALID_URL, url)
1785                 if mobj is None:
1786                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1787                         return
1788
1789                 # At this point we have a new video
1790                 self._downloader.increment_downloads()
1791                 video_id = mobj.group(1)
1792
1793                 video_extension = 'mp4'
1794
1795                 # Retrieve video webpage to extract further information
1796                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1797                 try:
1798                         self.report_download_webpage(video_id)
1799                         webpage = urllib2.urlopen(request).read()
1800                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1801                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1802                         return
1803
1804                 # Extract URL, uploader, and title from webpage
1805                 self.report_extraction(video_id)
1806                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1807                 if mobj is None:
1808                         video_extension = 'flv'
1809                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1810                 if mobj is None:
1811                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1812                         return
1813                 mediaURL = urllib.unquote(mobj.group(1))
1814                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1815                 mediaURL = mediaURL.replace('\\x26', '\x26')
1816
1817                 video_url = mediaURL
1818
1819                 mobj = re.search(r'<title>(.*)</title>', webpage)
1820                 if mobj is None:
1821                         self._downloader.trouble(u'ERROR: unable to extract title')
1822                         return
1823                 video_title = mobj.group(1).decode('utf-8')
1824                 video_title = sanitize_title(video_title)
1825                 simple_title = _simplify_title(video_title)
1826
1827                 # Extract video description
1828                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1829                 if mobj is None:
1830                         self._downloader.trouble(u'ERROR: unable to extract video description')
1831                         return
1832                 video_description = mobj.group(1).decode('utf-8')
1833                 if not video_description:
1834                         video_description = 'No description available.'
1835
1836                 # Extract video thumbnail
1837                 if self._downloader.params.get('forcethumbnail', False):
1838                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1839                         try:
1840                                 webpage = urllib2.urlopen(request).read()
1841                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1842                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1843                                 return
1844                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1845                         if mobj is None:
1846                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1847                                 return
1848                         video_thumbnail = mobj.group(1)
1849                 else:   # we need something to pass to process_info
1850                         video_thumbnail = ''
1851
1852                 try:
1853                         # Process video information
1854                         self._downloader.process_info({
1855                                 'id':           video_id.decode('utf-8'),
1856                                 'url':          video_url.decode('utf-8'),
1857                                 'uploader':     u'NA',
1858                                 'upload_date':  u'NA',
1859                                 'title':        video_title,
1860                                 'stitle':       simple_title,
1861                                 'ext':          video_extension.decode('utf-8'),
1862                                 'format':       u'NA',
1863                                 'player_url':   None,
1864                         })
1865                 except UnavailableVideoError:
1866                         self._downloader.trouble(u'\nERROR: unable to download video')
1867
1868
1869 class PhotobucketIE(InfoExtractor):
1870         """Information extractor for photobucket.com."""
1871
1872         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1873         IE_NAME = u'photobucket'
1874
1875         def __init__(self, downloader=None):
1876                 InfoExtractor.__init__(self, downloader)
1877
1878         def report_download_webpage(self, video_id):
1879                 """Report webpage download."""
1880                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1881
1882         def report_extraction(self, video_id):
1883                 """Report information extraction."""
1884                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1885
1886         def _real_extract(self, url):
1887                 # Extract id from URL
1888                 mobj = re.match(self._VALID_URL, url)
1889                 if mobj is None:
1890                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1891                         return
1892
1893                 # At this point we have a new video
1894                 self._downloader.increment_downloads()
1895                 video_id = mobj.group(1)
1896
1897                 video_extension = 'flv'
1898
1899                 # Retrieve video webpage to extract further information
1900                 request = urllib2.Request(url)
1901                 try:
1902                         self.report_download_webpage(video_id)
1903                         webpage = urllib2.urlopen(request).read()
1904                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1905                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1906                         return
1907
1908                 # Extract URL, uploader, and title from webpage
1909                 self.report_extraction(video_id)
1910                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1911                 if mobj is None:
1912                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1913                         return
1914                 mediaURL = urllib.unquote(mobj.group(1))
1915
1916                 video_url = mediaURL
1917
1918                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1919                 if mobj is None:
1920                         self._downloader.trouble(u'ERROR: unable to extract title')
1921                         return
1922                 video_title = mobj.group(1).decode('utf-8')
1923                 video_title = sanitize_title(video_title)
1924                 simple_title = _simplify_title(vide_title)
1925
1926                 video_uploader = mobj.group(2).decode('utf-8')
1927
1928                 try:
1929                         # Process video information
1930                         self._downloader.process_info({
1931                                 'id':           video_id.decode('utf-8'),
1932                                 'url':          video_url.decode('utf-8'),
1933                                 'uploader':     video_uploader,
1934                                 'upload_date':  u'NA',
1935                                 'title':        video_title,
1936                                 'stitle':       simple_title,
1937                                 'ext':          video_extension.decode('utf-8'),
1938                                 'format':       u'NA',
1939                                 'player_url':   None,
1940                         })
1941                 except UnavailableVideoError:
1942                         self._downloader.trouble(u'\nERROR: unable to download video')
1943
1944
1945 class YahooIE(InfoExtractor):
1946         """Information extractor for video.yahoo.com."""
1947
1948         # _VALID_URL matches all Yahoo! Video URLs
1949         # _VPAGE_URL matches only the extractable '/watch/' URLs
1950         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1951         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1952         IE_NAME = u'video.yahoo'
1953
1954         def __init__(self, downloader=None):
1955                 InfoExtractor.__init__(self, downloader)
1956
1957         def report_download_webpage(self, video_id):
1958                 """Report webpage download."""
1959                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1960
1961         def report_extraction(self, video_id):
1962                 """Report information extraction."""
1963                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1964
1965         def _real_extract(self, url, new_video=True):
1966                 # Extract ID from URL
1967                 mobj = re.match(self._VALID_URL, url)
1968                 if mobj is None:
1969                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1970                         return
1971
1972                 # At this point we have a new video
1973                 self._downloader.increment_downloads()
1974                 video_id = mobj.group(2)
1975                 video_extension = 'flv'
1976
1977                 # Rewrite valid but non-extractable URLs as
1978                 # extractable English language /watch/ URLs
1979                 if re.match(self._VPAGE_URL, url) is None:
1980                         request = urllib2.Request(url)
1981                         try:
1982                                 webpage = urllib2.urlopen(request).read()
1983                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1984                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1985                                 return
1986
1987                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1988                         if mobj is None:
1989                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1990                                 return
1991                         yahoo_id = mobj.group(1)
1992
1993                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1994                         if mobj is None:
1995                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1996                                 return
1997                         yahoo_vid = mobj.group(1)
1998
1999                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
2000                         return self._real_extract(url, new_video=False)
2001
2002                 # Retrieve video webpage to extract further information
2003                 request = urllib2.Request(url)
2004                 try:
2005                         self.report_download_webpage(video_id)
2006                         webpage = urllib2.urlopen(request).read()
2007                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2008                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2009                         return
2010
2011                 # Extract uploader and title from webpage
2012                 self.report_extraction(video_id)
2013                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
2014                 if mobj is None:
2015                         self._downloader.trouble(u'ERROR: unable to extract video title')
2016                         return
2017                 video_title = mobj.group(1).decode('utf-8')
2018                 simple_title = _simplify_title(video_title)
2019
2020                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
2021                 if mobj is None:
2022                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
2023                         return
2024                 video_uploader = mobj.group(1).decode('utf-8')
2025
2026                 # Extract video thumbnail
2027                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
2028                 if mobj is None:
2029                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2030                         return
2031                 video_thumbnail = mobj.group(1).decode('utf-8')
2032
2033                 # Extract video description
2034                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
2035                 if mobj is None:
2036                         self._downloader.trouble(u'ERROR: unable to extract video description')
2037                         return
2038                 video_description = mobj.group(1).decode('utf-8')
2039                 if not video_description:
2040                         video_description = 'No description available.'
2041
2042                 # Extract video height and width
2043                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
2044                 if mobj is None:
2045                         self._downloader.trouble(u'ERROR: unable to extract video height')
2046                         return
2047                 yv_video_height = mobj.group(1)
2048
2049                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
2050                 if mobj is None:
2051                         self._downloader.trouble(u'ERROR: unable to extract video width')
2052                         return
2053                 yv_video_width = mobj.group(1)
2054
2055                 # Retrieve video playlist to extract media URL
2056                 # I'm not completely sure what all these options are, but we
2057                 # seem to need most of them, otherwise the server sends a 401.
2058                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
2059                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
2060                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
2061                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
2062                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
2063                 try:
2064                         self.report_download_webpage(video_id)
2065                         webpage = urllib2.urlopen(request).read()
2066                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2067                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2068                         return
2069
2070                 # Extract media URL from playlist XML
2071                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
2072                 if mobj is None:
2073                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
2074                         return
2075                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
2076                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
2077
2078                 try:
2079                         # Process video information
2080                         self._downloader.process_info({
2081                                 'id':           video_id.decode('utf-8'),
2082                                 'url':          video_url,
2083                                 'uploader':     video_uploader,
2084                                 'upload_date':  u'NA',
2085                                 'title':        video_title,
2086                                 'stitle':       simple_title,
2087                                 'ext':          video_extension.decode('utf-8'),
2088                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2089                                 'description':  video_description,
2090                                 'thumbnail':    video_thumbnail,
2091                                 'player_url':   None,
2092                         })
2093                 except UnavailableVideoError:
2094                         self._downloader.trouble(u'\nERROR: unable to download video')
2095
2096
2097 class VimeoIE(InfoExtractor):
2098         """Information extractor for vimeo.com."""
2099
2100         # _VALID_URL matches Vimeo URLs
2101         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2102         IE_NAME = u'vimeo'
2103
2104         def __init__(self, downloader=None):
2105                 InfoExtractor.__init__(self, downloader)
2106
2107         def report_download_webpage(self, video_id):
2108                 """Report webpage download."""
2109                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2110
2111         def report_extraction(self, video_id):
2112                 """Report information extraction."""
2113                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
2114
2115         def _real_extract(self, url, new_video=True):
2116                 # Extract ID from URL
2117                 mobj = re.match(self._VALID_URL, url)
2118                 if mobj is None:
2119                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2120                         return
2121
2122                 # At this point we have a new video
2123                 self._downloader.increment_downloads()
2124                 video_id = mobj.group(1)
2125
2126                 # Retrieve video webpage to extract further information
2127                 request = urllib2.Request(url, None, std_headers)
2128                 try:
2129                         self.report_download_webpage(video_id)
2130                         webpage = urllib2.urlopen(request).read()
2131                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2132                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2133                         return
2134
2135                 # Now we begin extracting as much information as we can from what we
2136                 # retrieved. First we extract the information common to all extractors,
2137                 # and latter we extract those that are Vimeo specific.
2138                 self.report_extraction(video_id)
2139
2140                 # Extract the config JSON
2141                 config = webpage.split(' = {config:')[1].split(',assets:')[0]
2142                 try:
2143                         config = json.loads(config)
2144                 except:
2145                         self._downloader.trouble(u'ERROR: unable to extract info section')
2146                         return
2147                 
2148                 # Extract title
2149                 video_title = config["video"]["title"]
2150                 simple_title = _simplify_title(video_title)
2151
2152                 # Extract uploader
2153                 video_uploader = config["video"]["owner"]["name"]
2154
2155                 # Extract video thumbnail
2156                 video_thumbnail = config["video"]["thumbnail"]
2157
2158                 # Extract video description
2159                 try:
2160                         lxml.etree
2161                 except NameError:
2162                         video_description = u'No description available.'
2163                         mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
2164                         if mobj is not None:
2165                                 video_description = mobj.group(1)
2166                 else:
2167                         html_parser = lxml.etree.HTMLParser()
2168                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
2169                         video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
2170                         # TODO use another parser
2171
2172                 # Extract upload date
2173                 video_upload_date = u'NA'
2174                 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
2175                 if mobj is not None:
2176                         video_upload_date = mobj.group(1)
2177
2178                 # Vimeo specific: extract request signature and timestamp
2179                 sig = config['request']['signature']
2180                 timestamp = config['request']['timestamp']
2181
2182                 # Vimeo specific: extract video codec and quality information
2183                 # TODO bind to format param
2184                 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
2185                 for codec in codecs:
2186                         if codec[0] in config["video"]["files"]:
2187                                 video_codec = codec[0]
2188                                 video_extension = codec[1]
2189                                 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
2190                                 else: quality = 'sd'
2191                                 break
2192                 else:
2193                         self._downloader.trouble(u'ERROR: no known codec found')
2194                         return
2195
2196                 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
2197                                         %(video_id, sig, timestamp, quality, video_codec.upper())
2198
2199                 try:
2200                         # Process video information
2201                         self._downloader.process_info({
2202                                 'id':           video_id,
2203                                 'url':          video_url,
2204                                 'uploader':     video_uploader,
2205                                 'upload_date':  video_upload_date,
2206                                 'title':        video_title,
2207                                 'stitle':       simple_title,
2208                                 'ext':          video_extension,
2209                                 'thumbnail':    video_thumbnail,
2210                                 'description':  video_description,
2211                                 'player_url':   None,
2212                         })
2213                 except UnavailableVideoError:
2214                         self._downloader.trouble(u'ERROR: unable to download video')
2215
2216
2217 class GenericIE(InfoExtractor):
2218         """Generic last-resort information extractor."""
2219
2220         _VALID_URL = r'.*'
2221         IE_NAME = u'generic'
2222
2223         def __init__(self, downloader=None):
2224                 InfoExtractor.__init__(self, downloader)
2225
2226         def report_download_webpage(self, video_id):
2227                 """Report webpage download."""
2228                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2229                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2230
2231         def report_extraction(self, video_id):
2232                 """Report information extraction."""
2233                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2234
2235         def _real_extract(self, url):
2236                 # At this point we have a new video
2237                 self._downloader.increment_downloads()
2238
2239                 video_id = url.split('/')[-1]
2240                 request = urllib2.Request(url)
2241                 try:
2242                         self.report_download_webpage(video_id)
2243                         webpage = urllib2.urlopen(request).read()
2244                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2245                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2246                         return
2247                 except ValueError, err:
2248                         # since this is the last-resort InfoExtractor, if
2249                         # this error is thrown, it'll be thrown here
2250                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2251                         return
2252
2253                 self.report_extraction(video_id)
2254                 # Start with something easy: JW Player in SWFObject
2255                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2256                 if mobj is None:
2257                         # Broaden the search a little bit
2258                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2259                 if mobj is None:
2260                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2261                         return
2262
2263                 # It's possible that one of the regexes
2264                 # matched, but returned an empty group:
2265                 if mobj.group(1) is None:
2266                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2267                         return
2268
2269                 video_url = urllib.unquote(mobj.group(1))
2270                 video_id = os.path.basename(video_url)
2271
2272                 # here's a fun little line of code for you:
2273                 video_extension = os.path.splitext(video_id)[1][1:]
2274                 video_id = os.path.splitext(video_id)[0]
2275
2276                 # it's tempting to parse this further, but you would
2277                 # have to take into account all the variations like
2278                 #   Video Title - Site Name
2279                 #   Site Name | Video Title
2280                 #   Video Title - Tagline | Site Name
2281                 # and so on and so forth; it's just not practical
2282                 mobj = re.search(r'<title>(.*)</title>', webpage)
2283                 if mobj is None:
2284                         self._downloader.trouble(u'ERROR: unable to extract title')
2285                         return
2286                 video_title = mobj.group(1).decode('utf-8')
2287                 video_title = sanitize_title(video_title)
2288                 simple_title = _simplify_title(video_title)
2289
2290                 # video uploader is domain name
2291                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2292                 if mobj is None:
2293                         self._downloader.trouble(u'ERROR: unable to extract title')
2294                         return
2295                 video_uploader = mobj.group(1).decode('utf-8')
2296
2297                 try:
2298                         # Process video information
2299                         self._downloader.process_info({
2300                                 'id':           video_id.decode('utf-8'),
2301                                 'url':          video_url.decode('utf-8'),
2302                                 'uploader':     video_uploader,
2303                                 'upload_date':  u'NA',
2304                                 'title':        video_title,
2305                                 'stitle':       simple_title,
2306                                 'ext':          video_extension.decode('utf-8'),
2307                                 'format':       u'NA',
2308                                 'player_url':   None,
2309                         })
2310                 except UnavailableVideoError, err:
2311                         self._downloader.trouble(u'\nERROR: unable to download video')
2312
2313
2314 class YoutubeSearchIE(InfoExtractor):
2315         """Information Extractor for YouTube search queries."""
2316         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2317         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2318         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2319         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2320         _youtube_ie = None
2321         _max_youtube_results = 1000
2322         IE_NAME = u'youtube:search'
2323
2324         def __init__(self, youtube_ie, downloader=None):
2325                 InfoExtractor.__init__(self, downloader)
2326                 self._youtube_ie = youtube_ie
2327
2328         def report_download_page(self, query, pagenum):
2329                 """Report attempt to download playlist page with given number."""
2330                 query = query.decode(preferredencoding())
2331                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2332
2333         def _real_initialize(self):
2334                 self._youtube_ie.initialize()
2335
2336         def _real_extract(self, query):
2337                 mobj = re.match(self._VALID_URL, query)
2338                 if mobj is None:
2339                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2340                         return
2341
2342                 prefix, query = query.split(':')
2343                 prefix = prefix[8:]
2344                 query = query.encode('utf-8')
2345                 if prefix == '':
2346                         self._download_n_results(query, 1)
2347                         return
2348                 elif prefix == 'all':
2349                         self._download_n_results(query, self._max_youtube_results)
2350                         return
2351                 else:
2352                         try:
2353                                 n = long(prefix)
2354                                 if n <= 0:
2355                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2356                                         return
2357                                 elif n > self._max_youtube_results:
2358                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2359                                         n = self._max_youtube_results
2360                                 self._download_n_results(query, n)
2361                                 return
2362                         except ValueError: # parsing prefix as integer fails
2363                                 self._download_n_results(query, 1)
2364                                 return
2365
2366         def _download_n_results(self, query, n):
2367                 """Downloads a specified number of results for a query"""
2368
2369                 video_ids = []
2370                 already_seen = set()
2371                 pagenum = 1
2372
2373                 while True:
2374                         self.report_download_page(query, pagenum)
2375                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2376                         request = urllib2.Request(result_url)
2377                         try:
2378                                 page = urllib2.urlopen(request).read()
2379                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2380                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2381                                 return
2382
2383                         # Extract video identifiers
2384                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2385                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2386                                 if video_id not in already_seen:
2387                                         video_ids.append(video_id)
2388                                         already_seen.add(video_id)
2389                                         if len(video_ids) == n:
2390                                                 # Specified n videos reached
2391                                                 for id in video_ids:
2392                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2393                                                 return
2394
2395                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2396                                 for id in video_ids:
2397                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2398                                 return
2399
2400                         pagenum = pagenum + 1
2401
2402
2403 class GoogleSearchIE(InfoExtractor):
2404         """Information Extractor for Google Video search queries."""
2405         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2406         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2407         _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
2408         _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
2409         _google_ie = None
2410         _max_google_results = 1000
2411         IE_NAME = u'video.google:search'
2412
2413         def __init__(self, google_ie, downloader=None):
2414                 InfoExtractor.__init__(self, downloader)
2415                 self._google_ie = google_ie
2416
2417         def report_download_page(self, query, pagenum):
2418                 """Report attempt to download playlist page with given number."""
2419                 query = query.decode(preferredencoding())
2420                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2421
2422         def _real_initialize(self):
2423                 self._google_ie.initialize()
2424
2425         def _real_extract(self, query):
2426                 mobj = re.match(self._VALID_URL, query)
2427                 if mobj is None:
2428                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2429                         return
2430
2431                 prefix, query = query.split(':')
2432                 prefix = prefix[8:]
2433                 query = query.encode('utf-8')
2434                 if prefix == '':
2435                         self._download_n_results(query, 1)
2436                         return
2437                 elif prefix == 'all':
2438                         self._download_n_results(query, self._max_google_results)
2439                         return
2440                 else:
2441                         try:
2442                                 n = long(prefix)
2443                                 if n <= 0:
2444                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2445                                         return
2446                                 elif n > self._max_google_results:
2447                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2448                                         n = self._max_google_results
2449                                 self._download_n_results(query, n)
2450                                 return
2451                         except ValueError: # parsing prefix as integer fails
2452                                 self._download_n_results(query, 1)
2453                                 return
2454
2455         def _download_n_results(self, query, n):
2456                 """Downloads a specified number of results for a query"""
2457
2458                 video_ids = []
2459                 pagenum = 0
2460
2461                 while True:
2462                         self.report_download_page(query, pagenum)
2463                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
2464                         request = urllib2.Request(result_url)
2465                         try:
2466                                 page = urllib2.urlopen(request).read()
2467                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2468                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2469                                 return
2470
2471                         # Extract video identifiers
2472                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2473                                 video_id = mobj.group(1)
2474                                 if video_id not in video_ids:
2475                                         video_ids.append(video_id)
2476                                         if len(video_ids) == n:
2477                                                 # Specified n videos reached
2478                                                 for id in video_ids:
2479                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2480                                                 return
2481
2482                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2483                                 for id in video_ids:
2484                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2485                                 return
2486
2487                         pagenum = pagenum + 1
2488
2489
2490 class YahooSearchIE(InfoExtractor):
2491         """Information Extractor for Yahoo! Video search queries."""
2492         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2493         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2494         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2495         _MORE_PAGES_INDICATOR = r'\s*Next'
2496         _yahoo_ie = None
2497         _max_yahoo_results = 1000
2498         IE_NAME = u'video.yahoo:search'
2499
2500         def __init__(self, yahoo_ie, downloader=None):
2501                 InfoExtractor.__init__(self, downloader)
2502                 self._yahoo_ie = yahoo_ie
2503
2504         def report_download_page(self, query, pagenum):
2505                 """Report attempt to download playlist page with given number."""
2506                 query = query.decode(preferredencoding())
2507                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2508
2509         def _real_initialize(self):
2510                 self._yahoo_ie.initialize()
2511
2512         def _real_extract(self, query):
2513                 mobj = re.match(self._VALID_URL, query)
2514                 if mobj is None:
2515                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2516                         return
2517
2518                 prefix, query = query.split(':')
2519                 prefix = prefix[8:]
2520                 query = query.encode('utf-8')
2521                 if prefix == '':
2522                         self._download_n_results(query, 1)
2523                         return
2524                 elif prefix == 'all':
2525                         self._download_n_results(query, self._max_yahoo_results)
2526                         return
2527                 else:
2528                         try:
2529                                 n = long(prefix)
2530                                 if n <= 0:
2531                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2532                                         return
2533                                 elif n > self._max_yahoo_results:
2534                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2535                                         n = self._max_yahoo_results
2536                                 self._download_n_results(query, n)
2537                                 return
2538                         except ValueError: # parsing prefix as integer fails
2539                                 self._download_n_results(query, 1)
2540                                 return
2541
2542         def _download_n_results(self, query, n):
2543                 """Downloads a specified number of results for a query"""
2544
2545                 video_ids = []
2546                 already_seen = set()
2547                 pagenum = 1
2548
2549                 while True:
2550                         self.report_download_page(query, pagenum)
2551                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2552                         request = urllib2.Request(result_url)
2553                         try:
2554                                 page = urllib2.urlopen(request).read()
2555                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2556                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2557                                 return
2558
2559                         # Extract video identifiers
2560                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2561                                 video_id = mobj.group(1)
2562                                 if video_id not in already_seen:
2563                                         video_ids.append(video_id)
2564                                         already_seen.add(video_id)
2565                                         if len(video_ids) == n:
2566                                                 # Specified n videos reached
2567                                                 for id in video_ids:
2568                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2569                                                 return
2570
2571                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2572                                 for id in video_ids:
2573                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2574                                 return
2575
2576                         pagenum = pagenum + 1
2577
2578
2579 class YoutubePlaylistIE(InfoExtractor):
2580         """Information Extractor for YouTube playlists."""
2581
2582         _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2583         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2584         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2585         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2586         _youtube_ie = None
2587         IE_NAME = u'youtube:playlist'
2588
2589         def __init__(self, youtube_ie, downloader=None):
2590                 InfoExtractor.__init__(self, downloader)
2591                 self._youtube_ie = youtube_ie
2592
2593         def report_download_page(self, playlist_id, pagenum):
2594                 """Report attempt to download playlist page with given number."""
2595                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2596
2597         def _real_initialize(self):
2598                 self._youtube_ie.initialize()
2599
2600         def _real_extract(self, url):
2601                 # Extract playlist id
2602                 mobj = re.match(self._VALID_URL, url)
2603                 if mobj is None:
2604                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2605                         return
2606
2607                 # Single video case
2608                 if mobj.group(3) is not None:
2609                         self._youtube_ie.extract(mobj.group(3))
2610                         return
2611
2612                 # Download playlist pages
2613                 # prefix is 'p' as default for playlists but there are other types that need extra care
2614                 playlist_prefix = mobj.group(1)
2615                 if playlist_prefix == 'a':
2616                         playlist_access = 'artist'
2617                 else:
2618                         playlist_prefix = 'p'
2619                         playlist_access = 'view_play_list'
2620                 playlist_id = mobj.group(2)
2621                 video_ids = []
2622                 pagenum = 1
2623
2624                 while True:
2625                         self.report_download_page(playlist_id, pagenum)
2626                         url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2627                         request = urllib2.Request(url)
2628                         try:
2629                                 page = urllib2.urlopen(request).read()
2630                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2631                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2632                                 return
2633
2634                         # Extract video identifiers
2635                         ids_in_page = []
2636                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2637                                 if mobj.group(1) not in ids_in_page:
2638                                         ids_in_page.append(mobj.group(1))
2639                         video_ids.extend(ids_in_page)
2640
2641                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2642                                 break
2643                         pagenum = pagenum + 1
2644
2645                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2646                 playlistend = self._downloader.params.get('playlistend', -1)
2647                 video_ids = video_ids[playliststart:playlistend]
2648
2649                 for id in video_ids:
2650                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2651                 return
2652
2653
2654 class YoutubeUserIE(InfoExtractor):
2655         """Information Extractor for YouTube users."""
2656
2657         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2658         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2659         _GDATA_PAGE_SIZE = 50
2660         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2661         _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2662         _youtube_ie = None
2663         IE_NAME = u'youtube:user'
2664
2665         def __init__(self, youtube_ie, downloader=None):
2666                 InfoExtractor.__init__(self, downloader)
2667                 self._youtube_ie = youtube_ie
2668
2669         def report_download_page(self, username, start_index):
2670                 """Report attempt to download user page."""
2671                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2672                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2673
2674         def _real_initialize(self):
2675                 self._youtube_ie.initialize()
2676
2677         def _real_extract(self, url):
2678                 # Extract username
2679                 mobj = re.match(self._VALID_URL, url)
2680                 if mobj is None:
2681                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2682                         return
2683
2684                 username = mobj.group(1)
2685
2686                 # Download video ids using YouTube Data API. Result size per
2687                 # query is limited (currently to 50 videos) so we need to query
2688                 # page by page until there are no video ids - it means we got
2689                 # all of them.
2690
2691                 video_ids = []
2692                 pagenum = 0
2693
2694                 while True:
2695                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2696                         self.report_download_page(username, start_index)
2697
2698                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2699
2700                         try:
2701                                 page = urllib2.urlopen(request).read()
2702                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2703                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2704                                 return
2705
2706                         # Extract video identifiers
2707                         ids_in_page = []
2708
2709                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2710                                 if mobj.group(1) not in ids_in_page:
2711                                         ids_in_page.append(mobj.group(1))
2712
2713                         video_ids.extend(ids_in_page)
2714
2715                         # A little optimization - if current page is not
2716                         # "full", ie. does not contain PAGE_SIZE video ids then
2717                         # we can assume that this page is the last one - there
2718                         # are no more ids on further pages - no need to query
2719                         # again.
2720
2721                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2722                                 break
2723
2724                         pagenum += 1
2725
2726                 all_ids_count = len(video_ids)
2727                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2728                 playlistend = self._downloader.params.get('playlistend', -1)
2729
2730                 if playlistend == -1:
2731                         video_ids = video_ids[playliststart:]
2732                 else:
2733                         video_ids = video_ids[playliststart:playlistend]
2734
2735                 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2736                                 (username, all_ids_count, len(video_ids)))
2737
2738                 for video_id in video_ids:
2739                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2740
2741
2742 class DepositFilesIE(InfoExtractor):
2743         """Information extractor for depositfiles.com"""
2744
2745         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2746         IE_NAME = u'DepositFiles'
2747
2748         def __init__(self, downloader=None):
2749                 InfoExtractor.__init__(self, downloader)
2750
2751         def report_download_webpage(self, file_id):
2752                 """Report webpage download."""
2753                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2754
2755         def report_extraction(self, file_id):
2756                 """Report information extraction."""
2757                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2758
2759         def _real_extract(self, url):
2760                 # At this point we have a new file
2761                 self._downloader.increment_downloads()
2762
2763                 file_id = url.split('/')[-1]
2764                 # Rebuild url in english locale
2765                 url = 'http://depositfiles.com/en/files/' + file_id
2766
2767                 # Retrieve file webpage with 'Free download' button pressed
2768                 free_download_indication = { 'gateway_result' : '1' }
2769                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2770                 try:
2771                         self.report_download_webpage(file_id)
2772                         webpage = urllib2.urlopen(request).read()
2773                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2774                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2775                         return
2776
2777                 # Search for the real file URL
2778                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2779                 if (mobj is None) or (mobj.group(1) is None):
2780                         # Try to figure out reason of the error.
2781                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2782                         if (mobj is not None) and (mobj.group(1) is not None):
2783                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2784                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2785                         else:
2786                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2787                         return
2788
2789                 file_url = mobj.group(1)
2790                 file_extension = os.path.splitext(file_url)[1][1:]
2791
2792                 # Search for file title
2793                 mobj = re.search(r'<b title="(.*?)">', webpage)
2794                 if mobj is None:
2795                         self._downloader.trouble(u'ERROR: unable to extract title')
2796                         return
2797                 file_title = mobj.group(1).decode('utf-8')
2798
2799                 try:
2800                         # Process file information
2801                         self._downloader.process_info({
2802                                 'id':           file_id.decode('utf-8'),
2803                                 'url':          file_url.decode('utf-8'),
2804                                 'uploader':     u'NA',
2805                                 'upload_date':  u'NA',
2806                                 'title':        file_title,
2807                                 'stitle':       file_title,
2808                                 'ext':          file_extension.decode('utf-8'),
2809                                 'format':       u'NA',
2810                                 'player_url':   None,
2811                         })
2812                 except UnavailableVideoError, err:
2813                         self._downloader.trouble(u'ERROR: unable to download file')
2814
2815
class FacebookIE(InfoExtractor):
	"""Information Extractor for Facebook"""

	# Matches facebook.com/video/video.php?v=<id> and photo.php?v=<id>;
	# the numeric id is captured in the named group 'ID'.
	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
	# Mobile login endpoint POSTed to by _real_initialize().
	_LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
	# Machine name looked up in ~/.netrc for stored credentials.
	_NETRC_MACHINE = 'facebook'
	# Known format identifiers, best quality first.
	_available_formats = ['video', 'highqual', 'lowqual']
	# Container extension for each format identifier.
	_video_extensions = {
		'video': 'mp4',
		'highqual': 'mp4',
		'lowqual': 'mp4',
	}
	IE_NAME = u'facebook'
2829
	def __init__(self, downloader=None):
		"""Initialize the extractor, delegating to the InfoExtractor base."""
		InfoExtractor.__init__(self, downloader)
2832
2833         def _reporter(self, message):
2834                 """Add header and report message."""
2835                 self._downloader.to_screen(u'[facebook] %s' % message)
2836
	def report_login(self):
		"""Report that a Facebook login attempt is starting."""
		self._reporter(u'Logging in')
2840
	def report_video_webpage_download(self, video_id):
		"""Report attempt to download the webpage of video *video_id*."""
		self._reporter(u'%s: Downloading video webpage' % video_id)
2844
	def report_information_extraction(self, video_id):
		"""Report attempt to extract information for video *video_id*."""
		self._reporter(u'%s: Extracting video information' % video_id)
2848
2849         def _parse_page(self, video_webpage):
2850                 """Extract video information from page"""
2851                 # General data
2852                 data = {'title': r'\("video_title", "(.*?)"\)',
2853                         'description': r'<div class="datawrap">(.*?)</div>',
2854                         'owner': r'\("video_owner_name", "(.*?)"\)',
2855                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2856                         }
2857                 video_info = {}
2858                 for piece in data.keys():
2859                         mobj = re.search(data[piece], video_webpage)
2860                         if mobj is not None:
2861                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2862
2863                 # Video urls
2864                 video_urls = {}
2865                 for fmt in self._available_formats:
2866                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2867                         if mobj is not None:
2868                                 # URL is in a Javascript segment inside an escaped Unicode format within
2869                                 # the generally utf-8 page
2870                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2871                 video_info['video_urls'] = video_urls
2872
2873                 return video_info
2874
2875         def _real_initialize(self):
2876                 if self._downloader is None:
2877                         return
2878
2879                 useremail = None
2880                 password = None
2881                 downloader_params = self._downloader.params
2882
2883                 # Attempt to use provided username and password or .netrc data
2884                 if downloader_params.get('username', None) is not None:
2885                         useremail = downloader_params['username']
2886                         password = downloader_params['password']
2887                 elif downloader_params.get('usenetrc', False):
2888                         try:
2889                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2890                                 if info is not None:
2891                                         useremail = info[0]
2892                                         password = info[2]
2893                                 else:
2894                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2895                         except (IOError, netrc.NetrcParseError), err:
2896                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2897                                 return
2898
2899                 if useremail is None:
2900                         return
2901
2902                 # Log in
2903                 login_form = {
2904                         'email': useremail,
2905                         'pass': password,
2906                         'login': 'Log+In'
2907                         }
2908                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2909                 try:
2910                         self.report_login()
2911                         login_results = urllib2.urlopen(request).read()
2912                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2913                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2914                                 return
2915                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2916                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2917                         return
2918
2919         def _real_extract(self, url):
2920                 mobj = re.match(self._VALID_URL, url)
2921                 if mobj is None:
2922                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2923                         return
2924                 video_id = mobj.group('ID')
2925
2926                 # Get video webpage
2927                 self.report_video_webpage_download(video_id)
2928                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2929                 try:
2930                         page = urllib2.urlopen(request)
2931                         video_webpage = page.read()
2932                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2933                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2934                         return
2935
2936                 # Start extracting information
2937                 self.report_information_extraction(video_id)
2938
2939                 # Extract information
2940                 video_info = self._parse_page(video_webpage)
2941
2942                 # uploader
2943                 if 'owner' not in video_info:
2944                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2945                         return
2946                 video_uploader = video_info['owner']
2947
2948                 # title
2949                 if 'title' not in video_info:
2950                         self._downloader.trouble(u'ERROR: unable to extract video title')
2951                         return
2952                 video_title = video_info['title']
2953                 video_title = video_title.decode('utf-8')
2954                 video_title = sanitize_title(video_title)
2955
2956                 simple_title = _simplify_title(video_title)
2957
2958                 # thumbnail image
2959                 if 'thumbnail' not in video_info:
2960                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2961                         video_thumbnail = ''
2962                 else:
2963                         video_thumbnail = video_info['thumbnail']
2964
2965                 # upload date
2966                 upload_date = u'NA'
2967                 if 'upload_date' in video_info:
2968                         upload_time = video_info['upload_date']
2969                         timetuple = email.utils.parsedate_tz(upload_time)
2970                         if timetuple is not None:
2971                                 try:
2972                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2973                                 except:
2974                                         pass
2975
2976                 # description
2977                 video_description = video_info.get('description', 'No description available.')
2978
2979                 url_map = video_info['video_urls']
2980                 if len(url_map.keys()) > 0:
2981                         # Decide which formats to download
2982                         req_format = self._downloader.params.get('format', None)
2983                         format_limit = self._downloader.params.get('format_limit', None)
2984
2985                         if format_limit is not None and format_limit in self._available_formats:
2986                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2987                         else:
2988                                 format_list = self._available_formats
2989                         existing_formats = [x for x in format_list if x in url_map]
2990                         if len(existing_formats) == 0:
2991                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2992                                 return
2993                         if req_format is None:
2994                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2995                         elif req_format == 'worst':
2996                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2997                         elif req_format == '-1':
2998                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2999                         else:
3000                                 # Specific format
3001                                 if req_format not in url_map:
3002                                         self._downloader.trouble(u'ERROR: requested format not available')
3003                                         return
3004                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
3005
3006                 for format_param, video_real_url in video_url_list:
3007
3008                         # At this point we have a new video
3009                         self._downloader.increment_downloads()
3010
3011                         # Extension
3012                         video_extension = self._video_extensions.get(format_param, 'mp4')
3013
3014                         try:
3015                                 # Process video information
3016                                 self._downloader.process_info({
3017                                         'id':           video_id.decode('utf-8'),
3018                                         'url':          video_real_url.decode('utf-8'),
3019                                         'uploader':     video_uploader.decode('utf-8'),
3020                                         'upload_date':  upload_date,
3021                                         'title':        video_title,
3022                                         'stitle':       simple_title,
3023                                         'ext':          video_extension.decode('utf-8'),
3024                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
3025                                         'thumbnail':    video_thumbnail.decode('utf-8'),
3026                                         'description':  video_description.decode('utf-8'),
3027                                         'player_url':   None,
3028                                 })
3029                         except UnavailableVideoError, err:
3030                                 self._downloader.trouble(u'\nERROR: unable to download video')
3031
class BlipTVIE(InfoExtractor):
	"""Information extractor for blip.tv"""

	# Any blip.tv URL; group(1) captures the path portion after the host.
	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
	# Pulls the filename extension off the end of a media URL.
	_URL_EXT = r'^.*\.([a-z0-9]+)$'
	IE_NAME = u'blip.tv'

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

	def report_direct_download(self, title):
		"""Report that the URL answered with the media file itself."""
		self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

	def _real_extract(self, url):
		"""Extract video information from a blip.tv URL.

		Handles two cases: the request may answer with the media file
		directly (detected via the Content-Type header), or with a page
		whose JSON variant (skin=json) describes the video.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Append the query parameters that request blip.tv's JSON skin.
		if '?' in url:
			cchar = '&'
		else:
			cchar = '?'
		json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
		request = urllib2.Request(json_url)
		self.report_extraction(mobj.group(1))
		# info stays None unless the direct-download branch fills it in;
		# the regular (JSON) path below keys off that.
		info = None
		try:
			urlh = urllib2.urlopen(request)
			if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
				basename = url.split('/')[-1]
				title,ext = os.path.splitext(basename)
				title = title.decode('UTF-8')
				ext = ext.replace('.', '')
				self.report_direct_download(title)
				info = {
					'id': title,
					'url': url,
					'title': title,
					'stitle': _simplify_title(title),
					'ext': ext,
					'urlhandle': urlh  # hand the open connection to the downloader; avoids a second request
				}
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
			return
		if info is None: # Regular URL
			try:
				# Reuse the handle opened above; it holds the JSON body.
				json_code = urlh.read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
				return

			try:
				json_data = json.loads(json_code)
				# Some responses wrap the payload in a 'Post' object.
				if 'Post' in json_data:
					data = json_data['Post']
				else:
					data = json_data

				# Normalize the site's datestamp to YYYYMMDD.
				# NOTE(review): '%H:%M%p' mixes 24-hour %H with %p —
				# presumably %I was intended; confirm against real data.
				upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
				video_url = data['media']['url']
				umobj = re.match(self._URL_EXT, video_url)
				if umobj is None:
					raise ValueError('Can not determine filename extension')
				ext = umobj.group(1)

				info = {
					'id': data['item_id'],
					'url': video_url,
					'uploader': data['display_name'],
					'upload_date': upload_date,
					'title': data['title'],
					'stitle': _simplify_title(data['title']),
					'ext': ext,
					'format': data['media']['mimeType'],
					'thumbnail': data['thumbnailUrl'],
					'description': data['description'],
					'player_url': data['embedUrl']
				}
			except (ValueError,KeyError), err:
				# Missing keys and the extension ValueError above both land here.
				self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
				return

		self._downloader.increment_downloads()

		try:
			self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download video')
3124
3125
3126 class MyVideoIE(InfoExtractor):
3127         """Information Extractor for myvideo.de."""
3128
3129         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3130         IE_NAME = u'myvideo'
3131
3132         def __init__(self, downloader=None):
3133                 InfoExtractor.__init__(self, downloader)
3134         
3135         def report_download_webpage(self, video_id):
3136                 """Report webpage download."""
3137                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3138
3139         def report_extraction(self, video_id):
3140                 """Report information extraction."""
3141                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3142
3143         def _real_extract(self,url):
3144                 mobj = re.match(self._VALID_URL, url)
3145                 if mobj is None:
3146                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
3147                         return
3148
3149                 video_id = mobj.group(1)
3150
3151                 # Get video webpage
3152                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3153                 try:
3154                         self.report_download_webpage(video_id)
3155                         webpage = urllib2.urlopen(request).read()
3156                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3157                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3158                         return
3159
3160                 self.report_extraction(video_id)
3161                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3162                                  webpage)
3163                 if mobj is None:
3164                         self._downloader.trouble(u'ERROR: unable to extract media URL')
3165                         return
3166                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3167
3168                 mobj = re.search('<title>([^<]+)</title>', webpage)
3169                 if mobj is None:
3170                         self._downloader.trouble(u'ERROR: unable to extract title')
3171                         return
3172
3173                 video_title = mobj.group(1)
3174                 video_title = sanitize_title(video_title)
3175
3176                 simple_title = _simplify_title(video_title)
3177
3178                 try:
3179                         self._downloader.process_info({
3180                                 'id':           video_id,
3181                                 'url':          video_url,
3182                                 'uploader':     u'NA',
3183                                 'upload_date':  u'NA',
3184                                 'title':        video_title,
3185                                 'stitle':       simple_title,
3186                                 'ext':          u'flv',
3187                                 'format':       u'NA',
3188                                 'player_url':   None,
3189                         })
3190                 except UnavailableVideoError:
3191                         self._downloader.trouble(u'\nERROR: Unable to download video')
3192
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Accepts either a ':shortname' alias (e.g. ':tds', ':colbert') or a
	# full-episodes URL on thedailyshow.com / colbertnation.com.
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	def report_extraction(self, episode_id):
		"""Report the start of information extraction."""
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

	def report_config_download(self, episode_id):
		"""Report the download of a per-item media configuration."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		"""Report the download of the episode's MRSS index."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		"""Report the resolution of the Flash player URL."""
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _real_extract(self, url):
		"""Download every video item belonging to one full episode.

		Pipeline: resolve shortname aliases, follow the site redirect to a
		concrete episode URL if needed, locate the Flash player URL in the
		page, fetch the MRSS index of media items, then fetch each item's
		configuration XML and download its highest-bitrate rendition.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Shortname aliases map to the show's full-episodes landing page.
		if mobj.group('shortname'):
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = u'http://www.thedailyshow.com/full-episodes/'
			else:
				url = u'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# An empty episode part means "download the newest episode"; the
		# site redirects the landing page to it.
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			return
		if dlNewest:
			# Re-parse the post-redirect URL to learn the episode title.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		# The Flash player URL appears either as a <param name="movie"> or a
		# 'var url = "..."' assignment; group 1 is the URL, group 2 the uri.
		mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
			return

		# Follow redirects to obtain the final player URL (needed for rtmp).
		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
			return

		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
			return

		# One <item> per video segment of the episode.
		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			# guid looks like '...:<show>.com:...:<shortid>' — split it up.
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
						urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
				return

			# Collect (bitrate, url) pairs for every available rendition.
			cdoc = xml.etree.ElementTree.fromstring(configXml)
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			# For now, just pick the highest bitrate
			format,video_url = turls[-1]

			self._downloader.increment_downloads()

			effTitle = showId + u'-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'stitle': _simplify_title(effTitle),
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': playerUrl
			}

			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				# A failed segment should not abort the remaining ones.
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
				continue
3327
3328
3329 class EscapistIE(InfoExtractor):
3330         """Information extractor for The Escapist """
3331
3332         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3333         IE_NAME = u'escapist'
3334
3335         def report_extraction(self, showName):
3336                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3337
3338         def report_config_download(self, showName):
3339                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3340
3341         def _real_extract(self, url):
3342                 htmlParser = HTMLParser.HTMLParser()
3343
3344                 mobj = re.match(self._VALID_URL, url)
3345                 if mobj is None:
3346                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3347                         return
3348                 showName = mobj.group('showname')
3349                 videoId = mobj.group('episode')
3350
3351                 self.report_extraction(showName)
3352                 try:
3353                         webPage = urllib2.urlopen(url).read()
3354                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3355                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3356                         return
3357
3358                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3359                 description = htmlParser.unescape(descMatch.group(1))
3360                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3361                 imgUrl = htmlParser.unescape(imgMatch.group(1))
3362                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3363                 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3364                 configUrlMatch = re.search('config=(.*)$', playerUrl)
3365                 configUrl = urllib2.unquote(configUrlMatch.group(1))
3366
3367                 self.report_config_download(showName)
3368                 try:
3369                         configJSON = urllib2.urlopen(configUrl).read()
3370                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3371                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3372                         return
3373
3374                 # Technically, it's JavaScript, not JSON
3375                 configJSON = configJSON.replace("'", '"')
3376
3377                 try:
3378                         config = json.loads(configJSON)
3379                 except (ValueError,), err:
3380                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3381                         return
3382
3383                 playlist = config['playlist']
3384                 videoUrl = playlist[1]['url']
3385
3386                 self._downloader.increment_downloads()
3387                 info = {
3388                         'id': videoId,
3389                         'url': videoUrl,
3390                         'uploader': showName,
3391                         'upload_date': None,
3392                         'title': showName,
3393                         'stitle': _simplify_title(showName),
3394                         'ext': 'flv',
3395                         'format': 'flv',
3396                         'thumbnail': imgUrl,
3397                         'description': description,
3398                         'player_url': playerUrl,
3399                 }
3400
3401                 try:
3402                         self._downloader.process_info(info)
3403                 except UnavailableVideoError, err:
3404                         self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3405
3406
3407 class CollegeHumorIE(InfoExtractor):
3408         """Information extractor for collegehumor.com"""
3409
3410         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3411         IE_NAME = u'collegehumor'
3412
3413         def report_webpage(self, video_id):
3414                 """Report information extraction."""
3415                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3416
3417         def report_extraction(self, video_id):
3418                 """Report information extraction."""
3419                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3420
3421         def _real_extract(self, url):
3422                 htmlParser = HTMLParser.HTMLParser()
3423
3424                 mobj = re.match(self._VALID_URL, url)
3425                 if mobj is None:
3426                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3427                         return
3428                 video_id = mobj.group('videoid')
3429
3430                 self.report_webpage(video_id)
3431                 request = urllib2.Request(url)
3432                 try:
3433                         webpage = urllib2.urlopen(request).read()
3434                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3435                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3436                         return
3437
3438                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3439                 if m is None:
3440                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3441                         return
3442                 internal_video_id = m.group('internalvideoid')
3443
3444                 info = {
3445                         'id': video_id,
3446                         'internal_id': internal_video_id,
3447                 }
3448
3449                 self.report_extraction(video_id)
3450                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3451                 try:
3452                         metaXml = urllib2.urlopen(xmlUrl).read()
3453                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3454                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3455                         return
3456
3457                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3458                 try:
3459                         videoNode = mdoc.findall('./video')[0]
3460                         info['description'] = videoNode.findall('./description')[0].text
3461                         info['title'] = videoNode.findall('./caption')[0].text
3462                         info['stitle'] = _simplify_title(info['title'])
3463                         info['url'] = videoNode.findall('./file')[0].text
3464                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3465                         info['ext'] = info['url'].rpartition('.')[2]
3466                         info['format'] = info['ext']
3467                 except IndexError:
3468                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3469                         return
3470
3471                 self._downloader.increment_downloads()
3472
3473                 try:
3474                         self._downloader.process_info(info)
3475                 except UnavailableVideoError, err:
3476                         self._downloader.trouble(u'\nERROR: unable to download video')
3477
3478
3479 class XVideosIE(InfoExtractor):
3480         """Information extractor for xvideos.com"""
3481
3482         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3483         IE_NAME = u'xvideos'
3484
3485         def report_webpage(self, video_id):
3486                 """Report information extraction."""
3487                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3488
3489         def report_extraction(self, video_id):
3490                 """Report information extraction."""
3491                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3492
3493         def _real_extract(self, url):
3494                 htmlParser = HTMLParser.HTMLParser()
3495
3496                 mobj = re.match(self._VALID_URL, url)
3497                 if mobj is None:
3498                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3499                         return
3500                 video_id = mobj.group(1).decode('utf-8')
3501
3502                 self.report_webpage(video_id)
3503
3504                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3505                 try:
3506                         webpage = urllib2.urlopen(request).read()
3507                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3508                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3509                         return
3510
3511                 self.report_extraction(video_id)
3512
3513
3514                 # Extract video URL
3515                 mobj = re.search(r'flv_url=(.+?)&', webpage)
3516                 if mobj is None:
3517                         self._downloader.trouble(u'ERROR: unable to extract video url')
3518                         return
3519                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3520
3521
3522                 # Extract title
3523                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3524                 if mobj is None:
3525                         self._downloader.trouble(u'ERROR: unable to extract video title')
3526                         return
3527                 video_title = mobj.group(1).decode('utf-8')
3528
3529
3530                 # Extract video thumbnail
3531                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3532                 if mobj is None:
3533                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3534                         return
3535                 video_thumbnail = mobj.group(1).decode('utf-8')
3536
3537
3538
3539                 self._downloader.increment_downloads()
3540                 info = {
3541                         'id': video_id,
3542                         'url': video_url,
3543                         'uploader': None,
3544                         'upload_date': None,
3545                         'title': video_title,
3546                         'stitle': _simplify_title(video_title),
3547                         'ext': 'flv',
3548                         'format': 'flv',
3549                         'thumbnail': video_thumbnail,
3550                         'description': None,
3551                         'player_url': None,
3552                 }
3553
3554                 try:
3555                         self._downloader.process_info(info)
3556                 except UnavailableVideoError, err:
3557                         self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3558
3559
3560 class SoundcloudIE(InfoExtractor):
3561         """Information extractor for soundcloud.com
3562            To access the media, the uid of the song and a stream token
3563            must be extracted from the page source and the script must make
3564            a request to media.soundcloud.com/crossdomain.xml. Then
3565            the media can be grabbed by requesting from an url composed
3566            of the stream token and uid
3567          """
3568
3569         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3570         IE_NAME = u'soundcloud'
3571
3572         def __init__(self, downloader=None):
3573                 InfoExtractor.__init__(self, downloader)
3574
3575         def report_webpage(self, video_id):
3576                 """Report information extraction."""
3577                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3578
3579         def report_extraction(self, video_id):
3580                 """Report information extraction."""
3581                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3582
3583         def _real_extract(self, url):
3584                 htmlParser = HTMLParser.HTMLParser()
3585
3586                 mobj = re.match(self._VALID_URL, url)
3587                 if mobj is None:
3588                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3589                         return
3590
3591                 # extract uploader (which is in the url)
3592                 uploader = mobj.group(1).decode('utf-8')
3593                 # extract simple title (uploader + slug of song title)
3594                 slug_title =  mobj.group(2).decode('utf-8')
3595                 simple_title = uploader + '-' + slug_title
3596
3597                 self.report_webpage('%s/%s' % (uploader, slug_title))
3598
3599                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3600                 try:
3601                         webpage = urllib2.urlopen(request).read()
3602                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3603                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3604                         return
3605
3606                 self.report_extraction('%s/%s' % (uploader, slug_title))
3607
3608                 # extract uid and stream token that soundcloud hands out for access
3609                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3610                 if mobj:
3611                         video_id = mobj.group(1)
3612                         stream_token = mobj.group(2)
3613
3614                 # extract unsimplified title
3615                 mobj = re.search('"title":"(.*?)",', webpage)
3616                 if mobj:
3617                         title = mobj.group(1)
3618
3619                 # construct media url (with uid/token)
3620                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3621                 mediaURL = mediaURL % (video_id, stream_token)
3622
3623                 # description
3624                 description = u'No description available'
3625                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3626                 if mobj:
3627                         description = mobj.group(1)
3628                 
3629                 # upload date
3630                 upload_date = None
3631                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3632                 if mobj:
3633                         try:
3634                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3635                         except Exception, e:
3636                                 print str(e)
3637
3638                 # for soundcloud, a request to a cross domain is required for cookies
3639                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3640
3641                 try:
3642                         self._downloader.process_info({
3643                                 'id':           video_id.decode('utf-8'),
3644                                 'url':          mediaURL,
3645                                 'uploader':     uploader.decode('utf-8'),
3646                                 'upload_date':  upload_date,
3647                                 'title':        simple_title.decode('utf-8'),
3648                                 'stitle':       simple_title.decode('utf-8'),
3649                                 'ext':          u'mp3',
3650                                 'format':       u'NA',
3651                                 'player_url':   None,
3652                                 'description': description.decode('utf-8')
3653                         })
3654                 except UnavailableVideoError:
3655                         self._downloader.trouble(u'\nERROR: unable to download video')
3656
3657
3658 class InfoQIE(InfoExtractor):
3659         """Information extractor for infoq.com"""
3660
3661         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3662         IE_NAME = u'infoq'
3663
3664         def report_webpage(self, video_id):
3665                 """Report information extraction."""
3666                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3667
3668         def report_extraction(self, video_id):
3669                 """Report information extraction."""
3670                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3671
3672         def _real_extract(self, url):
3673                 htmlParser = HTMLParser.HTMLParser()
3674
3675                 mobj = re.match(self._VALID_URL, url)
3676                 if mobj is None:
3677                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3678                         return
3679
3680                 self.report_webpage(url)
3681
3682                 request = urllib2.Request(url)
3683                 try:
3684                         webpage = urllib2.urlopen(request).read()
3685                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3686                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3687                         return
3688
3689                 self.report_extraction(url)
3690
3691
3692                 # Extract video URL
3693                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3694                 if mobj is None:
3695                         self._downloader.trouble(u'ERROR: unable to extract video url')
3696                         return
3697                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3698
3699
3700                 # Extract title
3701                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3702                 if mobj is None:
3703                         self._downloader.trouble(u'ERROR: unable to extract video title')
3704                         return
3705                 video_title = mobj.group(1).decode('utf-8')
3706
3707                 # Extract description
3708                 video_description = u'No description available.'
3709                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3710                 if mobj is not None:
3711                         video_description = mobj.group(1).decode('utf-8')
3712
3713                 video_filename = video_url.split('/')[-1]
3714                 video_id, extension = video_filename.split('.')
3715
3716                 self._downloader.increment_downloads()
3717                 info = {
3718                         'id': video_id,
3719                         'url': video_url,
3720                         'uploader': None,
3721                         'upload_date': None,
3722                         'title': video_title,
3723                         'stitle': _simplify_title(video_title),
3724                         'ext': extension,
3725                         'format': extension, # Extension is always(?) mp4, but seems to be flv
3726                         'thumbnail': None,
3727                         'description': video_description,
3728                         'player_url': None,
3729                 }
3730
3731                 try:
3732                         self._downloader.process_info(info)
3733                 except UnavailableVideoError, err:
3734                         self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3735
3736 class MixcloudIE(InfoExtractor):
3737         """Information extractor for www.mixcloud.com"""
3738         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3739         IE_NAME = u'mixcloud'
3740
3741         def __init__(self, downloader=None):
3742                 InfoExtractor.__init__(self, downloader)
3743
3744         def report_download_json(self, file_id):
3745                 """Report JSON download."""
3746                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3747
3748         def report_extraction(self, file_id):
3749                 """Report information extraction."""
3750                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3751
3752         def get_urls(self, jsonData, fmt, bitrate='best'):
3753                 """Get urls from 'audio_formats' section in json"""
3754                 file_url = None
3755                 try:
3756                         bitrate_list = jsonData[fmt]
3757                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3758                                 bitrate = max(bitrate_list) # select highest
3759
3760                         url_list = jsonData[fmt][bitrate]
3761                 except TypeError: # we have no bitrate info.
3762                         url_list = jsonData[fmt]
3763                                 
3764                 return url_list
3765
3766         def check_urls(self, url_list):
3767                 """Returns 1st active url from list"""
3768                 for url in url_list:
3769                         try:
3770                                 urllib2.urlopen(url)
3771                                 return url
3772                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3773                                 url = None
3774
3775                 return None
3776
3777         def _print_formats(self, formats):
3778                 print 'Available formats:'
3779                 for fmt in formats.keys():
3780                         for b in formats[fmt]:
3781                                 try:
3782                                         ext = formats[fmt][b][0]
3783                                         print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3784                                 except TypeError: # we have no bitrate info
3785                                         ext = formats[fmt][0]
3786                                         print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3787                                         break
3788
3789         def _real_extract(self, url):
3790                 mobj = re.match(self._VALID_URL, url)
3791                 if mobj is None:
3792                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3793                         return
3794                 # extract uploader & filename from url
3795                 uploader = mobj.group(1).decode('utf-8')
3796                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3797
3798                 # construct API request
3799                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3800                 # retrieve .json file with links to files
3801                 request = urllib2.Request(file_url)
3802                 try:
3803                         self.report_download_json(file_url)
3804                         jsonData = urllib2.urlopen(request).read()
3805                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3806                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3807                         return
3808
3809                 # parse JSON
3810                 json_data = json.loads(jsonData)
3811                 player_url = json_data['player_swf_url']
3812                 formats = dict(json_data['audio_formats'])
3813
3814                 req_format = self._downloader.params.get('format', None)
3815                 bitrate = None
3816
3817                 if self._downloader.params.get('listformats', None):
3818                         self._print_formats(formats)
3819                         return
3820
3821                 if req_format is None or req_format == 'best':
3822                         for format_param in formats.keys():
3823                                 url_list = self.get_urls(formats, format_param)
3824                                 # check urls
3825                                 file_url = self.check_urls(url_list)
3826                                 if file_url is not None:
3827                                         break # got it!
3828                 else:
3829                         if req_format not in formats.keys():
3830                                 self._downloader.trouble(u'ERROR: format is not available')
3831                                 return
3832
3833                         url_list = self.get_urls(formats, req_format)
3834                         file_url = self.check_urls(url_list)
3835                         format_param = req_format
3836
3837                 # We have audio
3838                 self._downloader.increment_downloads()
3839                 try:
3840                         # Process file information
3841                         self._downloader.process_info({
3842                                 'id': file_id.decode('utf-8'),
3843                                 'url': file_url.decode('utf-8'),
3844                                 'uploader':     uploader.decode('utf-8'),
3845                                 'upload_date': u'NA',
3846                                 'title': json_data['name'],
3847                                 'stitle': _simplify_title(json_data['name']),
3848                                 'ext': file_url.split('.')[-1].decode('utf-8'),
3849                                 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3850                                 'thumbnail': json_data['thumbnail_url'],
3851                                 'description': json_data['description'],
3852                                 'player_url': player_url.decode('utf-8'),
3853                         })
3854                 except UnavailableVideoError, err:
3855                         self._downloader.trouble(u'ERROR: unable to download file')
3856
class StanfordOpenClassroomIE(InfoExtractor):
	"""Information extractor for Stanford's Open ClassRoom"""

	# Matches the site root, a CoursePage, or a VideoPage; the named
	# groups 'course' and 'video' decide which branch _real_extract takes.
	_VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
	IE_NAME = u'stanfordoc'

	def report_download_webpage(self, objid):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		# Handles three URL shapes: a single video, a course page
		# (playlist of videos), and the site root (playlist of courses).
		# The playlist branches recurse through self.extract().
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		if mobj.group('course') and mobj.group('video'): # A specific video
			course = mobj.group('course')
			video = mobj.group('video')
			info = {
				'id': _simplify_title(course + '_' + video),
			}

			self.report_extraction(info['id'])
			# Per-video metadata lives in an XML file next to the media.
			baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
			xmlUrl = baseUrl + video + '.xml'
			try:
				metaXml = urllib2.urlopen(xmlUrl).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
				return
			mdoc = xml.etree.ElementTree.fromstring(metaXml)
			try:
				info['title'] = mdoc.findall('./title')[0].text
				info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
			except IndexError:
				self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
				return
			info['stitle'] = _simplify_title(info['title'])
			# Derive extension/format from the media URL's suffix.
			info['ext'] = info['url'].rpartition('.')[2]
			info['format'] = info['ext']
			self._downloader.increment_downloads()
			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
		elif mobj.group('course'): # A course page
			unescapeHTML = HTMLParser.HTMLParser().unescape

			course = mobj.group('course')
			info = {
				'id': _simplify_title(course),
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			try:
				coursepage = urllib2.urlopen(url).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			# Course title; fall back to the simplified id if missing.
			m = re.search('<h1>([^<]+)</h1>', coursepage)
			if m:
				info['title'] = unescapeHTML(m.group(1))
			else:
				info['title'] = info['id']
			info['stitle'] = _simplify_title(info['title'])

			m = re.search('<description>([^<]+)</description>', coursepage)
			if m:
				info['description'] = unescapeHTML(m.group(1))

			# Collect every VideoPage link (deduplicated, order kept)
			# and recurse into each one.
			links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
				}
					for vpage in links]

			for entry in info['list']:
				assert entry['type'] == 'reference'
				self.extract(entry['url'])
		else: # Root page
			unescapeHTML = HTMLParser.HTMLParser().unescape

			info = {
				'id': 'Stanford OpenClassroom',
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
			try:
				rootpage = urllib2.urlopen(rootURL).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			info['title'] = info['id']
			info['stitle'] = _simplify_title(info['title'])

			# Collect every CoursePage link and recurse into each one;
			# those in turn enumerate their videos via the branch above.
			links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
				}
					for cpage in links]

			for entry in info['list']:
				assert entry['type'] == 'reference'
				self.extract(entry['url'])
3975
3976 class MTVIE(InfoExtractor):
3977         """Information extractor for MTV.com"""
3978
3979         _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3980         IE_NAME = u'mtv'
3981
3982         def report_webpage(self, video_id):
3983                 """Report information extraction."""
3984                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3985
3986         def report_extraction(self, video_id):
3987                 """Report information extraction."""
3988                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3989
3990         def _real_extract(self, url):
3991                 mobj = re.match(self._VALID_URL, url)
3992                 if mobj is None:
3993                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3994                         return
3995                 if not mobj.group('proto'):
3996                         url = 'http://' + url
3997                 video_id = mobj.group('videoid')
3998                 self.report_webpage(video_id)
3999
4000                 request = urllib2.Request(url)
4001                 try:
4002                         webpage = urllib2.urlopen(request).read()
4003                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4004                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
4005                         return
4006
4007                 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
4008                 if mobj is None:
4009                         self._downloader.trouble(u'ERROR: unable to extract song name')
4010                         return
4011                 song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4012                 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
4013                 if mobj is None:
4014                         self._downloader.trouble(u'ERROR: unable to extract performer')
4015                         return
4016                 performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4017                 video_title = performer + ' - ' + song_name 
4018
4019                 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
4020                 if mobj is None:
4021                         self._downloader.trouble(u'ERROR: unable to mtvn_uri')
4022                         return
4023                 mtvn_uri = mobj.group(1)
4024
4025                 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
4026                 if mobj is None:
4027                         self._downloader.trouble(u'ERROR: unable to extract content id')
4028                         return
4029                 content_id = mobj.group(1)
4030
4031                 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
4032                 self.report_extraction(video_id)
4033                 request = urllib2.Request(videogen_url)
4034                 try:
4035                         metadataXml = urllib2.urlopen(request).read()
4036                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4037                         self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
4038                         return
4039
4040                 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
4041                 renditions = mdoc.findall('.//rendition')
4042
4043                 # For now, always pick the highest quality.
4044                 rendition = renditions[-1]
4045
4046                 try:
4047                         _,_,ext = rendition.attrib['type'].partition('/')
4048                         format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
4049                         video_url = rendition.find('./src').text
4050                 except KeyError:
4051                         self._downloader.trouble('Invalid rendition field.')
4052                         return
4053
4054                 self._downloader.increment_downloads()
4055                 info = {
4056                         'id': video_id,
4057                         'url': video_url,
4058                         'uploader': performer,
4059                         'title': video_title,
4060                         'stitle': _simplify_title(video_title),
4061                         'ext': ext,
4062                         'format': format,
4063                 }
4064
4065                 try:
4066                         self._downloader.process_info(info)
4067                 except UnavailableVideoError, err:
4068                         self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
4069
4070
class PostProcessor(object):
	"""Base class for post-processing steps.

	Instances are attached to a downloader with its
	add_post_processor() method.  After each successful download the
	downloader pushes the info dictionary through the chain of
	registered PostProcessors, handing every one the value returned by
	its predecessor, and stops as soon as one of them returns None (or
	the end of the chain is reached).

	Like InfoExtractor objects, PostProcessors take part in a mutual
	registration scheme with their downloader.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach the given downloader to this post processor."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file.

		The "information" argument is an InfoExtractor-style
		dictionary extended with a "filepath" entry naming the file
		that was just downloaded.

		Returning None aborts the postprocessing chain; returning a
		dictionary (possibly this one, possibly modified) passes it on
		to the next PostProcessor.  A PostProcessingError may be
		raised to report a failure to the calling downloader.

		The default implementation is the identity function.
		"""
		return information
4116
class AudioConversionError(Exception):
	"""Raised when ffmpeg/ffprobe fails during audio extraction.

	Derives from Exception rather than BaseException so that generic
	`except Exception` handlers can catch it; BaseException is reserved
	for interpreter exits such as KeyboardInterrupt and SystemExit.
	"""

	def __init__(self, message):
		Exception.__init__(self, message)
		# kept for existing callers that read err.message
		self.message = message
4120
4121 class FFmpegExtractAudioPP(PostProcessor):
4122
4123         def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
4124                 PostProcessor.__init__(self, downloader)
4125                 if preferredcodec is None:
4126                         preferredcodec = 'best'
4127                 self._preferredcodec = preferredcodec
4128                 self._preferredquality = preferredquality
4129                 self._keepvideo = keepvideo
4130
4131         @staticmethod
4132         def get_audio_codec(path):
4133                 try:
4134                         cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
4135                         handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
4136                         output = handle.communicate()[0]
4137                         if handle.wait() != 0:
4138                                 return None
4139                 except (IOError, OSError):
4140                         return None
4141                 audio_codec = None
4142                 for line in output.split('\n'):
4143                         if line.startswith('codec_name='):
4144                                 audio_codec = line.split('=')[1].strip()
4145                         elif line.strip() == 'codec_type=audio' and audio_codec is not None:
4146                                 return audio_codec
4147                 return None
4148
	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Transcode path to out_path with ffmpeg.

		codec: audio codec passed via -acodec (None lets ffmpeg choose).
		more_opts: extra command-line options inserted before the output.
		Raises AudioConversionError when ffmpeg is missing or exits
		with a non-zero status.
		"""
		if codec is None:
			acodec_opts = []
		else:
			acodec_opts = ['-acodec', codec]
		# '-vn' drops the video stream; '--' protects out_path in case it
		# starts with a dash
		cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
		try:
			p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			stdout,stderr = p.communicate()
		except (IOError, OSError):
			# sys.exc_info() instead of 'except ... as e' keeps this
			# parseable across old and new Python syntaxes
			e = sys.exc_info()[1]
			if isinstance(e, OSError) and e.errno == 2:
				# errno 2 == ENOENT: the ffmpeg binary was not found
				raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
			else:
				raise e
		if p.returncode != 0:
			# surface ffmpeg's last stderr line as the error message
			msg = stderr.strip().split('\n')[-1]
			raise AudioConversionError(msg)
4168
4169         def run(self, information):
4170                 path = information['filepath']
4171
4172                 filecodec = self.get_audio_codec(path)
4173                 if filecodec is None:
4174                         self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
4175                         return None
4176
4177                 more_opts = []
4178                 if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
4179                         if self._preferredcodec == 'm4a' and filecodec == 'aac':
4180                                 # Lossless, but in another container
4181                                 acodec = 'copy'
4182                                 extension = self._preferredcodec
4183                                 more_opts = ['-absf', 'aac_adtstoasc']
4184                         elif filecodec in ['aac', 'mp3', 'vorbis']:
4185                                 # Lossless if possible
4186                                 acodec = 'copy'
4187                                 extension = filecodec
4188                                 if filecodec == 'aac':
4189                                         more_opts = ['-f', 'adts']
4190                                 if filecodec == 'vorbis':
4191                                         extension = 'ogg'
4192                         else:
4193                                 # MP3 otherwise.
4194                                 acodec = 'libmp3lame'
4195                                 extension = 'mp3'
4196                                 more_opts = []
4197                                 if self._preferredquality is not None:
4198                                         more_opts += ['-ab', self._preferredquality]
4199                 else:
4200                         # We convert the audio (lossy)
4201                         acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
4202                         extension = self._preferredcodec
4203                         more_opts = []
4204                         if self._preferredquality is not None:
4205                                 more_opts += ['-ab', self._preferredquality]
4206                         if self._preferredcodec == 'aac':
4207                                 more_opts += ['-f', 'adts']
4208                         if self._preferredcodec == 'm4a':
4209                                 more_opts += ['-absf', 'aac_adtstoasc']
4210                         if self._preferredcodec == 'vorbis':
4211                                 extension = 'ogg'
4212                         if self._preferredcodec == 'wav':
4213                                 extension = 'wav'
4214                                 more_opts += ['-f', 'wav']
4215
4216                 prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
4217                 new_path = prefix + sep + extension
4218                 self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
4219                 try:
4220                         self.run_ffmpeg(path, new_path, acodec, more_opts)
4221                 except:
4222                         etype,e,tb = sys.exc_info()
4223                         if isinstance(e, AudioConversionError):
4224                                 self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
4225                         else:
4226                                 self._downloader.to_stderr(u'ERROR: error running ffmpeg')
4227                         return None
4228
4229                 # Try to update the date time for extracted audio file.
4230                 if information.get('filetime') is not None:
4231                         try:
4232                                 os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
4233                         except:
4234                                 self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')
4235
4236                 if not self._keepvideo:
4237                         try:
4238                                 os.remove(_encodeFilename(path))
4239                         except (IOError, OSError):
4240                                 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
4241                                 return None
4242
4243                 information['filepath'] = new_path
4244                 return information
4245
4246
4247 def updateSelf(downloader, filename):
4248         ''' Update the program file with the latest version from the repository '''
4249         # Note: downloader only used for options
4250         if not os.access(filename, os.W_OK):
4251                 sys.exit('ERROR: no write permissions on %s' % filename)
4252
4253         downloader.to_screen(u'Updating to latest version...')
4254
4255         try:
4256                 try:
4257                         urlh = urllib.urlopen(UPDATE_URL)
4258                         newcontent = urlh.read()
4259                         
4260                         vmatch = re.search("__version__ = '([^']+)'", newcontent)
4261                         if vmatch is not None and vmatch.group(1) == __version__:
4262                                 downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
4263                                 return
4264                 finally:
4265                         urlh.close()
4266         except (IOError, OSError), err:
4267                 sys.exit('ERROR: unable to download latest version')
4268
4269         try:
4270                 outf = open(filename, 'wb')
4271                 try:
4272                         outf.write(newcontent)
4273                 finally:
4274                         outf.close()
4275         except (IOError, OSError), err:
4276                 sys.exit('ERROR: unable to overwrite current version')
4277
4278         downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
4279
def parseOpts():
	"""Build the optparse parser and parse configuration plus command line.

	Arguments are collected from /etc/youtube-dl.conf, then the per-user
	configuration file, then sys.argv, so later sources override earlier
	ones. Returns the (parser, opts, args) triple from optparse.
	"""
	def _readOptions(filename_bytes):
		# Read extra command-line arguments from a config file, one or more
		# per line, tokenized shell-style ('#' starts a comment).
		try:
			optionf = open(filename_bytes)
		except IOError:
			return [] # silently skip if file is not present
		try:
			res = []
			for l in optionf:
				res += shlex.split(l, comments=True)
		finally:
			optionf.close()
		return res

	def _format_option_string(option):
		''' ('-o', '--option') -> -o, --format METAVAR'''

		opts = []

		if option._short_opts: opts.append(option._short_opts[0])
		if option._long_opts: opts.append(option._long_opts[0])
		if len(opts) > 1: opts.insert(1, ', ')

		if option.takes_value(): opts.append(' %s' % option.metavar)

		return "".join(opts)

	def _find_term_columns():
		# Best-effort terminal width: honour $COLUMNS, else ask stty;
		# returns None when the width cannot be determined.
		columns = os.environ.get('COLUMNS', None)
		if columns:
			return int(columns)

		try:
			sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			out,err = sp.communicate()
			return int(out.split()[1])
		except:
			pass
		return None

	max_width = 80
	max_help_position = 80

	# No need to wrap help messages if we're on a wide console
	columns = _find_term_columns()
	if columns: max_width = columns

	fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
	fmt.format_option_strings = _format_option_string

	# 'resolve' lets later options silently override earlier ones with the
	# same switch (e.g. '-v' is both --version and --verbose below).
	kw = {
		'version'   : __version__,
		'formatter' : fmt,
		'usage' : '%prog [options] url [url...]',
		'conflict_handler' : 'resolve',
	}

	parser = optparse.OptionParser(**kw)

	# option groups
	general        = optparse.OptionGroup(parser, 'General Options')
	selection      = optparse.OptionGroup(parser, 'Video Selection')
	authentication = optparse.OptionGroup(parser, 'Authentication Options')
	video_format   = optparse.OptionGroup(parser, 'Video Format Options')
	postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
	filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
	verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

	general.add_option('-h', '--help',
			action='help', help='print this help text and exit')
	general.add_option('-v', '--version',
			action='version', help='print program version and exit')
	general.add_option('-U', '--update',
			action='store_true', dest='update_self', help='update this program to latest version')
	general.add_option('-i', '--ignore-errors',
			action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
	general.add_option('-r', '--rate-limit',
			dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
	general.add_option('-R', '--retries',
			dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
	general.add_option('--dump-user-agent',
			action='store_true', dest='dump_user_agent',
			help='display the current browser identification', default=False)
	general.add_option('--list-extractors',
			action='store_true', dest='list_extractors',
			help='List all supported extractors and the URLs they would handle', default=False)

	selection.add_option('--playlist-start',
			dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
	selection.add_option('--playlist-end',
			dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
	selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
	selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
	selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)

	authentication.add_option('-u', '--username',
			dest='username', metavar='USERNAME', help='account username')
	authentication.add_option('-p', '--password',
			dest='password', metavar='PASSWORD', help='account password')
	authentication.add_option('-n', '--netrc',
			action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)


	video_format.add_option('-f', '--format',
			action='store', dest='format', metavar='FORMAT', help='video format code')
	video_format.add_option('--all-formats',
			action='store_const', dest='format', help='download all available video formats', const='all')
	video_format.add_option('--prefer-free-formats',
			action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
	video_format.add_option('--max-quality',
			action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
	video_format.add_option('-F', '--list-formats',
			action='store_true', dest='listformats', help='list all available formats (currently youtube only)')


	verbosity.add_option('-q', '--quiet',
			action='store_true', dest='quiet', help='activates quiet mode', default=False)
	verbosity.add_option('-s', '--simulate',
			action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
	verbosity.add_option('--skip-download',
			action='store_true', dest='skip_download', help='do not download the video', default=False)
	verbosity.add_option('-g', '--get-url',
			action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
	verbosity.add_option('-e', '--get-title',
			action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
	verbosity.add_option('--get-thumbnail',
			action='store_true', dest='getthumbnail',
			help='simulate, quiet but print thumbnail URL', default=False)
	verbosity.add_option('--get-description',
			action='store_true', dest='getdescription',
			help='simulate, quiet but print video description', default=False)
	verbosity.add_option('--get-filename',
			action='store_true', dest='getfilename',
			help='simulate, quiet but print output filename', default=False)
	verbosity.add_option('--get-format',
			action='store_true', dest='getformat',
			help='simulate, quiet but print output format', default=False)
	verbosity.add_option('--no-progress',
			action='store_true', dest='noprogress', help='do not print progress bar', default=False)
	verbosity.add_option('--console-title',
			action='store_true', dest='consoletitle',
			help='display progress in console titlebar', default=False)
	verbosity.add_option('-v', '--verbose',
			action='store_true', dest='verbose', help='print various debugging information', default=False)


	filesystem.add_option('-t', '--title',
			action='store_true', dest='usetitle', help='use title in file name', default=False)
	filesystem.add_option('-l', '--literal',
			action='store_true', dest='useliteral', help='use literal title in file name', default=False)
	filesystem.add_option('-A', '--auto-number',
			action='store_true', dest='autonumber',
			help='number downloaded files starting from 00000', default=False)
	filesystem.add_option('-o', '--output',
			dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
	filesystem.add_option('-a', '--batch-file',
			dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
	filesystem.add_option('-w', '--no-overwrites',
			action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
	filesystem.add_option('-c', '--continue',
			action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
	filesystem.add_option('--no-continue',
			action='store_false', dest='continue_dl',
			help='do not resume partially downloaded files (restart from beginning)')
	filesystem.add_option('--cookies',
			dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
	filesystem.add_option('--no-part',
			action='store_true', dest='nopart', help='do not use .part files', default=False)
	filesystem.add_option('--no-mtime',
			action='store_false', dest='updatetime',
			help='do not use the Last-modified header to set the file modification time', default=True)
	filesystem.add_option('--write-description',
			action='store_true', dest='writedescription',
			help='write video description to a .description file', default=False)
	filesystem.add_option('--write-info-json',
			action='store_true', dest='writeinfojson',
			help='write video metadata to a .info.json file', default=False)
	filesystem.add_option('--write-srt',
			action='store_true', dest='writesubtitles',
			help='write video subtitles to a .srt file', default=False)


	postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
			help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
	postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
			help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
	postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
			help='ffmpeg audio bitrate specification, 128k by default')
	postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
			help='keeps the video file on disk after the post-processing; the video is erased by default')


	parser.add_option_group(general)
	parser.add_option_group(selection)
	parser.add_option_group(filesystem)
	parser.add_option_group(verbosity)
	parser.add_option_group(video_format)
	parser.add_option_group(authentication)
	parser.add_option_group(postproc)

	xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
	if xdg_config_home:
		userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
	else:
		userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
	# System-wide config first, then user config, then the command line,
	# so that more specific sources take precedence.
	argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
	opts, args = parser.parse_args(argv)

	return parser, opts, args
4489
def gen_extractors():
	"""Instantiate every supported extractor, in matching-priority order.

	The first extractor whose suitable() accepts a URL handles it, so the
	search/playlist/user wrappers must precede the plain extractors they
	wrap, and the generic fallback must stay last.
	"""
	youtube = YoutubeIE()
	google = GoogleIE()
	yahoo = YahooIE()

	# Extractors that share state with (or must match before) another one.
	extractors = [
		YoutubePlaylistIE(youtube),
		YoutubeUserIE(youtube),
		YoutubeSearchIE(youtube),
		youtube,
		MetacafeIE(youtube),
		DailymotionIE(),
		google,
		GoogleSearchIE(google),
		PhotobucketIE(),
		yahoo,
		YahooSearchIE(yahoo),
	]

	# Independent site extractors, instantiated with no arguments.
	extractors.extend(ie_class() for ie_class in (
		DepositFilesIE,
		FacebookIE,
		BlipTVIE,
		VimeoIE,
		MyVideoIE,
		ComedyCentralIE,
		EscapistIE,
		CollegeHumorIE,
		XVideosIE,
		SoundcloudIE,
		InfoQIE,
		MixcloudIE,
		StanfordOpenClassroomIE,
		MTVIE,
	))

	# Catch-all; must remain the very last entry.
	extractors.append(GenericIE())
	return extractors
4526
def _real_main():
	"""Actual program body: parse options, validate them, build the
	FileDownloader with its extractors/post-processors and run the
	downloads. Exits the process with the downloader's return code."""
	parser, opts, args = parseOpts()

	# Open appropriate CookieJar
	if opts.cookiefile is None:
		jar = cookielib.CookieJar()
	else:
		try:
			jar = cookielib.MozillaCookieJar(opts.cookiefile)
			# Only load if the file already exists and is readable; a fresh
			# jar file is created on save at the end of the run.
			if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
				jar.load()
		except (IOError, OSError), err:
			sys.exit(u'ERROR: unable to open cookie file')

	# Dump user agent
	if opts.dump_user_agent:
		print std_headers['User-Agent']
		sys.exit(0)

	# Batch file verification
	batchurls = []
	if opts.batchfile is not None:
		try:
			if opts.batchfile == '-':
				batchfd = sys.stdin
			else:
				batchfd = open(opts.batchfile, 'r')
			batchurls = batchfd.readlines()
			batchurls = [x.strip() for x in batchurls]
			# Drop blank lines and lines starting with a comment marker.
			batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
		except IOError:
			sys.exit(u'ERROR: batch file could not be read')
	all_urls = batchurls + args

	# General configuration
	cookie_processor = urllib2.HTTPCookieProcessor(jar)
	proxy_handler = urllib2.ProxyHandler()
	opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
	urllib2.install_opener(opener)
	socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

	if opts.verbose:
		print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))

	extractors = gen_extractors()

	if opts.list_extractors:
		# Print each extractor followed by the given URLs it would handle,
		# removing matched URLs so each is listed under one extractor only.
		for ie in extractors:
			print(ie.IE_NAME)
			matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
			all_urls = filter(lambda url: url not in matchedUrls, all_urls)
			for mu in matchedUrls:
				print(u'  ' + mu)
		sys.exit(0)

	# Conflicting, missing and erroneous options
	if opts.usenetrc and (opts.username is not None or opts.password is not None):
		parser.error(u'using .netrc conflicts with giving username/password')
	if opts.password is not None and opts.username is None:
		parser.error(u'account username missing')
	if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
		parser.error(u'using output template conflicts with using title, literal title or auto number')
	if opts.usetitle and opts.useliteral:
		parser.error(u'using title conflicts with using literal title')
	if opts.username is not None and opts.password is None:
		opts.password = getpass.getpass(u'Type account password and press return:')
	if opts.ratelimit is not None:
		numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
		if numeric_limit is None:
			parser.error(u'invalid rate limit specified')
		opts.ratelimit = numeric_limit
	if opts.retries is not None:
		try:
			opts.retries = long(opts.retries)
		except (TypeError, ValueError), err:
			parser.error(u'invalid retry count specified')
	try:
		opts.playliststart = int(opts.playliststart)
		if opts.playliststart <= 0:
			raise ValueError(u'Playlist start must be positive')
	except (TypeError, ValueError), err:
		parser.error(u'invalid playlist start number specified')
	try:
		opts.playlistend = int(opts.playlistend)
		# -1 is the sentinel for "until the end of the playlist".
		if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
			raise ValueError(u'Playlist end must be greater than playlist start')
	except (TypeError, ValueError), err:
		parser.error(u'invalid playlist end number specified')
	if opts.extractaudio:
		if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
			parser.error(u'invalid audio format specified')

	# File downloader
	fd = FileDownloader({
		'usenetrc': opts.usenetrc,
		'username': opts.username,
		'password': opts.password,
		'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
		'forceurl': opts.geturl,
		'forcetitle': opts.gettitle,
		'forcethumbnail': opts.getthumbnail,
		'forcedescription': opts.getdescription,
		'forcefilename': opts.getfilename,
		'forceformat': opts.getformat,
		'simulate': opts.simulate,
		'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
		'format': opts.format,
		'format_limit': opts.format_limit,
		'listformats': opts.listformats,
		# First matching template wins; the final literal is the fallback.
		'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
			or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
			or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
			or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
			or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
			or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
			or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
			or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
			or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
			or u'%(id)s.%(ext)s'),
		'ignoreerrors': opts.ignoreerrors,
		'ratelimit': opts.ratelimit,
		'nooverwrites': opts.nooverwrites,
		'retries': opts.retries,
		'continuedl': opts.continue_dl,
		'noprogress': opts.noprogress,
		'playliststart': opts.playliststart,
		'playlistend': opts.playlistend,
		'logtostderr': opts.outtmpl == '-',
		'consoletitle': opts.consoletitle,
		'nopart': opts.nopart,
		'updatetime': opts.updatetime,
		'writedescription': opts.writedescription,
		'writeinfojson': opts.writeinfojson,
		'writesubtitles': opts.writesubtitles,
		'matchtitle': opts.matchtitle,
		'rejecttitle': opts.rejecttitle,
		'max_downloads': opts.max_downloads,
		'prefer_free_formats': opts.prefer_free_formats,
		'verbose': opts.verbose,
		})
	for extractor in extractors:
		fd.add_info_extractor(extractor)

	# PostProcessors
	if opts.extractaudio:
		fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))

	# Update version
	if opts.update_self:
		updateSelf(fd, sys.argv[0])

	# Maybe do nothing
	if len(all_urls) < 1:
		if not opts.update_self:
			parser.error(u'you must provide at least one URL')
		else:
			sys.exit()

	try:
		retcode = fd.download(all_urls)
	except MaxDownloadsReached:
		fd.to_screen(u'--max-download limit reached, aborting.')
		retcode = 101

	# Dump cookie jar if requested
	if opts.cookiefile is not None:
		try:
			jar.save()
		except (IOError, OSError), err:
			sys.exit(u'ERROR: unable to save cookie jar')

	sys.exit(retcode)
4699
def main():
	# Top-level entry point: run the real program and translate the expected
	# failure modes into clean exit codes/messages instead of tracebacks.
	# NOTE(review): except-clause order preserved as-is — whether
	# SameFileError/DownloadError are hierarchically related is not visible
	# from this part of the file.
	try:
		_real_main()
	except DownloadError:
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')
# Standard script entry guard: run main() only when executed directly,
# not when the file is imported as a module.
if __name__ == '__main__':
	main()
4712
4713 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: