Merge branch 'master' of git://github.com/rg3/youtube-dl into closed-captions
[youtube-dl.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 __authors__  = (
5         'Ricardo Garcia Gonzalez',
6         'Danny Colligan',
7         'Benjamin Johnson',
8         'Vasyl\' Vavrychuk',
9         'Witold Baryluk',
10         'Paweł Paprota',
11         'Gergely Imreh',
12         'Rogério Brito',
13         'Philipp Hagemeister',
14         'Sören Schulze',
15         'Kevin Ngo',
16         'Ori Avtalion',
17         'shizeeg',
18         )
19
20 __license__ = 'Public Domain'
21 __version__ = '2012.02.27'
22
23 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
24
25
26 import cookielib
27 import datetime
28 import getpass
29 import gzip
30 import htmlentitydefs
31 import HTMLParser
32 import httplib
33 import locale
34 import math
35 import netrc
36 import optparse
37 import os
38 import os.path
39 import re
40 import shlex
41 import socket
42 import string
43 import subprocess
44 import sys
45 import time
46 import urllib
47 import urllib2
48 import warnings
49 import zlib
50
51 if os.name == 'nt':
52         import ctypes
53
54 try:
55         import email.utils
56 except ImportError: # Python 2.4
57         import email.Utils
58 try:
59         import cStringIO as StringIO
60 except ImportError:
61         import StringIO
62
63 # parse_qs was moved from the cgi module to the urlparse module recently.
64 try:
65         from urlparse import parse_qs
66 except ImportError:
67         from cgi import parse_qs
68
69 try:
70         import lxml.etree
71 except ImportError:
72         pass # Handled below
73
74 try:
75         import xml.etree.ElementTree
76 except ImportError: # Python<2.5: Not officially supported, but let it slip
77         warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
78
79 std_headers = {
80         'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
81         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
82         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
83         'Accept-Encoding': 'gzip, deflate',
84         'Accept-Language': 'en-us,en;q=0.5',
85 }
86
87 try:
88         import json
89 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
90         import re
	class json(object):
		# Minimal pure-Python JSON decoder (trivialjson), used as a
		# fallback on Python < 2.6 where the stdlib json module does not
		# exist. Only decoding (loads) is provided.
		@staticmethod
		def loads(s):
			"""Parse the JSON document in byte string s and return the
			corresponding Python object. Raises ValueError on bad input."""
			# Decode to unicode up front; all parsing below works on text.
			s = s.decode('UTF-8')
			def raiseError(msg, i):
				# Every error message carries the position and the rest of
				# the input to ease debugging.
				raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
			def skipSpace(i, expectMore=True):
				# Advance i past any whitespace; when expectMore is set,
				# running off the end of the input is an error.
				while i < len(s) and s[i] in ' \t\r\n':
					i += 1
				if expectMore:
					if i >= len(s):
						raiseError('Premature end', i)
				return i
			def decodeEscape(match):
				# Translate one backslash escape (regex group 1) into the
				# character it denotes.
				esc = match.group(1)
				_STATIC = {
					'"': '"',
					'\\': '\\',
					'/': '/',
					'b': unichr(0x8),
					'f': unichr(0xc),
					'n': '\n',
					'r': '\r',
					't': '\t',
				}
				if esc in _STATIC:
					return _STATIC[esc]
				if esc[0] == 'u':
					if len(esc) == 1+4:
						# Plain \uXXXX escape.
						return unichr(int(esc[1:5], 16))
					if len(esc) == 5+6 and esc[5:7] == '\\u':
						# UTF-16 surrogate pair \uD8xx\uDCxx combined into
						# a single code point above U+FFFF.
						hi = int(esc[1:5], 16)
						low = int(esc[7:11], 16)
						return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
				raise ValueError('Unknown escape ' + str(esc))
			def parseString(i):
				# Parse a string whose opening quote is at s[i]; returns
				# (index just past the closing quote, decoded text).
				i += 1
				e = i
				while True:
					e = s.index('"', e)
					# Count the backslashes immediately preceding the
					# quote: an odd count means the quote is escaped.
					bslashes = 0
					while s[e-bslashes-1] == '\\':
						bslashes += 1
					if bslashes % 2 == 1:
						e += 1
						continue
					break
				# Replace all escapes in one pass (including surrogate
				# pairs, matched as a unit by the first alternative).
				rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
				stri = rexp.sub(decodeEscape, s[i:e])
				return (e+1,stri)
			def parseObj(i):
				# Parse an object whose '{' is at s[i].
				i += 1
				res = {}
				i = skipSpace(i)
				if s[i] == '}': # Empty dictionary
					return (i+1,res)
				while True:
					if s[i] != '"':
						raiseError('Expected a string object key', i)
					i,key = parseString(i)
					i = skipSpace(i)
					if i >= len(s) or s[i] != ':':
						raiseError('Expected a colon', i)
					i,val = parse(i+1)
					res[key] = val
					i = skipSpace(i)
					if s[i] == '}':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected comma or closing curly brace', i)
					i = skipSpace(i+1)
			def parseArray(i):
				# Parse an array whose '[' is at s[i].
				res = []
				i = skipSpace(i+1)
				if s[i] == ']': # Empty array
					return (i+1,res)
				while True:
					i,val = parse(i)
					res.append(val)
					i = skipSpace(i) # Raise exception if premature end
					if s[i] == ']':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected a comma or closing bracket', i)
					i = skipSpace(i+1)
			def parseDiscrete(i):
				# Parse the literals true / false / null.
				for k,v in {'true': True, 'false': False, 'null': None}.items():
					if s.startswith(k, i):
						return (i+len(k), v)
				raiseError('Not a boolean (or null)', i)
			def parseNumber(i):
				# Parse an int or float; '.', 'e' or 'E' marks a float.
				mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
				if mobj is None:
					raiseError('Not a number', i)
				nums = mobj.group(1)
				if '.' in nums or 'e' in nums or 'E' in nums:
					return (i+len(nums), float(nums))
				return (i+len(nums), int(nums))
			# Dispatch on the first character of a value; anything not in
			# the map is assumed to start a number.
			CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
			def parse(i):
				i = skipSpace(i)
				i,res = CHARMAP.get(s[i], parseNumber)(i)
				i = skipSpace(i, False)
				return (i,res)
			i,res = parse(0)
			if i < len(s):
				raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
			return res
199
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	# The original wrapped this in a one-shot generator and called
	# .next() on it, which added nothing; a plain return is equivalent.
	try:
		pref = locale.getpreferredencoding()
		# Verify the reported codec actually exists and can encode text;
		# some platforms report bogus values.
		u'TEST'.encode(pref)
	except:
		pref = 'UTF-8'
	return pref
215
216
217 def htmlentity_transform(matchobj):
218         """Transforms an HTML entity to a Unicode character.
219
220         This function receives a match object and is intended to be used with
221         the re.sub() function.
222         """
223         entity = matchobj.group(1)
224
225         # Known non-numeric HTML entity
226         if entity in htmlentitydefs.name2codepoint:
227                 return unichr(htmlentitydefs.name2codepoint[entity])
228
229         # Unicode character
230         mobj = re.match(ur'(?u)#(x?\d+)', entity)
231         if mobj is not None:
232                 numstr = mobj.group(1)
233                 if numstr.startswith(u'x'):
234                         base = 16
235                         numstr = u'0%s' % numstr
236                 else:
237                         base = 10
238                 return unichr(long(numstr, base))
239
240         # Unknown entity in name, return its literal representation
241         return (u'&%s;' % entity)
242
243
244 def sanitize_title(utitle):
245         """Sanitizes a video title so it could be used as part of a filename."""
246         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
247         return utitle.replace(unicode(os.sep), u'%')
248
249
250 def sanitize_open(filename, open_mode):
251         """Try to open the given filename, and slightly tweak it if this fails.
252
253         Attempts to open the given filename. If this fails, it tries to change
254         the filename slightly, step by step, until it's either able to open it
255         or it fails and raises a final exception, like the standard open()
256         function.
257
258         It returns the tuple (stream, definitive_file_name).
259         """
260         try:
261                 if filename == u'-':
262                         if sys.platform == 'win32':
263                                 import msvcrt
264                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
265                         return (sys.stdout, filename)
266                 stream = open(_encodeFilename(filename), open_mode)
267                 return (stream, filename)
268         except (IOError, OSError), err:
269                 # In case of error, try to remove win32 forbidden chars
270                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
271
272                 # An exception here should be caught in the caller
273                 stream = open(_encodeFilename(filename), open_mode)
274                 return (stream, filename)
275
276
def timeconvert(timestr):
	"""Convert RFC 2822 defined time string into system timestamp"""
	# parsedate_tz returns None for unparsable input, which we propagate.
	parsed = email.utils.parsedate_tz(timestr)
	if parsed is None:
		return None
	return email.utils.mktime_tz(parsed)
284
def _simplify_title(title):
	"""Collapse every run of filename-unfriendly characters to one '_'."""
	pattern = re.compile(u'[^\\w\\d_\\-]+', flags=re.UNICODE)
	return pattern.sub(u'_', title).strip(u'_')
288
def _orderedSet(iterable):
	""" Remove all duplicates from the input iterable, keeping order """
	result = []
	for element in iterable:
		if element in result:
			continue
		result.append(element)
	return result
296
def _unescapeHTML(s):
	"""
	@param s a string (of type unicode)
	"""
	assert type(s) == type(u'')
	# Delegate entity decoding to the stdlib HTML parser.
	return HTMLParser.HTMLParser().unescape(s)
305
def _encodeFilename(s):
	"""
	@param s The name of the file (of type unicode)
	"""
	assert type(s) == type(u'')

	# Pass u'' directly to use Unicode APIs on Windows 2000 and up
	# (Detecting Windows NT 4 is tricky because 'major >= 4' would
	# match Windows 9x series as well. Besides, NT 4 is obsolete.)
	if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
		return s
	return s.encode(sys.getfilesystemencoding(), 'ignore')
320
class DownloadError(Exception):
	"""Raised when a download problem occurs.

	Thrown by FileDownloader objects that are not configured to continue
	on errors; carries the appropriate error message.
	"""
	pass
329
330
class SameFileError(Exception):
	"""Raised when multiple downloads would collide on one disk file.

	Thrown by FileDownloader objects when they detect that several files
	would have to be written to the same destination.
	"""
	pass
338
339
class PostProcessingError(Exception):
	"""Raised by a PostProcessor's .run() method.

	Indicates an error occurred during the postprocessing task.
	"""
	pass
347
class MaxDownloadsReached(Exception):
	"""Raised once the --max-downloads limit has been reached."""
	pass
351
352
class UnavailableVideoError(Exception):
	"""Raised when a video is requested in an unavailable format.

	Thrown when the requested format does not exist for that video.
	"""
	pass
360
361
class ContentTooShortError(Exception):
	"""Raised when a download delivers fewer bytes than announced.

	Thrown by FileDownloader objects when a downloaded file is smaller
	than the size the server reported, which usually means the
	connection was interrupted.
	"""
	# Byte counts: what was actually received vs. what was announced.
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		self.downloaded = downloaded
		self.expected = expected
376
377
class YoutubeDLHandler(urllib2.HTTPHandler):
	"""Handler for HTTP requests and responses.

	This class, when installed with an OpenerDirector, automatically adds
	the standard headers to every HTTP request and handles gzipped and
	deflated responses from web servers. If compression is to be avoided in
	a particular request, the original request in the program code only has
	to include the HTTP header "Youtubedl-No-Compression", which will be
	removed before making the real request.

	Part of this code was copied from:

	http://techknack.net/python-urllib2-handlers/

	Andrew Rowls, the author of that code, agreed to release it to the
	public domain.
	"""

	@staticmethod
	def deflate(data):
		# Some servers send raw deflate streams without the zlib header;
		# try the raw form first, then fall back to the standard one.
		try:
			return zlib.decompress(data, -zlib.MAX_WBITS)
		except zlib.error:
			return zlib.decompress(data)

	@staticmethod
	def addinfourl_wrapper(stream, headers, url, code):
		# Newer Pythons accept the response code as a constructor
		# argument (detected via getcode); older ones need it set after.
		if hasattr(urllib2.addinfourl, 'getcode'):
			return urllib2.addinfourl(stream, headers, url, code)
		ret = urllib2.addinfourl(stream, headers, url)
		ret.code = code
		return ret

	def http_request(self, req):
		# Force our standard headers, replacing any the caller set.
		for h in std_headers:
			if h in req.headers:
				del req.headers[h]
			req.add_header(h, std_headers[h])
		# NOTE(review): the lookup keys below differ from the documented
		# 'Youtubedl-No-Compression' spelling — presumably because
		# urllib2 normalizes stored header names; verify before changing.
		if 'Youtubedl-no-compression' in req.headers:
			if 'Accept-encoding' in req.headers:
				del req.headers['Accept-encoding']
			del req.headers['Youtubedl-no-compression']
		return req

	def http_response(self, req, resp):
		old_resp = resp
		# gzip: wrap the body in a decompressing reader, preserving the
		# original url, code and msg on the replacement response.
		if resp.headers.get('Content-encoding', '') == 'gzip':
			gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		# deflate
		if resp.headers.get('Content-encoding', '') == 'deflate':
			gz = StringIO.StringIO(self.deflate(resp.read()))
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		return resp
435
436
437 class FileDownloader(object):
438         """File Downloader class.
439
440         File downloader objects are the ones responsible of downloading the
441         actual video file and writing it to disk if the user has requested
442         it, among some other tasks. In most cases there should be one per
443         program. As, given a video URL, the downloader doesn't know how to
444         extract all the needed information, task that InfoExtractors do, it
445         has to pass the URL to one of them.
446
447         For this, file downloader objects have a method that allows
448         InfoExtractors to be registered in a given order. When it is passed
449         a URL, the file downloader handles it to the first InfoExtractor it
450         finds that reports being able to handle it. The InfoExtractor extracts
451         all the information about the video or videos the URL refers to, and
452         asks the FileDownloader to process the video information, possibly
453         downloading the video.
454
455         File downloaders accept a lot of parameters. In order not to saturate
456         the object constructor with arguments, it receives a dictionary of
457         options instead. These options are available through the params
458         attribute for the InfoExtractors to use. The FileDownloader also
459         registers itself as the downloader in charge for the InfoExtractors
460         that are added to it, so this is a "mutual registration".
461
462         Available options:
463
464         username:         Username for authentication purposes.
465         password:         Password for authentication purposes.
466         usenetrc:         Use netrc for authentication instead.
467         quiet:            Do not print messages to stdout.
468         forceurl:         Force printing final URL.
469         forcetitle:       Force printing title.
470         forcethumbnail:   Force printing thumbnail URL.
471         forcedescription: Force printing description.
472         forcefilename:    Force printing final filename.
473         simulate:         Do not download the video files.
474         format:           Video format code.
475         format_limit:     Highest quality format to try.
476         outtmpl:          Template for output names.
477         ignoreerrors:     Do not stop on download errors.
478         ratelimit:        Download speed limit, in bytes/sec.
479         nooverwrites:     Prevent overwriting files.
480         retries:          Number of times to retry for HTTP error 5xx
481         continuedl:       Try to continue downloads if possible.
482         noprogress:       Do not print the progress bar.
483         playliststart:    Playlist item to start at.
484         playlistend:      Playlist item to end at.
485         matchtitle:       Download only matching titles.
486         rejecttitle:      Reject downloads for matching titles.
487         logtostderr:      Log messages to stderr instead of stdout.
488         consoletitle:     Display progress in console window's titlebar.
489         nopart:           Do not use temporary .part files.
490         updatetime:       Use the Last-modified header to set output file timestamps.
491         writedescription: Write the video description to a .description file
492         writeinfojson:    Write the video description to a .info.json file
493         writesubtitles:   Write the video subtitles to a .srt file
494         """
495
496         params = None
497         _ies = []
498         _pps = []
499         _download_retcode = None
500         _num_downloads = None
501         _screen_file = None
502
503         def __init__(self, params):
504                 """Create a FileDownloader object with the given options."""
505                 self._ies = []
506                 self._pps = []
507                 self._download_retcode = 0
508                 self._num_downloads = 0
509                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
510                 self.params = params
511
512         @staticmethod
513         def format_bytes(bytes):
514                 if bytes is None:
515                         return 'N/A'
516                 if type(bytes) is str:
517                         bytes = float(bytes)
518                 if bytes == 0.0:
519                         exponent = 0
520                 else:
521                         exponent = long(math.log(bytes, 1024.0))
522                 suffix = 'bkMGTPEZY'[exponent]
523                 converted = float(bytes) / float(1024 ** exponent)
524                 return '%.2f%s' % (converted, suffix)
525
526         @staticmethod
527         def calc_percent(byte_counter, data_len):
528                 if data_len is None:
529                         return '---.-%'
530                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
531
532         @staticmethod
533         def calc_eta(start, now, total, current):
534                 if total is None:
535                         return '--:--'
536                 dif = now - start
537                 if current == 0 or dif < 0.001: # One millisecond
538                         return '--:--'
539                 rate = float(current) / dif
540                 eta = long((float(total) - float(current)) / rate)
541                 (eta_mins, eta_secs) = divmod(eta, 60)
542                 if eta_mins > 99:
543                         return '--:--'
544                 return '%02d:%02d' % (eta_mins, eta_secs)
545
546         @staticmethod
547         def calc_speed(start, now, bytes):
548                 dif = now - start
549                 if bytes == 0 or dif < 0.001: # One millisecond
550                         return '%10s' % '---b/s'
551                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
552
553         @staticmethod
554         def best_block_size(elapsed_time, bytes):
555                 new_min = max(bytes / 2.0, 1.0)
556                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
557                 if elapsed_time < 0.001:
558                         return long(new_max)
559                 rate = bytes / elapsed_time
560                 if rate > new_max:
561                         return long(new_max)
562                 if rate < new_min:
563                         return long(new_min)
564                 return long(rate)
565
566         @staticmethod
567         def parse_bytes(bytestr):
568                 """Parse a string indicating a byte quantity into a long integer."""
569                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
570                 if matchobj is None:
571                         return None
572                 number = float(matchobj.group(1))
573                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
574                 return long(round(number * multiplier))
575
576         def add_info_extractor(self, ie):
577                 """Add an InfoExtractor object to the end of the list."""
578                 self._ies.append(ie)
579                 ie.set_downloader(self)
580
581         def add_post_processor(self, pp):
582                 """Add a PostProcessor object to the end of the chain."""
583                 self._pps.append(pp)
584                 pp.set_downloader(self)
585
586         def to_screen(self, message, skip_eol=False):
587                 """Print message to stdout if not in quiet mode."""
588                 assert type(message) == type(u'')
589                 if not self.params.get('quiet', False):
590                         terminator = [u'\n', u''][skip_eol]
591                         output = message + terminator
592
593                         if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
594                                 output = output.encode(preferredencoding(), 'ignore')
595                         self._screen_file.write(output)
596                         self._screen_file.flush()
597
	def to_stderr(self, message):
		"""Print message to stderr."""
		# Encode with the locale's preferred encoding before writing.
		print >>sys.stderr, message.encode(preferredencoding())
601
602         def to_cons_title(self, message):
603                 """Set console/terminal window title to message."""
604                 if not self.params.get('consoletitle', False):
605                         return
606                 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
607                         # c_wchar_p() might not be necessary if `message` is
608                         # already of type unicode()
609                         ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
610                 elif 'TERM' in os.environ:
611                         sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
612
613         def fixed_template(self):
614                 """Checks if the output template is fixed."""
615                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
616
617         def trouble(self, message=None):
618                 """Determine action to take when a download problem appears.
619
620                 Depending on if the downloader has been configured to ignore
621                 download errors or not, this method may throw an exception or
622                 not when errors are found, after printing the message.
623                 """
624                 if message is not None:
625                         self.to_stderr(message)
626                 if not self.params.get('ignoreerrors', False):
627                         raise DownloadError(message)
628                 self._download_retcode = 1
629
630         def slow_down(self, start_time, byte_counter):
631                 """Sleep if the download speed is over the rate limit."""
632                 rate_limit = self.params.get('ratelimit', None)
633                 if rate_limit is None or byte_counter == 0:
634                         return
635                 now = time.time()
636                 elapsed = now - start_time
637                 if elapsed <= 0.0:
638                         return
639                 speed = float(byte_counter) / elapsed
640                 if speed > rate_limit:
641                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
642
643         def temp_name(self, filename):
644                 """Returns a temporary filename for the given filename."""
645                 if self.params.get('nopart', False) or filename == u'-' or \
646                                 (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
647                         return filename
648                 return filename + u'.part'
649
650         def undo_temp_name(self, filename):
651                 if filename.endswith(u'.part'):
652                         return filename[:-len(u'.part')]
653                 return filename
654
655         def try_rename(self, old_filename, new_filename):
656                 try:
657                         if old_filename == new_filename:
658                                 return
659                         os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
660                 except (IOError, OSError), err:
661                         self.trouble(u'ERROR: unable to rename file')
662
663         def try_utime(self, filename, last_modified_hdr):
664                 """Try to set the last-modified time of the given file."""
665                 if last_modified_hdr is None:
666                         return
667                 if not os.path.isfile(_encodeFilename(filename)):
668                         return
669                 timestr = last_modified_hdr
670                 if timestr is None:
671                         return
672                 filetime = timeconvert(timestr)
673                 if filetime is None:
674                         return filetime
675                 try:
676                         os.utime(filename, (time.time(), filetime))
677                 except:
678                         pass
679                 return filetime
680
681         def report_writedescription(self, descfn):
682                 """ Report that the description file is being written """
683                 self.to_screen(u'[info] Writing video description to: ' + descfn)
684
685         def report_writesubtitles(self, srtfn):
686                 """ Report that the subtitles file is being written """
687                 self.to_screen(u'[info] Writing video subtitles to: ' + srtfn)
688
689         def report_writeinfojson(self, infofn):
690                 """ Report that the metadata file has been written """
691                 self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
692
693         def report_destination(self, filename):
694                 """Report destination filename."""
695                 self.to_screen(u'[download] Destination: ' + filename)
696
697         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
698                 """Report download progress."""
699                 if self.params.get('noprogress', False):
700                         return
701                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
702                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
703                 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
704                                 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
705
706         def report_resuming_byte(self, resume_len):
707                 """Report attempt to resume at given byte."""
708                 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
709
710         def report_retry(self, count, retries):
711                 """Report retry in case of HTTP error 5xx"""
712                 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
713
714         def report_file_already_downloaded(self, file_name):
715                 """Report file has already been fully downloaded."""
716                 try:
717                         self.to_screen(u'[download] %s has already been downloaded' % file_name)
718                 except (UnicodeEncodeError), err:
719                         self.to_screen(u'[download] The file has already been downloaded')
720
721         def report_unable_to_resume(self):
722                 """Report it was impossible to resume download."""
723                 self.to_screen(u'[download] Unable to resume')
724
725         def report_finish(self):
726                 """Report download finished."""
727                 if self.params.get('noprogress', False):
728                         self.to_screen(u'[download] Download completed')
729                 else:
730                         self.to_screen(u'')
731
732         def increment_downloads(self):
733                 """Increment the ordinal that assigns a number to each file."""
734                 self._num_downloads += 1
735
736         def prepare_filename(self, info_dict):
737                 """Generate the output filename."""
738                 try:
739                         template_dict = dict(info_dict)
740                         template_dict['epoch'] = unicode(long(time.time()))
741                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
742                         filename = self.params['outtmpl'] % template_dict
743                         return filename
744                 except (ValueError, KeyError), err:
745                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
746                         return None
747
748         def _match_entry(self, info_dict):
749                 """ Returns None iff the file should be downloaded """
750
751                 title = info_dict['title']
752                 matchtitle = self.params.get('matchtitle', False)
753                 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
754                         return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
755                 rejecttitle = self.params.get('rejecttitle', False)
756                 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
757                         return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
758                 return None
759
760         def process_info(self, info_dict):
761                 """Process a single dictionary returned by an InfoExtractor."""
762
763                 reason = self._match_entry(info_dict)
764                 if reason is not None:
765                         self.to_screen(u'[download] ' + reason)
766                         return
767
768                 max_downloads = self.params.get('max_downloads')
769                 if max_downloads is not None:
770                         if self._num_downloads > int(max_downloads):
771                                 raise MaxDownloadsReached()
772
773                 filename = self.prepare_filename(info_dict)
774                 
775                 # Forced printings
776                 if self.params.get('forcetitle', False):
777                         print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
778                 if self.params.get('forceurl', False):
779                         print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
780                 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
781                         print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
782                 if self.params.get('forcedescription', False) and 'description' in info_dict:
783                         print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
784                 if self.params.get('forcefilename', False) and filename is not None:
785                         print filename.encode(preferredencoding(), 'xmlcharrefreplace')
786                 if self.params.get('forceformat', False):
787                         print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
788
789                 # Do nothing else if in simulate mode
790                 if self.params.get('simulate', False):
791                         return
792
793                 if filename is None:
794                         return
795
796                 try:
797                         dn = os.path.dirname(_encodeFilename(filename))
798                         if dn != '' and not os.path.exists(dn): # dn is already encoded
799                                 os.makedirs(dn)
800                 except (OSError, IOError), err:
801                         self.trouble(u'ERROR: unable to create directory ' + unicode(err))
802                         return
803
804                 if self.params.get('writedescription', False):
805                         try:
806                                 descfn = filename + u'.description'
807                                 self.report_writedescription(descfn)
808                                 descfile = open(_encodeFilename(descfn), 'wb')
809                                 try:
810                                         descfile.write(info_dict['description'].encode('utf-8'))
811                                 finally:
812                                         descfile.close()
813                         except (OSError, IOError):
814                                 self.trouble(u'ERROR: Cannot write description file ' + descfn)
815                                 return
816                                 
817                 if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']:
818                         # subtitles download errors are already managed as troubles in relevant IE
819                         # that way it will silently go on when used with unsupporting IE 
820                         try:
821                                 srtfn = filename.rsplit('.', 1)[0] + u'.srt'
822                                 self.report_writesubtitles(srtfn)
823                                 srtfile = open(_encodeFilename(srtfn), 'wb')
824                                 try:
825                                         srtfile.write(info_dict['subtitles'].encode('utf-8'))
826                                 finally:
827                                         srtfile.close()
828                         except (OSError, IOError):
829                                 self.trouble(u'ERROR: Cannot write subtitles file ' + descfn)
830                                 return
831
832                 if self.params.get('writeinfojson', False):
833                         infofn = filename + u'.info.json'
834                         self.report_writeinfojson(infofn)
835                         try:
836                                 json.dump
837                         except (NameError,AttributeError):
838                                 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
839                                 return
840                         try:
841                                 infof = open(_encodeFilename(infofn), 'wb')
842                                 try:
843                                         json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
844                                         json.dump(json_info_dict, infof)
845                                 finally:
846                                         infof.close()
847                         except (OSError, IOError):
848                                 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
849                                 return
850
851                 if not self.params.get('skip_download', False):
852                         if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
853                                 success = True
854                         else:
855                                 try:
856                                         success = self._do_download(filename, info_dict)
857                                 except (OSError, IOError), err:
858                                         raise UnavailableVideoError
859                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
860                                         self.trouble(u'ERROR: unable to download video data: %s' % str(err))
861                                         return
862                                 except (ContentTooShortError, ), err:
863                                         self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
864                                         return
865         
866                         if success:
867                                 try:
868                                         self.post_process(filename, info_dict)
869                                 except (PostProcessingError), err:
870                                         self.trouble(u'ERROR: postprocessing: %s' % str(err))
871                                         return
872
873         def download(self, url_list):
874                 """Download a given list of URLs."""
875                 if len(url_list) > 1 and self.fixed_template():
876                         raise SameFileError(self.params['outtmpl'])
877
878                 for url in url_list:
879                         suitable_found = False
880                         for ie in self._ies:
881                                 # Go to next InfoExtractor if not suitable
882                                 if not ie.suitable(url):
883                                         continue
884
885                                 # Suitable InfoExtractor found
886                                 suitable_found = True
887
888                                 # Extract information from URL and process it
889                                 ie.extract(url)
890
891                                 # Suitable InfoExtractor had been found; go to next URL
892                                 break
893
894                         if not suitable_found:
895                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
896
897                 return self._download_retcode
898
899         def post_process(self, filename, ie_info):
900                 """Run the postprocessing chain on the given file."""
901                 info = dict(ie_info)
902                 info['filepath'] = filename
903                 for pp in self._pps:
904                         info = pp.run(info)
905                         if info is None:
906                                 break
907
	def _download_with_rtmpdump(self, filename, url, player_url):
		"""Download an rtmp:// stream by shelling out to rtmpdump.

		Returns True on success and False on failure (already reported via
		trouble()). Resumes in a loop while rtmpdump keeps making progress.
		"""
		self.report_destination(filename)
		tmpfilename = self.temp_name(filename)

		# Check for rtmpdump first
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrumpted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		# The [[], [...]][cond] construct is an old-style conditional: it
		# appends the optional flags only when cond is true.
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
		args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
		if self.params.get('verbose', False):
			try:
				import pipes
				shell_quote = lambda args: ' '.join(map(pipes.quote, args))
			except ImportError:
				shell_quote = repr
			self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
		retval = subprocess.call(args)
		# Keep resuming (exit codes 1 and 2) while the file keeps growing.
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(_encodeFilename(tmpfilename))
			self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(_encodeFilename(tmpfilename))
			# No progress + exit code 1: give up (handled as failure below).
			if prevsize == cursize and retval == 1:
				break
			 # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
			if prevsize == cursize and retval == 2 and cursize > 1024:
				self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
				retval = 0
				break
		if retval == 0:
			self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
			self.try_rename(tmpfilename, filename)
			return True
		else:
			self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
			return False
952
	def _do_download(self, filename, info_dict):
		"""Download info_dict['url'] to filename.

		Dispatches rtmp:// URLs to rtmpdump; everything else is fetched
		over HTTP with resume (Range) and retry support. Returns True on
		success (including "already fully downloaded"), False on a failure
		that was already reported via trouble().
		"""
		url = info_dict['url']
		player_url = info_dict.get('player_url', None)

		# Check file already present
		if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
			self.report_file_already_downloaded(filename)
			return True

		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url, player_url)

		tmpfilename = self.temp_name(filename)
		stream = None  # opened lazily once the first data block arrives

		# Do not include the Accept-Encoding header
		headers = {'Youtubedl-no-compression': 'True'}
		# basic_request never gets a Range header; it is used to re-open
		# the connection from scratch when resuming fails (HTTP 416).
		basic_request = urllib2.Request(url, None, headers)
		request = urllib2.Request(url, None, headers)

		# Establish possible resume length
		if os.path.isfile(_encodeFilename(tmpfilename)):
			resume_len = os.path.getsize(_encodeFilename(tmpfilename))
		else:
			resume_len = 0

		open_mode = 'wb'
		if resume_len != 0:
			if self.params.get('continuedl', False):
				self.report_resuming_byte(resume_len)
				request.add_header('Range','bytes=%d-' % resume_len)
				open_mode = 'ab'
			else:
				resume_len = 0

		count = 0
		retries = self.params.get('retries', 0)
		while count <= retries:
			# Establish connection
			try:
				if count == 0 and 'urlhandle' in info_dict:
					data = info_dict['urlhandle']
				# NOTE(review): the assignment above is immediately
				# overwritten here, so a pre-opened 'urlhandle' is never
				# actually reused — looks like a bug; confirm intent
				# before changing.
				data = urllib2.urlopen(request)
				break
			except (urllib2.HTTPError, ), err:
				if (err.code < 500 or err.code >= 600) and err.code != 416:
					# Unexpected HTTP error
					raise
				elif err.code == 416:
					# Unable to resume (requested range not satisfiable)
					try:
						# Open the connection again without the range header
						data = urllib2.urlopen(basic_request)
						content_length = data.info()['Content-Length']
					except (urllib2.HTTPError, ), err:
						if err.code < 500 or err.code >= 600:
							raise
					else:
						# Examine the reported length
						if (content_length is not None and
								(resume_len - 100 < long(content_length) < resume_len + 100)):
							# The file had already been fully downloaded.
							# Explanation to the above condition: in issue #175 it was revealed that
							# YouTube sometimes adds or removes a few bytes from the end of the file,
							# changing the file size slightly and causing problems for some users. So
							# I decided to implement a suggested change and consider the file
							# completely downloaded if the file size differs less than 100 bytes from
							# the one in the hard drive.
							self.report_file_already_downloaded(filename)
							self.try_rename(tmpfilename, filename)
							return True
						else:
							# The length does not match, we start the download over
							self.report_unable_to_resume()
							open_mode = 'wb'
							break
			# Retry
			count += 1
			if count <= retries:
				self.report_retry(count, retries)

		if count > retries:
			self.trouble(u'ERROR: giving up after %s retries' % retries)
			return False

		data_len = data.info().get('Content-length', None)
		if data_len is not None:
			# When resuming, Content-length covers only the remaining bytes.
			data_len = long(data_len) + resume_len
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0 + resume_len
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			if len(data_block) == 0:
				break
			byte_counter += len(data_block)

			# Open file just in time
			if stream is None:
				try:
					# sanitize_open may alter the filename; mirror that back
					# onto the final (non-temporary) destination name.
					(stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
					assert stream is not None
					filename = self.undo_temp_name(tmpfilename)
					self.report_destination(filename)
				except (OSError, IOError), err:
					self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
					return False
			try:
				stream.write(data_block)
			except (IOError, OSError), err:
				self.trouble(u'\nERROR: unable to write data: %s' % str(err))
				return False
			# Adapt the block size to the observed throughput.
			block_size = self.best_block_size(after - before, len(data_block))

			# Progress message
			speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
			if data_len is None:
				self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
			else:
				percent_str = self.calc_percent(byte_counter, data_len)
				eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
				self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter - resume_len)

		if stream is None:
			self.trouble(u'\nERROR: Did not get any data blocks')
			return False
		stream.close()
		self.report_finish()
		if data_len is not None and byte_counter != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		self.try_rename(tmpfilename, filename)

		# Update file modification time
		if self.params.get('updatetime', True):
			info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))

		return True
1098
1099
class InfoExtractor(object):
	"""Base class for all information extractors.

	An information extractor (IE) takes a URL and extracts the data the
	FileDownloader needs: the real media URL, the video title and
	simplified title, the uploader and so on. The result is handed to the
	FileDownloader as a dictionary, which it may use to download the video
	to the file system, among other possible outcomes. The dictionaries
	must include the following fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	The following fields are optional. Their primary purpose is to allow
	youtube-dl to serve as the backend for a video search function, such
	as the one in youtube2mp3. They are only used when their respective
	forced printing functions are called:

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Subclasses should re-define the _real_initialize() and
	_real_extract() methods, define a _VALID_URL regexp, and probably be
	added to the list of extractors.
	"""

	# Whether _real_initialize() has already run for this instance.
	_ready = False
	# The FileDownloader this IE reports to (set via set_downloader()).
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Receives a URL and returns True if suitable for this IE."""
		match = re.match(self._VALID_URL, url)
		return match is not None

	def initialize(self):
		"""Initializes an instance (authentication, etc). Runs only once."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
1168
1169
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Accepts watch/embed/e/v URLs, youtu.be short links, youtube-nocookie
	# domains and bare video ids; excludes playlist/artist pages (handled
	# by other extractors). The final group captures the video id.
	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
	# Forces the site language to English so scraped pages have a
	# predictable layout.
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	# Machine name used to look up credentials in ~/.netrc.
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
	# Same ordering but preferring the free (WebM) format at each quality.
	_available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
	# Maps YouTube 'fmt' codes to file extensions; anything missing
	# defaults to 'flv' elsewhere.
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'44': 'webm',
		'45': 'webm',
	}
	# Maps 'fmt' codes to display dimensions (informational only, used by
	# --list-formats).
	_video_dimensions = {
		'5': '240x400',
		'6': '???',
		'13': '???',
		'17': '144x176',
		'18': '360x640',
		'22': '720x1280',
		'34': '360x640',
		'35': '480x854',
		'37': '1080x1920',
		'38': '3072x4096',
		'43': '360x640',
		'44': '480x854',
		'45': '720x1280',
	}
	IE_NAME = u'youtube'
1208
1209         def report_lang(self):
1210                 """Report attempt to set language."""
1211                 self._downloader.to_screen(u'[youtube] Setting language')
1212
1213         def report_login(self):
1214                 """Report attempt to log in."""
1215                 self._downloader.to_screen(u'[youtube] Logging in')
1216
1217         def report_age_confirmation(self):
1218                 """Report attempt to confirm age."""
1219                 self._downloader.to_screen(u'[youtube] Confirming age')
1220
1221         def report_video_webpage_download(self, video_id):
1222                 """Report attempt to download video webpage."""
1223                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1224
1225         def report_video_info_webpage_download(self, video_id):
1226                 """Report attempt to download video info webpage."""
1227                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1228
	def report_video_subtitles_download(self, video_id):
		"""Report attempt to download the video's closed-caption subtitles."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
1232
1233         def report_information_extraction(self, video_id):
1234                 """Report attempt to extract video information."""
1235                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1236
	def report_unavailable_format(self, video_id, format):
		"""Report that the requested format is not available for this video."""
		self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1240
1241         def report_rtmp_download(self):
1242                 """Indicate the download will use the RTMP protocol."""
1243                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1244
1245         def _closed_captions_xml_to_srt(self, xml_string):
1246                 srt = ''
1247                 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
1248                 # TODO parse xml instead of regex
1249                 for n, (start, dur_tag, dur, caption) in enumerate(texts):
1250                         if not dur: dur = '4'
1251                         start = float(start)
1252                         end = start + float(dur)
1253                         start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
1254                         end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
1255                         caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
1256                         caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, inentional
1257                         srt += str(n) + '\n'
1258                         srt += start + ' --> ' + end + '\n'
1259                         srt += caption + '\n\n'
1260                 return srt
1261
1262         def _print_formats(self, formats):
1263                 print 'Available formats:'
1264                 for x in formats:
1265                         print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1266
	def _real_initialize(self):
		"""Prepare the YouTube session: set language, then optionally log in
		and confirm age.

		Credentials come from the downloader params or, with --netrc, from
		the user's .netrc file.  All failures short of age confirmation are
		reported as warnings and abort initialization without raising.
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language
		# The response body is discarded; the request only exists so the
		# server associates an English-language preference with our cookies.
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':		'/',
				'action_login':	'Log In',
				'username':	username,
				'password':	password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the login form is still present in the response, the
			# credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':		'/',
				'action_confirm':	'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return
1335
	def _real_extract(self, url):
		"""Extract and process video information for a YouTube URL.

		Downloads the watch page and the get_video_info data, collects
		metadata (uploader, title, thumbnail, upload date, description,
		optional subtitles), selects the format(s) matching the user's
		request, and hands each resulting video off to the downloader.
		Errors are reported via self._downloader.trouble and abort the
		extraction.
		"""
		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group(2)

		# Get video webpage
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
		try:
			video_webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# Attempt to extract SWF player URL
		# The URL appears JSON-escaped in the page; unescape the backslashes.
		mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
		if mobj is not None:
			player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
		else:
			player_url = None

		# Get video info
		# Try several 'el' page types in turn; stop at the first response
		# that contains a 'token' parameter.
		self.report_video_info_webpage_download(video_id)
		for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
			video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
					% (video_id, el_type))
			request = urllib2.Request(video_info_url)
			try:
				video_info_webpage = urllib2.urlopen(request).read()
				video_info = parse_qs(video_info_webpage)
				if 'token' in video_info:
					break
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
				return
		if 'token' not in video_info:
			if 'reason' in video_info:
				self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
			else:
				self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
			return

		# Start extracting information
		self.report_information_extraction(video_id)

		# uploader
		if 'author' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = urllib.unquote_plus(video_info['author'][0])

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = urllib.unquote_plus(video_info['title'][0])
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		# simplified title
		simple_title = _simplify_title(video_title)

		# thumbnail image
		if 'thumbnail_url' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:	# don't panic if we can't find it
			video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

		# upload date
		# Scraped from the watch page and normalized to YYYYMMDD; left as
		# u'NA' when the date cannot be found or parsed.
		upload_date = u'NA'
		mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
		if mobj is not None:
			upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
			format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
			for expression in format_expressions:
				try:
					upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
				except:
					pass

		# description
		# Prefer lxml when it imported successfully at module load; fall
		# back to a regex over the meta description tag otherwise.
		try:
			lxml.etree
		except NameError:
			video_description = u'No description available.'
			mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
			if mobj is not None:
				video_description = mobj.group(1).decode('utf-8')
		else:
			html_parser = lxml.etree.HTMLParser(encoding='utf-8')
			vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
			video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
			# TODO use another parser

		# closed captions
		# Only fetched when --write-srt was requested; failures are
		# warnings, never fatal.
		video_subtitles = None
		if self._downloader.params.get('writesubtitles', False):
			self.report_video_subtitles_download(video_id)
			request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
			try:
				srt_list = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
			else:
				srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
				if srt_lang_list:
					if 'en' in srt_lang_list: srt_lang = 'en'
					else: srt_lang = srt_lang_list[0] # TODO choose better and provide an override
					request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
					try:
						srt_xml = urllib2.urlopen(request).read()
					except (urllib2.URLError, httplib.HTTPException, socket.error), err:
						self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
					else:
						video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
				else:
					self._downloader.trouble(u'WARNING: video has no subtitles')

		# token
		video_token = urllib.unquote_plus(video_info['token'][0])

		# Decide which formats to download
		req_format = self._downloader.params.get('format', None)

		if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
			self.report_rtmp_download()
			video_url_list = [(None, video_info['conn'][0])]
		elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
			url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
			url_data = [parse_qs(uds) for uds in url_data_strs]
			url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
			url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

			format_limit = self._downloader.params.get('format_limit', None)
			available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
			if format_limit is not None and format_limit in available_formats:
				format_list = available_formats[available_formats.index(format_limit):]
			else:
				format_list = available_formats
			# Keep only formats the server actually offered, in preference order.
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
				return
			if self._downloader.params.get('listformats', None):
				self._print_formats(existing_formats)
				return
			if req_format is None or req_format == 'best':
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif req_format == 'worst':
				video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
			elif req_format in ('-1', 'all'):
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
			else:
				# Specific formats. We pick the first in a slash-delimeted sequence.
				# For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
				req_formats = req_format.split('/')
				video_url_list = None
				for rf in req_formats:
					if rf in url_map:
						video_url_list = [(rf, url_map[rf])]
						break
				if video_url_list is None:
					self._downloader.trouble(u'ERROR: requested format not available')
					return
		else:
			self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
			return

		for format_param, video_real_url in video_url_list:
			# At this point we have a new video
			self._downloader.increment_downloads()

			# Extension
			video_extension = self._video_extensions.get(format_param, 'flv')

			try:
				# Process video information
				self._downloader.process_info({
					'id':		video_id.decode('utf-8'),
					'url':		video_real_url.decode('utf-8'),
					'uploader':	video_uploader.decode('utf-8'),
					'upload_date':	upload_date,
					'title':	video_title,
					'stitle':	simple_title,
					'ext':		video_extension.decode('utf-8'),
					'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
					'thumbnail':	video_thumbnail.decode('utf-8'),
					'description':	video_description,
					'player_url':	player_url,
					'subtitles':	video_subtitles
				})
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
1533
1534
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com."""

	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	# Family-filter disclaimer page, fetched once to obtain session cookies.
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
	# Endpoint that accepts the age/filter confirmation POST.
	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
	_youtube_ie = None
	IE_NAME = u'metacafe'

	def __init__(self, youtube_ie, downloader=None):
		# A YoutubeIE instance is required so 'yt-' prefixed Metacafe ids
		# can be delegated to the YouTube extractor.
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	def report_disclaimer(self):
		"""Report disclaimer retrieval."""
		self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[metacafe] Confirming age')

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

	def _real_initialize(self):
		"""Fetch the disclaimer page and confirm age so filtered videos
		become accessible for the rest of the session."""
		# Retrieve disclaimer
		request = urllib2.Request(self._DISCLAIMER)
		try:
			self.report_disclaimer()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
			return

		# Confirm age
		disclaimer_form = {
			'filters': '0',
			'submit': "Continue - I'm over 18",
			}
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
		try:
			self.report_age_confirmation()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract and process video information for a Metacafe URL.

		Videos whose id starts with 'yt-' are re-hosted YouTube videos
		and are delegated to the YouTube extractor.
		"""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		# Check if video comes from YouTube
		mobj2 = re.match(r'^yt-(.*)$', video_id)
		if mobj2 is not None:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
			return

		# At this point we have a new video
		self._downloader.increment_downloads()

		simple_title = mobj.group(2).decode('utf-8')

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
		if mobj is not None:
			mediaURL = urllib.unquote(mobj.group(1))
			video_extension = mediaURL[-3:]

			# Extract gdaKey if available
			mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
			if mobj is None:
				video_url = mediaURL
			else:
				gdaKey = mobj.group(1)
				video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
		else:
			# Newer pages embed the media URL inside the flashvars blob;
			# parse it as a query string and pull mediaData out of it.
			mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			vardict = parse_qs(mobj.group(1))
			if 'mediaData' not in vardict:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			mediaURL = mobj.group(1).replace('\\/', '/')
			video_extension = mediaURL[-3:]
			video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

		mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1675
1676
class DailymotionIE(InfoExtractor):
	"""Information Extractor for Dailymotion"""

	_VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
	IE_NAME = u'dailymotion'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

	def _real_extract(self, url):
		"""Extract and process video information for a Dailymotion URL.

		Downloads the video page (with the family filter disabled via a
		cookie), locates the SD media URL inside the player's 'sequence'
		variable, and hands the video off to the downloader.
		"""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		video_extension = 'flv'

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url)
		# Disable the family filter so age-restricted pages are served.
		request.add_header('Cookie', 'family_filter=off')
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		sequence = urllib.unquote(mobj.group(1))
		mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

		# if needed add http://www.dailymotion.com/ if relative URL

		video_url = mediaURL

		mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
		video_title = sanitize_title(video_title)
		simple_title = _simplify_title(video_title)

		mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1763
1764
1765 class GoogleIE(InfoExtractor):
1766         """Information extractor for video.google.com."""
1767
1768         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1769         IE_NAME = u'video.google'
1770
	def __init__(self, downloader=None):
		# Delegate common setup (downloader wiring) to the base class.
		InfoExtractor.__init__(self, downloader)
1773
1774         def report_download_webpage(self, video_id):
1775                 """Report webpage download."""
1776                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1777
1778         def report_extraction(self, video_id):
1779                 """Report information extraction."""
1780                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1781
1782         def _real_extract(self, url):
1783                 # Extract id from URL
1784                 mobj = re.match(self._VALID_URL, url)
1785                 if mobj is None:
1786                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1787                         return
1788
1789                 # At this point we have a new video
1790                 self._downloader.increment_downloads()
1791                 video_id = mobj.group(1)
1792
1793                 video_extension = 'mp4'
1794
1795                 # Retrieve video webpage to extract further information
1796                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1797                 try:
1798                         self.report_download_webpage(video_id)
1799                         webpage = urllib2.urlopen(request).read()
1800                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1801                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1802                         return
1803
1804                 # Extract URL, uploader, and title from webpage
1805                 self.report_extraction(video_id)
1806                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1807                 if mobj is None:
1808                         video_extension = 'flv'
1809                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1810                 if mobj is None:
1811                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1812                         return
1813                 mediaURL = urllib.unquote(mobj.group(1))
1814                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1815                 mediaURL = mediaURL.replace('\\x26', '\x26')
1816
1817                 video_url = mediaURL
1818
1819                 mobj = re.search(r'<title>(.*)</title>', webpage)
1820                 if mobj is None:
1821                         self._downloader.trouble(u'ERROR: unable to extract title')
1822                         return
1823                 video_title = mobj.group(1).decode('utf-8')
1824                 video_title = sanitize_title(video_title)
1825                 simple_title = _simplify_title(video_title)
1826
1827                 # Extract video description
1828                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1829                 if mobj is None:
1830                         self._downloader.trouble(u'ERROR: unable to extract video description')
1831                         return
1832                 video_description = mobj.group(1).decode('utf-8')
1833                 if not video_description:
1834                         video_description = 'No description available.'
1835
1836                 # Extract video thumbnail
1837                 if self._downloader.params.get('forcethumbnail', False):
1838                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1839                         try:
1840                                 webpage = urllib2.urlopen(request).read()
1841                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1842                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1843                                 return
1844                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1845                         if mobj is None:
1846                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1847                                 return
1848                         video_thumbnail = mobj.group(1)
1849                 else:   # we need something to pass to process_info
1850                         video_thumbnail = ''
1851
1852                 try:
1853                         # Process video information
1854                         self._downloader.process_info({
1855                                 'id':           video_id.decode('utf-8'),
1856                                 'url':          video_url.decode('utf-8'),
1857                                 'uploader':     u'NA',
1858                                 'upload_date':  u'NA',
1859                                 'title':        video_title,
1860                                 'stitle':       simple_title,
1861                                 'ext':          video_extension.decode('utf-8'),
1862                                 'format':       u'NA',
1863                                 'player_url':   None,
1864                         })
1865                 except UnavailableVideoError:
1866                         self._downloader.trouble(u'\nERROR: unable to download video')
1867
1868
1869 class PhotobucketIE(InfoExtractor):
1870         """Information extractor for photobucket.com."""
1871
1872         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1873         IE_NAME = u'photobucket'
1874
1875         def __init__(self, downloader=None):
1876                 InfoExtractor.__init__(self, downloader)
1877
1878         def report_download_webpage(self, video_id):
1879                 """Report webpage download."""
1880                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1881
1882         def report_extraction(self, video_id):
1883                 """Report information extraction."""
1884                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1885
1886         def _real_extract(self, url):
1887                 # Extract id from URL
1888                 mobj = re.match(self._VALID_URL, url)
1889                 if mobj is None:
1890                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1891                         return
1892
1893                 # At this point we have a new video
1894                 self._downloader.increment_downloads()
1895                 video_id = mobj.group(1)
1896
1897                 video_extension = 'flv'
1898
1899                 # Retrieve video webpage to extract further information
1900                 request = urllib2.Request(url)
1901                 try:
1902                         self.report_download_webpage(video_id)
1903                         webpage = urllib2.urlopen(request).read()
1904                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1905                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1906                         return
1907
1908                 # Extract URL, uploader, and title from webpage
1909                 self.report_extraction(video_id)
1910                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1911                 if mobj is None:
1912                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1913                         return
1914                 mediaURL = urllib.unquote(mobj.group(1))
1915
1916                 video_url = mediaURL
1917
1918                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1919                 if mobj is None:
1920                         self._downloader.trouble(u'ERROR: unable to extract title')
1921                         return
1922                 video_title = mobj.group(1).decode('utf-8')
1923                 video_title = sanitize_title(video_title)
1924                 simple_title = _simplify_title(vide_title)
1925
1926                 video_uploader = mobj.group(2).decode('utf-8')
1927
1928                 try:
1929                         # Process video information
1930                         self._downloader.process_info({
1931                                 'id':           video_id.decode('utf-8'),
1932                                 'url':          video_url.decode('utf-8'),
1933                                 'uploader':     video_uploader,
1934                                 'upload_date':  u'NA',
1935                                 'title':        video_title,
1936                                 'stitle':       simple_title,
1937                                 'ext':          video_extension.decode('utf-8'),
1938                                 'format':       u'NA',
1939                                 'player_url':   None,
1940                         })
1941                 except UnavailableVideoError:
1942                         self._downloader.trouble(u'\nERROR: unable to download video')
1943
1944
1945 class YahooIE(InfoExtractor):
1946         """Information extractor for video.yahoo.com."""
1947
1948         # _VALID_URL matches all Yahoo! Video URLs
1949         # _VPAGE_URL matches only the extractable '/watch/' URLs
1950         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1951         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1952         IE_NAME = u'video.yahoo'
1953
1954         def __init__(self, downloader=None):
1955                 InfoExtractor.__init__(self, downloader)
1956
1957         def report_download_webpage(self, video_id):
1958                 """Report webpage download."""
1959                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1960
1961         def report_extraction(self, video_id):
1962                 """Report information extraction."""
1963                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1964
1965         def _real_extract(self, url, new_video=True):
1966                 # Extract ID from URL
1967                 mobj = re.match(self._VALID_URL, url)
1968                 if mobj is None:
1969                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1970                         return
1971
1972                 # At this point we have a new video
1973                 self._downloader.increment_downloads()
1974                 video_id = mobj.group(2)
1975                 video_extension = 'flv'
1976
1977                 # Rewrite valid but non-extractable URLs as
1978                 # extractable English language /watch/ URLs
1979                 if re.match(self._VPAGE_URL, url) is None:
1980                         request = urllib2.Request(url)
1981                         try:
1982                                 webpage = urllib2.urlopen(request).read()
1983                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1984                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1985                                 return
1986
1987                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1988                         if mobj is None:
1989                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1990                                 return
1991                         yahoo_id = mobj.group(1)
1992
1993                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1994                         if mobj is None:
1995                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1996                                 return
1997                         yahoo_vid = mobj.group(1)
1998
1999                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
2000                         return self._real_extract(url, new_video=False)
2001
2002                 # Retrieve video webpage to extract further information
2003                 request = urllib2.Request(url)
2004                 try:
2005                         self.report_download_webpage(video_id)
2006                         webpage = urllib2.urlopen(request).read()
2007                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2008                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2009                         return
2010
2011                 # Extract uploader and title from webpage
2012                 self.report_extraction(video_id)
2013                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
2014                 if mobj is None:
2015                         self._downloader.trouble(u'ERROR: unable to extract video title')
2016                         return
2017                 video_title = mobj.group(1).decode('utf-8')
2018                 simple_title = _simplify_title(video_title)
2019
2020                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
2021                 if mobj is None:
2022                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
2023                         return
2024                 video_uploader = mobj.group(1).decode('utf-8')
2025
2026                 # Extract video thumbnail
2027                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
2028                 if mobj is None:
2029                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2030                         return
2031                 video_thumbnail = mobj.group(1).decode('utf-8')
2032
2033                 # Extract video description
2034                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
2035                 if mobj is None:
2036                         self._downloader.trouble(u'ERROR: unable to extract video description')
2037                         return
2038                 video_description = mobj.group(1).decode('utf-8')
2039                 if not video_description:
2040                         video_description = 'No description available.'
2041
2042                 # Extract video height and width
2043                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
2044                 if mobj is None:
2045                         self._downloader.trouble(u'ERROR: unable to extract video height')
2046                         return
2047                 yv_video_height = mobj.group(1)
2048
2049                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
2050                 if mobj is None:
2051                         self._downloader.trouble(u'ERROR: unable to extract video width')
2052                         return
2053                 yv_video_width = mobj.group(1)
2054
2055                 # Retrieve video playlist to extract media URL
2056                 # I'm not completely sure what all these options are, but we
2057                 # seem to need most of them, otherwise the server sends a 401.
2058                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
2059                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
2060                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
2061                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
2062                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
2063                 try:
2064                         self.report_download_webpage(video_id)
2065                         webpage = urllib2.urlopen(request).read()
2066                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2067                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2068                         return
2069
2070                 # Extract media URL from playlist XML
2071                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
2072                 if mobj is None:
2073                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
2074                         return
2075                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
2076                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
2077
2078                 try:
2079                         # Process video information
2080                         self._downloader.process_info({
2081                                 'id':           video_id.decode('utf-8'),
2082                                 'url':          video_url,
2083                                 'uploader':     video_uploader,
2084                                 'upload_date':  u'NA',
2085                                 'title':        video_title,
2086                                 'stitle':       simple_title,
2087                                 'ext':          video_extension.decode('utf-8'),
2088                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2089                                 'description':  video_description,
2090                                 'thumbnail':    video_thumbnail,
2091                                 'player_url':   None,
2092                         })
2093                 except UnavailableVideoError:
2094                         self._downloader.trouble(u'\nERROR: unable to download video')
2095
2096
2097 class VimeoIE(InfoExtractor):
2098         """Information extractor for vimeo.com."""
2099
2100         # _VALID_URL matches Vimeo URLs
2101         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2102         IE_NAME = u'vimeo'
2103
2104         def __init__(self, downloader=None):
2105                 InfoExtractor.__init__(self, downloader)
2106
2107         def report_download_webpage(self, video_id):
2108                 """Report webpage download."""
2109                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2110
2111         def report_extraction(self, video_id):
2112                 """Report information extraction."""
2113                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
2114
2115         def _real_extract(self, url, new_video=True):
2116                 # Extract ID from URL
2117                 mobj = re.match(self._VALID_URL, url)
2118                 if mobj is None:
2119                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2120                         return
2121
2122                 # At this point we have a new video
2123                 self._downloader.increment_downloads()
2124                 video_id = mobj.group(1)
2125
2126                 # Retrieve video webpage to extract further information
2127                 request = urllib2.Request(url, None, std_headers)
2128                 try:
2129                         self.report_download_webpage(video_id)
2130                         webpage = urllib2.urlopen(request).read()
2131                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2132                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2133                         return
2134
2135                 # Now we begin extracting as much information as we can from what we
2136                 # retrieved. First we extract the information common to all extractors,
2137                 # and latter we extract those that are Vimeo specific.
2138                 self.report_extraction(video_id)
2139
2140                 # Extract the config JSON
2141                 config = webpage.split(' = {config:')[1].split(',assets:')[0]
2142                 try:
2143                         config = json.loads(config)
2144                 except:
2145                         self._downloader.trouble(u'ERROR: unable to extract info section')
2146                         return
2147                 
2148                 # Extract title
2149                 video_title = config["video"]["title"]
2150                 simple_title = _simplify_title(video_title)
2151
2152                 # Extract uploader
2153                 video_uploader = config["video"]["owner"]["name"]
2154
2155                 # Extract video thumbnail
2156                 video_thumbnail = config["video"]["thumbnail"]
2157
2158                 # Extract video description
2159                 try:
2160                         lxml.etree
2161                 except NameError:
2162                         video_description = u'No description available.'
2163                         mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
2164                         if mobj is not None:
2165                                 video_description = mobj.group(1)
2166                 else:
2167                         html_parser = lxml.etree.HTMLParser()
2168                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
2169                         video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
2170                         # TODO use another parser
2171
2172                 # Extract upload date
2173                 video_upload_date = u'NA'
2174                 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
2175                 if mobj is not None:
2176                         video_upload_date = mobj.group(1)
2177
2178                 # Vimeo specific: extract request signature and timestamp
2179                 sig = config['request']['signature']
2180                 timestamp = config['request']['timestamp']
2181
2182                 # Vimeo specific: extract video codec and quality information
2183                 # TODO bind to format param
2184                 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
2185                 for codec in codecs:
2186                         if codec[0] in config["video"]["files"]:
2187                                 video_codec = codec[0]
2188                                 video_extension = codec[1]
2189                                 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
2190                                 else: quality = 'sd'
2191                                 break
2192                 else:
2193                         self._downloader.trouble(u'ERROR: no known codec found')
2194                         return
2195
2196                 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
2197                                         %(video_id, sig, timestamp, quality, video_codec.upper())
2198
2199                 try:
2200                         # Process video information
2201                         self._downloader.process_info({
2202                                 'id':           video_id,
2203                                 'url':          video_url,
2204                                 'uploader':     video_uploader,
2205                                 'upload_date':  video_upload_date,
2206                                 'title':        video_title,
2207                                 'stitle':       simple_title,
2208                                 'ext':          video_extension,
2209                                 'thumbnail':    video_thumbnail,
2210                                 'description':  video_description,
2211                                 'player_url':   None,
2212                         })
2213                 except UnavailableVideoError:
2214                         self._downloader.trouble(u'ERROR: unable to download video')
2215
2216
2217 class GenericIE(InfoExtractor):
2218         """Generic last-resort information extractor."""
2219
2220         _VALID_URL = r'.*'
2221         IE_NAME = u'generic'
2222
2223         def __init__(self, downloader=None):
2224                 InfoExtractor.__init__(self, downloader)
2225
2226         def report_download_webpage(self, video_id):
2227                 """Report webpage download."""
2228                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2229                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2230
2231         def report_extraction(self, video_id):
2232                 """Report information extraction."""
2233                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2234
2235         def _real_extract(self, url):
2236                 # At this point we have a new video
2237                 self._downloader.increment_downloads()
2238
2239                 video_id = url.split('/')[-1]
2240                 request = urllib2.Request(url)
2241                 try:
2242                         self.report_download_webpage(video_id)
2243                         webpage = urllib2.urlopen(request).read()
2244                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2245                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2246                         return
2247                 except ValueError, err:
2248                         # since this is the last-resort InfoExtractor, if
2249                         # this error is thrown, it'll be thrown here
2250                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2251                         return
2252
2253                 self.report_extraction(video_id)
2254                 # Start with something easy: JW Player in SWFObject
2255                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2256                 if mobj is None:
2257                         # Broaden the search a little bit
2258                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2259                 if mobj is None:
2260                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2261                         return
2262
2263                 # It's possible that one of the regexes
2264                 # matched, but returned an empty group:
2265                 if mobj.group(1) is None:
2266                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2267                         return
2268
2269                 video_url = urllib.unquote(mobj.group(1))
2270                 video_id = os.path.basename(video_url)
2271
2272                 # here's a fun little line of code for you:
2273                 video_extension = os.path.splitext(video_id)[1][1:]
2274                 video_id = os.path.splitext(video_id)[0]
2275
2276                 # it's tempting to parse this further, but you would
2277                 # have to take into account all the variations like
2278                 #   Video Title - Site Name
2279                 #   Site Name | Video Title
2280                 #   Video Title - Tagline | Site Name
2281                 # and so on and so forth; it's just not practical
2282                 mobj = re.search(r'<title>(.*)</title>', webpage)
2283                 if mobj is None:
2284                         self._downloader.trouble(u'ERROR: unable to extract title')
2285                         return
2286                 video_title = mobj.group(1).decode('utf-8')
2287                 video_title = sanitize_title(video_title)
2288                 simple_title = _simplify_title(video_title)
2289
2290                 # video uploader is domain name
2291                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2292                 if mobj is None:
2293                         self._downloader.trouble(u'ERROR: unable to extract title')
2294                         return
2295                 video_uploader = mobj.group(1).decode('utf-8')
2296
2297                 try:
2298                         # Process video information
2299                         self._downloader.process_info({
2300                                 'id':           video_id.decode('utf-8'),
2301                                 'url':          video_url.decode('utf-8'),
2302                                 'uploader':     video_uploader,
2303                                 'upload_date':  u'NA',
2304                                 'title':        video_title,
2305                                 'stitle':       simple_title,
2306                                 'ext':          video_extension.decode('utf-8'),
2307                                 'format':       u'NA',
2308                                 'player_url':   None,
2309                         })
2310                 except UnavailableVideoError, err:
2311                         self._downloader.trouble(u'\nERROR: unable to download video')
2312
2313
2314 class YoutubeSearchIE(InfoExtractor):
2315         """Information Extractor for YouTube search queries."""
2316         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2317         _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
2318         _youtube_ie = None
2319         _max_youtube_results = 1000
2320         IE_NAME = u'youtube:search'
2321
2322         def __init__(self, youtube_ie, downloader=None):
2323                 InfoExtractor.__init__(self, downloader)
2324                 self._youtube_ie = youtube_ie
2325
2326         def report_download_page(self, query, pagenum):
2327                 """Report attempt to download playlist page with given number."""
2328                 query = query.decode(preferredencoding())
2329                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2330
2331         def _real_initialize(self):
2332                 self._youtube_ie.initialize()
2333
2334         def _real_extract(self, query):
2335                 mobj = re.match(self._VALID_URL, query)
2336                 if mobj is None:
2337                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2338                         return
2339
2340                 prefix, query = query.split(':')
2341                 prefix = prefix[8:]
2342                 query = query.encode('utf-8')
2343                 if prefix == '':
2344                         self._download_n_results(query, 1)
2345                         return
2346                 elif prefix == 'all':
2347                         self._download_n_results(query, self._max_youtube_results)
2348                         return
2349                 else:
2350                         try:
2351                                 n = long(prefix)
2352                                 if n <= 0:
2353                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2354                                         return
2355                                 elif n > self._max_youtube_results:
2356                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2357                                         n = self._max_youtube_results
2358                                 self._download_n_results(query, n)
2359                                 return
2360                         except ValueError: # parsing prefix as integer fails
2361                                 self._download_n_results(query, 1)
2362                                 return
2363
	def _download_n_results(self, query, n):
		"""Download up to n search results for query and hand each video
		id over to the YouTube extractor.

		query: UTF-8 encoded search string.
		n: requested number of results (effectively capped by the number
		of results the API reports via 'totalItems').
		"""

		video_ids = []
		pagenum = 0
		limit = n

		# Results come in pages of 50; keep fetching while the next page
		# would still start below the (possibly shrunk) limit.
		while (50 * pagenum) < limit:
			self.report_download_page(query, pagenum+1)
			# The +1 makes the start index 1-based.
			result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
			request = urllib2.Request(result_url)
			try:
				data = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
				return
			# NOTE(review): `json` is not among the imports visible at the
			# top of this file -- confirm it is imported elsewhere.
			api_response = json.loads(data)['data']

			new_ids = list(video['id'] for video in api_response['items'])
			video_ids += new_ids

			# Shrink the target if the API has fewer results than requested,
			# so the loop stops once everything available has been fetched.
			limit = min(n, api_response['totalItems'])
			pagenum += 1

		# The last page may have pushed us past n; trim the excess.
		if len(video_ids) > n:
			video_ids = video_ids[:n]
		for id in video_ids:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
		return
2393
2394
2395 class GoogleSearchIE(InfoExtractor):
2396         """Information Extractor for Google Video search queries."""
2397         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2398         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2399         _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
2400         _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
2401         _google_ie = None
2402         _max_google_results = 1000
2403         IE_NAME = u'video.google:search'
2404
2405         def __init__(self, google_ie, downloader=None):
2406                 InfoExtractor.__init__(self, downloader)
2407                 self._google_ie = google_ie
2408
2409         def report_download_page(self, query, pagenum):
2410                 """Report attempt to download playlist page with given number."""
2411                 query = query.decode(preferredencoding())
2412                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2413
2414         def _real_initialize(self):
2415                 self._google_ie.initialize()
2416
2417         def _real_extract(self, query):
2418                 mobj = re.match(self._VALID_URL, query)
2419                 if mobj is None:
2420                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2421                         return
2422
2423                 prefix, query = query.split(':')
2424                 prefix = prefix[8:]
2425                 query = query.encode('utf-8')
2426                 if prefix == '':
2427                         self._download_n_results(query, 1)
2428                         return
2429                 elif prefix == 'all':
2430                         self._download_n_results(query, self._max_google_results)
2431                         return
2432                 else:
2433                         try:
2434                                 n = long(prefix)
2435                                 if n <= 0:
2436                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2437                                         return
2438                                 elif n > self._max_google_results:
2439                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2440                                         n = self._max_google_results
2441                                 self._download_n_results(query, n)
2442                                 return
2443                         except ValueError: # parsing prefix as integer fails
2444                                 self._download_n_results(query, 1)
2445                                 return
2446
2447         def _download_n_results(self, query, n):
2448                 """Downloads a specified number of results for a query"""
2449
2450                 video_ids = []
2451                 pagenum = 0
2452
2453                 while True:
2454                         self.report_download_page(query, pagenum)
2455                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
2456                         request = urllib2.Request(result_url)
2457                         try:
2458                                 page = urllib2.urlopen(request).read()
2459                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2460                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2461                                 return
2462
2463                         # Extract video identifiers
2464                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2465                                 video_id = mobj.group(1)
2466                                 if video_id not in video_ids:
2467                                         video_ids.append(video_id)
2468                                         if len(video_ids) == n:
2469                                                 # Specified n videos reached
2470                                                 for id in video_ids:
2471                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2472                                                 return
2473
2474                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2475                                 for id in video_ids:
2476                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2477                                 return
2478
2479                         pagenum = pagenum + 1
2480
2481
2482 class YahooSearchIE(InfoExtractor):
2483         """Information Extractor for Yahoo! Video search queries."""
2484         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2485         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2486         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2487         _MORE_PAGES_INDICATOR = r'\s*Next'
2488         _yahoo_ie = None
2489         _max_yahoo_results = 1000
2490         IE_NAME = u'video.yahoo:search'
2491
2492         def __init__(self, yahoo_ie, downloader=None):
2493                 InfoExtractor.__init__(self, downloader)
2494                 self._yahoo_ie = yahoo_ie
2495
2496         def report_download_page(self, query, pagenum):
2497                 """Report attempt to download playlist page with given number."""
2498                 query = query.decode(preferredencoding())
2499                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2500
2501         def _real_initialize(self):
2502                 self._yahoo_ie.initialize()
2503
2504         def _real_extract(self, query):
2505                 mobj = re.match(self._VALID_URL, query)
2506                 if mobj is None:
2507                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2508                         return
2509
2510                 prefix, query = query.split(':')
2511                 prefix = prefix[8:]
2512                 query = query.encode('utf-8')
2513                 if prefix == '':
2514                         self._download_n_results(query, 1)
2515                         return
2516                 elif prefix == 'all':
2517                         self._download_n_results(query, self._max_yahoo_results)
2518                         return
2519                 else:
2520                         try:
2521                                 n = long(prefix)
2522                                 if n <= 0:
2523                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2524                                         return
2525                                 elif n > self._max_yahoo_results:
2526                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2527                                         n = self._max_yahoo_results
2528                                 self._download_n_results(query, n)
2529                                 return
2530                         except ValueError: # parsing prefix as integer fails
2531                                 self._download_n_results(query, 1)
2532                                 return
2533
2534         def _download_n_results(self, query, n):
2535                 """Downloads a specified number of results for a query"""
2536
2537                 video_ids = []
2538                 already_seen = set()
2539                 pagenum = 1
2540
2541                 while True:
2542                         self.report_download_page(query, pagenum)
2543                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2544                         request = urllib2.Request(result_url)
2545                         try:
2546                                 page = urllib2.urlopen(request).read()
2547                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2548                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2549                                 return
2550
2551                         # Extract video identifiers
2552                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2553                                 video_id = mobj.group(1)
2554                                 if video_id not in already_seen:
2555                                         video_ids.append(video_id)
2556                                         already_seen.add(video_id)
2557                                         if len(video_ids) == n:
2558                                                 # Specified n videos reached
2559                                                 for id in video_ids:
2560                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2561                                                 return
2562
2563                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2564                                 for id in video_ids:
2565                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2566                                 return
2567
2568                         pagenum = pagenum + 1
2569
2570
class YoutubePlaylistIE(InfoExtractor):
	"""Information Extractor for YouTube playlists."""

	# Group 1: playlist-type prefix ('p', 'a' or 'list'); group 2: the
	# playlist/artist id; group 3: a single video id when the URL points
	# straight at one video inside a playlist.
	_VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
	_TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
	_VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;list=PL%s&'
	# A "Next" link in the page means more playlist pages follow.
	_MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
	_youtube_ie = None
	IE_NAME = u'youtube:playlist'

	def __init__(self, youtube_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		# Individual videos are delegated to this YouTube IE.
		self._youtube_ie = youtube_ie

	def report_download_page(self, playlist_id, pagenum):
		"""Report attempt to download playlist page with given number."""
		self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

	def _real_initialize(self):
		self._youtube_ie.initialize()

	def _real_extract(self, url):
		"""Walk every page of the playlist, collect its video ids and
		delegate each one to the YouTube extractor."""
		# Extract playlist id
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
			return

		# Single video case
		if mobj.group(3) is not None:
			self._youtube_ie.extract(mobj.group(3))
			return

		# Download playlist pages
		# prefix is 'p' as default for playlists but there are other types that need extra care
		playlist_prefix = mobj.group(1)
		if playlist_prefix == 'a':
			playlist_access = 'artist'
		else:
			# Anything other than 'a' is fetched through view_play_list.
			playlist_prefix = 'p'
			playlist_access = 'view_play_list'
		playlist_id = mobj.group(2)
		video_ids = []
		pagenum = 1

		while True:
			self.report_download_page(playlist_id, pagenum)
			# NOTE: reuses (shadows) the `url` parameter.
			url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
			request = urllib2.Request(url)
			try:
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
				return

			# Extract video identifiers
			ids_in_page = []
			for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
				if mobj.group(1) not in ids_in_page:
					ids_in_page.append(mobj.group(1))
			video_ids.extend(ids_in_page)

			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
				break
			pagenum = pagenum + 1

		# Apply the user-requested window. playliststart is 1-based in
		# params; playlistend == -1 is the "until the end" sentinel.
		playliststart = self._downloader.params.get('playliststart', 1) - 1
		playlistend = self._downloader.params.get('playlistend', -1)
		if playlistend == -1:
			video_ids = video_ids[playliststart:]
		else:
			video_ids = video_ids[playliststart:playlistend]

		for id in video_ids:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
		return
2647
2648
class YoutubeUserIE(InfoExtractor):
	"""Information Extractor for YouTube users."""

	# Accepts either a profile URL or the shorthand "ytuser:NAME".
	_VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
	_TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
	# Result size per GData query (the API currently caps pages at 50).
	_GDATA_PAGE_SIZE = 50
	_GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
	_VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
	_youtube_ie = None
	IE_NAME = u'youtube:user'

	def __init__(self, youtube_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		# Individual videos are delegated to this YouTube IE.
		self._youtube_ie = youtube_ie

	def report_download_page(self, username, start_index):
		"""Report attempt to download user page."""
		self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
				(username, start_index, start_index + self._GDATA_PAGE_SIZE))

	def _real_initialize(self):
		self._youtube_ie.initialize()

	def _real_extract(self, url):
		"""Collect all upload ids for the user, apply the
		playliststart/playlistend window, and extract each video."""
		# Extract username
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
			return

		username = mobj.group(1)

		# Download video ids using YouTube Data API. Result size per
		# query is limited (currently to 50 videos) so we need to query
		# page by page until there are no video ids - it means we got
		# all of them.

		video_ids = []
		pagenum = 0

		while True:
			# start-index is 1-based, hence the +1.
			start_index = pagenum * self._GDATA_PAGE_SIZE + 1
			self.report_download_page(username, start_index)

			request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

			try:
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
				return

			# Extract video identifiers
			ids_in_page = []

			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				if mobj.group(1) not in ids_in_page:
					ids_in_page.append(mobj.group(1))

			video_ids.extend(ids_in_page)

			# A little optimization - if current page is not
			# "full", ie. does not contain PAGE_SIZE video ids then
			# we can assume that this page is the last one - there
			# are no more ids on further pages - no need to query
			# again.

			if len(ids_in_page) < self._GDATA_PAGE_SIZE:
				break

			pagenum += 1

		all_ids_count = len(video_ids)
		# playliststart is 1-based in params; convert to a 0-based index.
		playliststart = self._downloader.params.get('playliststart', 1) - 1
		# playlistend == -1 is the "until the end" sentinel.
		playlistend = self._downloader.params.get('playlistend', -1)

		if playlistend == -1:
			video_ids = video_ids[playliststart:]
		else:
			video_ids = video_ids[playliststart:playlistend]

		self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
				(username, all_ids_count, len(video_ids)))

		for video_id in video_ids:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2735
2736
class DepositFilesIE(InfoExtractor):
	"""Information extractor for depositfiles.com"""

	# (?#locale) is a regex comment marking the optional locale path part.
	_VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
	IE_NAME = u'DepositFiles'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, file_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

	def _real_extract(self, url):
		"""Resolve the real download URL for a depositfiles link and
		queue the file for download."""
		# At this point we have a new file
		self._downloader.increment_downloads()

		file_id = url.split('/')[-1]
		# Rebuild url in english locale
		url = 'http://depositfiles.com/en/files/' + file_id

		# Retrieve file webpage with 'Free download' button pressed
		# (POSTing gateway_result=1 emulates submitting that form).
		free_download_indication = { 'gateway_result' : '1' }
		request = urllib2.Request(url, urllib.urlencode(free_download_indication))
		try:
			self.report_download_webpage(file_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
			return

		# Search for the real file URL
		mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
		if (mobj is None) or (mobj.group(1) is None):
			# Try to figure out reason of the error.
			mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
			if (mobj is not None) and (mobj.group(1) is not None):
				# Collapse whitespace in the site's restriction notice.
				restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
				self._downloader.trouble(u'ERROR: %s' % restriction_message)
			else:
				self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
			return

		file_url = mobj.group(1)
		file_extension = os.path.splitext(file_url)[1][1:]

		# Search for file title
		mobj = re.search(r'<b title="(.*?)">', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		file_title = mobj.group(1).decode('utf-8')

		try:
			# Process file information
			self._downloader.process_info({
				'id':           file_id.decode('utf-8'),
				'url':          file_url.decode('utf-8'),
				'uploader':     u'NA',
				'upload_date':  u'NA',
				'title':        file_title,
				'stitle':       file_title,
				'ext':          file_extension.decode('utf-8'),
				'format':       u'NA',
				'player_url':   None,
			})
		except UnavailableVideoError, err:
			self._downloader.trouble(u'ERROR: unable to download file')
2809
2810
2811 class FacebookIE(InfoExtractor):
2812         """Information Extractor for Facebook"""
2813
2814         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2815         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2816         _NETRC_MACHINE = 'facebook'
2817         _available_formats = ['video', 'highqual', 'lowqual']
2818         _video_extensions = {
2819                 'video': 'mp4',
2820                 'highqual': 'mp4',
2821                 'lowqual': 'mp4',
2822         }
2823         IE_NAME = u'facebook'
2824
	def __init__(self, downloader=None):
		# Pure delegation; all state lives in the InfoExtractor base.
		InfoExtractor.__init__(self, downloader)
2827
2828         def _reporter(self, message):
2829                 """Add header and report message."""
2830                 self._downloader.to_screen(u'[facebook] %s' % message)
2831
	def report_login(self):
		"""Report that a login attempt is being made."""
		self._reporter(u'Logging in')
2835
2836         def report_video_webpage_download(self, video_id):
2837                 """Report attempt to download video webpage."""
2838                 self._reporter(u'%s: Downloading video webpage' % video_id)
2839
2840         def report_information_extraction(self, video_id):
2841                 """Report attempt to extract video information."""
2842                 self._reporter(u'%s: Extracting video information' % video_id)
2843
2844         def _parse_page(self, video_webpage):
2845                 """Extract video information from page"""
2846                 # General data
2847                 data = {'title': r'\("video_title", "(.*?)"\)',
2848                         'description': r'<div class="datawrap">(.*?)</div>',
2849                         'owner': r'\("video_owner_name", "(.*?)"\)',
2850                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2851                         }
2852                 video_info = {}
2853                 for piece in data.keys():
2854                         mobj = re.search(data[piece], video_webpage)
2855                         if mobj is not None:
2856                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2857
2858                 # Video urls
2859                 video_urls = {}
2860                 for fmt in self._available_formats:
2861                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2862                         if mobj is not None:
2863                                 # URL is in a Javascript segment inside an escaped Unicode format within
2864                                 # the generally utf-8 page
2865                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2866                 video_info['video_urls'] = video_urls
2867
2868                 return video_info
2869
	def _real_initialize(self):
		"""Log in to Facebook if credentials are available.

		Credentials come from --username/--password or, failing that,
		from the .netrc entry for machine 'facebook'. Without
		credentials this is a silent no-op; login failures only emit a
		warning and do not abort extraction.
		"""
		if self._downloader is None:
			return

		useremail = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			useremail = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					useremail = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		if useremail is None:
			# Nothing to log in with; proceed anonymously.
			return

		# Log in
		login_form = {
			'email': useremail,
			'pass': password,
			'login': 'Log+In'
			}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# The login form reappearing in the response means we are
			# still logged out.
			if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return
2913
2914         def _real_extract(self, url):
2915                 mobj = re.match(self._VALID_URL, url)
2916                 if mobj is None:
2917                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2918                         return
2919                 video_id = mobj.group('ID')
2920
2921                 # Get video webpage
2922                 self.report_video_webpage_download(video_id)
2923                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2924                 try:
2925                         page = urllib2.urlopen(request)
2926                         video_webpage = page.read()
2927                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2928                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2929                         return
2930
2931                 # Start extracting information
2932                 self.report_information_extraction(video_id)
2933
2934                 # Extract information
2935                 video_info = self._parse_page(video_webpage)
2936
2937                 # uploader
2938                 if 'owner' not in video_info:
2939                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2940                         return
2941                 video_uploader = video_info['owner']
2942
2943                 # title
2944                 if 'title' not in video_info:
2945                         self._downloader.trouble(u'ERROR: unable to extract video title')
2946                         return
2947                 video_title = video_info['title']
2948                 video_title = video_title.decode('utf-8')
2949                 video_title = sanitize_title(video_title)
2950
2951                 simple_title = _simplify_title(video_title)
2952
2953                 # thumbnail image
2954                 if 'thumbnail' not in video_info:
2955                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2956                         video_thumbnail = ''
2957                 else:
2958                         video_thumbnail = video_info['thumbnail']
2959
2960                 # upload date
2961                 upload_date = u'NA'
2962                 if 'upload_date' in video_info:
2963                         upload_time = video_info['upload_date']
2964                         timetuple = email.utils.parsedate_tz(upload_time)
2965                         if timetuple is not None:
2966                                 try:
2967                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2968                                 except:
2969                                         pass
2970
2971                 # description
2972                 video_description = video_info.get('description', 'No description available.')
2973
2974                 url_map = video_info['video_urls']
2975                 if len(url_map.keys()) > 0:
2976                         # Decide which formats to download
2977                         req_format = self._downloader.params.get('format', None)
2978                         format_limit = self._downloader.params.get('format_limit', None)
2979
2980                         if format_limit is not None and format_limit in self._available_formats:
2981                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2982                         else:
2983                                 format_list = self._available_formats
2984                         existing_formats = [x for x in format_list if x in url_map]
2985                         if len(existing_formats) == 0:
2986                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2987                                 return
2988                         if req_format is None:
2989                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2990                         elif req_format == 'worst':
2991                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2992                         elif req_format == '-1':
2993                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2994                         else:
2995                                 # Specific format
2996                                 if req_format not in url_map:
2997                                         self._downloader.trouble(u'ERROR: requested format not available')
2998                                         return
2999                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
3000
3001                 for format_param, video_real_url in video_url_list:
3002
3003                         # At this point we have a new video
3004                         self._downloader.increment_downloads()
3005
3006                         # Extension
3007                         video_extension = self._video_extensions.get(format_param, 'mp4')
3008
3009                         try:
3010                                 # Process video information
3011                                 self._downloader.process_info({
3012                                         'id':           video_id.decode('utf-8'),
3013                                         'url':          video_real_url.decode('utf-8'),
3014                                         'uploader':     video_uploader.decode('utf-8'),
3015                                         'upload_date':  upload_date,
3016                                         'title':        video_title,
3017                                         'stitle':       simple_title,
3018                                         'ext':          video_extension.decode('utf-8'),
3019                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
3020                                         'thumbnail':    video_thumbnail.decode('utf-8'),
3021                                         'description':  video_description.decode('utf-8'),
3022                                         'player_url':   None,
3023                                 })
3024                         except UnavailableVideoError, err:
3025                                 self._downloader.trouble(u'\nERROR: unable to download video')
3026
class BlipTVIE(InfoExtractor):
	"""Information extractor for blip.tv"""

	# Any path under blip.tv; group(1) is the path, used only for status output.
	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
	# Extracts the filename extension from a media URL.
	_URL_EXT = r'^.*\.([a-z0-9]+)$'
	IE_NAME = u'blip.tv'

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

	def report_direct_download(self, title):
		"""Report that the URL itself is the media (no JSON metadata step)."""
		self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

	def _real_extract(self, url):
		"""Extract video information from a blip.tv URL.

		Requests the page with skin=json query parameters appended.  If the
		server responds with the media itself (Content-Type video/*), the URL
		is treated as a direct download and the open response handle is passed
		through to the downloader; otherwise the JSON metadata is parsed.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Append the JSON-skin parameters with the correct query separator.
		if '?' in url:
			cchar = '&'
		else:
			cchar = '?'
		json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
		request = urllib2.Request(json_url)
		self.report_extraction(mobj.group(1))
		info = None
		try:
			urlh = urllib2.urlopen(request)
			if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
				basename = url.split('/')[-1]
				title,ext = os.path.splitext(basename)
				title = title.decode('UTF-8')
				ext = ext.replace('.', '')
				self.report_direct_download(title)
				info = {
					'id': title,
					'url': url,
					'title': title,
					'stitle': _simplify_title(title),
					'ext': ext,
					'urlhandle': urlh  # hand the already-open response to the downloader
				}
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
			return
		if info is None: # Regular URL
			try:
				json_code = urlh.read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
				return

			try:
				json_data = json.loads(json_code)
				# Single-video responses wrap the fields in a 'Post' object.
				if 'Post' in json_data:
					data = json_data['Post']
				else:
					data = json_data

				# NOTE(review): '%H:%M%p' mixes a 24-hour field with AM/PM;
				# presumably the site's datestamps still parse — verify.
				upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
				video_url = data['media']['url']
				umobj = re.match(self._URL_EXT, video_url)
				if umobj is None:
					raise ValueError('Can not determine filename extension')
				ext = umobj.group(1)

				info = {
					'id': data['item_id'],
					'url': video_url,
					'uploader': data['display_name'],
					'upload_date': upload_date,
					'title': data['title'],
					'stitle': _simplify_title(data['title']),
					'ext': ext,
					'format': data['media']['mimeType'],
					'thumbnail': data['thumbnailUrl'],
					'description': data['description'],
					'player_url': data['embedUrl']
				}
			except (ValueError,KeyError), err:
				# Any missing JSON key or bad date/extension ends up here.
				self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
				return

		self._downloader.increment_downloads()

		try:
			self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download video')
3120
3121 class MyVideoIE(InfoExtractor):
3122         """Information Extractor for myvideo.de."""
3123
3124         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3125         IE_NAME = u'myvideo'
3126
3127         def __init__(self, downloader=None):
3128                 InfoExtractor.__init__(self, downloader)
3129         
3130         def report_download_webpage(self, video_id):
3131                 """Report webpage download."""
3132                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3133
3134         def report_extraction(self, video_id):
3135                 """Report information extraction."""
3136                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3137
3138         def _real_extract(self,url):
3139                 mobj = re.match(self._VALID_URL, url)
3140                 if mobj is None:
3141                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
3142                         return
3143
3144                 video_id = mobj.group(1)
3145
3146                 # Get video webpage
3147                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3148                 try:
3149                         self.report_download_webpage(video_id)
3150                         webpage = urllib2.urlopen(request).read()
3151                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3152                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3153                         return
3154
3155                 self.report_extraction(video_id)
3156                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3157                                  webpage)
3158                 if mobj is None:
3159                         self._downloader.trouble(u'ERROR: unable to extract media URL')
3160                         return
3161                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3162
3163                 mobj = re.search('<title>([^<]+)</title>', webpage)
3164                 if mobj is None:
3165                         self._downloader.trouble(u'ERROR: unable to extract title')
3166                         return
3167
3168                 video_title = mobj.group(1)
3169                 video_title = sanitize_title(video_title)
3170
3171                 simple_title = _simplify_title(video_title)
3172
3173                 try:
3174                         self._downloader.process_info({
3175                                 'id':           video_id,
3176                                 'url':          video_url,
3177                                 'uploader':     u'NA',
3178                                 'upload_date':  u'NA',
3179                                 'title':        video_title,
3180                                 'stitle':       simple_title,
3181                                 'ext':          u'flv',
3182                                 'format':       u'NA',
3183                                 'player_url':   None,
3184                         })
3185                 except UnavailableVideoError:
3186                         self._downloader.trouble(u'\nERROR: Unable to download video')
3187
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Accepts a ':tds'/':colbert'-style shortcut (newest full episode) or a
	# direct full-episodes URL; 'shortname' and 'episode' select the mode.
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	def report_extraction(self, episode_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

	def report_config_download(self, episode_id):
		"""Report media configuration download."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		"""Report show index download."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		"""Report player URL resolution."""
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _real_extract(self, url):
		"""Extract one or more media items for a full episode.

		Pipeline: resolve shortcut -> fetch episode page -> find the Flash
		URL -> resolve the player URL -> fetch the MRSS index -> for each
		<item>, fetch its media configuration and pick a rendition.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# A shortcut like ':tds' is rewritten to the show's full-episodes
		# page, which redirects to the newest episode.
		if mobj.group('shortname'):
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = u'http://www.thedailyshow.com/full-episodes/'
			else:
				url = u'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			return
		if dlNewest:
			# Follow the redirect to learn which episode we actually got.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		# Each match: (full mtvnservices URL, the uri portion after the host).
		mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
			return

		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			# The raw URL redirects; the final location is the player URL.
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
			return

		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
			return

		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			# guid looks like '...:<show>.com:...:<id>'; split out the parts.
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
						urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
				return

			cdoc = xml.etree.ElementTree.fromstring(configXml)
			# Collect (bitrate, url) pairs for every available rendition.
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			# For now, just pick the highest bitrate
			format,video_url = turls[-1]

			self._downloader.increment_downloads()

			effTitle = showId + u'-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'stitle': _simplify_title(effTitle),
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': playerUrl
			}

			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
				continue
3323
3324 class EscapistIE(InfoExtractor):
3325         """Information extractor for The Escapist """
3326
3327         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3328         IE_NAME = u'escapist'
3329
3330         def report_extraction(self, showName):
3331                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3332
3333         def report_config_download(self, showName):
3334                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3335
3336         def _real_extract(self, url):
3337                 htmlParser = HTMLParser.HTMLParser()
3338
3339                 mobj = re.match(self._VALID_URL, url)
3340                 if mobj is None:
3341                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3342                         return
3343                 showName = mobj.group('showname')
3344                 videoId = mobj.group('episode')
3345
3346                 self.report_extraction(showName)
3347                 try:
3348                         webPage = urllib2.urlopen(url).read()
3349                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3350                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3351                         return
3352
3353                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3354                 description = htmlParser.unescape(descMatch.group(1))
3355                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3356                 imgUrl = htmlParser.unescape(imgMatch.group(1))
3357                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3358                 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3359                 configUrlMatch = re.search('config=(.*)$', playerUrl)
3360                 configUrl = urllib2.unquote(configUrlMatch.group(1))
3361
3362                 self.report_config_download(showName)
3363                 try:
3364                         configJSON = urllib2.urlopen(configUrl).read()
3365                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3366                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3367                         return
3368
3369                 # Technically, it's JavaScript, not JSON
3370                 configJSON = configJSON.replace("'", '"')
3371
3372                 try:
3373                         config = json.loads(configJSON)
3374                 except (ValueError,), err:
3375                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3376                         return
3377
3378                 playlist = config['playlist']
3379                 videoUrl = playlist[1]['url']
3380
3381                 self._downloader.increment_downloads()
3382                 info = {
3383                         'id': videoId,
3384                         'url': videoUrl,
3385                         'uploader': showName,
3386                         'upload_date': None,
3387                         'title': showName,
3388                         'stitle': _simplify_title(showName),
3389                         'ext': 'flv',
3390                         'format': 'flv',
3391                         'thumbnail': imgUrl,
3392                         'description': description,
3393                         'player_url': playerUrl,
3394                 }
3395
3396                 try:
3397                         self._downloader.process_info(info)
3398                 except UnavailableVideoError, err:
3399                         self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3400
3401
3402 class CollegeHumorIE(InfoExtractor):
3403         """Information extractor for collegehumor.com"""
3404
3405         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3406         IE_NAME = u'collegehumor'
3407
3408         def report_webpage(self, video_id):
3409                 """Report information extraction."""
3410                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3411
3412         def report_extraction(self, video_id):
3413                 """Report information extraction."""
3414                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3415
3416         def _real_extract(self, url):
3417                 htmlParser = HTMLParser.HTMLParser()
3418
3419                 mobj = re.match(self._VALID_URL, url)
3420                 if mobj is None:
3421                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3422                         return
3423                 video_id = mobj.group('videoid')
3424
3425                 self.report_webpage(video_id)
3426                 request = urllib2.Request(url)
3427                 try:
3428                         webpage = urllib2.urlopen(request).read()
3429                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3430                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3431                         return
3432
3433                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3434                 if m is None:
3435                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3436                         return
3437                 internal_video_id = m.group('internalvideoid')
3438
3439                 info = {
3440                         'id': video_id,
3441                         'internal_id': internal_video_id,
3442                 }
3443
3444                 self.report_extraction(video_id)
3445                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3446                 try:
3447                         metaXml = urllib2.urlopen(xmlUrl).read()
3448                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3449                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3450                         return
3451
3452                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3453                 try:
3454                         videoNode = mdoc.findall('./video')[0]
3455                         info['description'] = videoNode.findall('./description')[0].text
3456                         info['title'] = videoNode.findall('./caption')[0].text
3457                         info['stitle'] = _simplify_title(info['title'])
3458                         info['url'] = videoNode.findall('./file')[0].text
3459                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3460                         info['ext'] = info['url'].rpartition('.')[2]
3461                         info['format'] = info['ext']
3462                 except IndexError:
3463                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3464                         return
3465
3466                 self._downloader.increment_downloads()
3467
3468                 try:
3469                         self._downloader.process_info(info)
3470                 except UnavailableVideoError, err:
3471                         self._downloader.trouble(u'\nERROR: unable to download video')
3472
3473
3474 class XVideosIE(InfoExtractor):
3475         """Information extractor for xvideos.com"""
3476
3477         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3478         IE_NAME = u'xvideos'
3479
3480         def report_webpage(self, video_id):
3481                 """Report information extraction."""
3482                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3483
3484         def report_extraction(self, video_id):
3485                 """Report information extraction."""
3486                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3487
3488         def _real_extract(self, url):
3489                 htmlParser = HTMLParser.HTMLParser()
3490
3491                 mobj = re.match(self._VALID_URL, url)
3492                 if mobj is None:
3493                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3494                         return
3495                 video_id = mobj.group(1).decode('utf-8')
3496
3497                 self.report_webpage(video_id)
3498
3499                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3500                 try:
3501                         webpage = urllib2.urlopen(request).read()
3502                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3503                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3504                         return
3505
3506                 self.report_extraction(video_id)
3507
3508
3509                 # Extract video URL
3510                 mobj = re.search(r'flv_url=(.+?)&', webpage)
3511                 if mobj is None:
3512                         self._downloader.trouble(u'ERROR: unable to extract video url')
3513                         return
3514                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3515
3516
3517                 # Extract title
3518                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3519                 if mobj is None:
3520                         self._downloader.trouble(u'ERROR: unable to extract video title')
3521                         return
3522                 video_title = mobj.group(1).decode('utf-8')
3523
3524
3525                 # Extract video thumbnail
3526                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3527                 if mobj is None:
3528                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3529                         return
3530                 video_thumbnail = mobj.group(1).decode('utf-8')
3531
3532
3533
3534                 self._downloader.increment_downloads()
3535                 info = {
3536                         'id': video_id,
3537                         'url': video_url,
3538                         'uploader': None,
3539                         'upload_date': None,
3540                         'title': video_title,
3541                         'stitle': _simplify_title(video_title),
3542                         'ext': 'flv',
3543                         'format': 'flv',
3544                         'thumbnail': video_thumbnail,
3545                         'description': None,
3546                         'player_url': None,
3547                 }
3548
3549                 try:
3550                         self._downloader.process_info(info)
3551                 except UnavailableVideoError, err:
3552                         self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3553
3554
3555 class SoundcloudIE(InfoExtractor):
3556         """Information extractor for soundcloud.com
3557            To access the media, the uid of the song and a stream token
3558            must be extracted from the page source and the script must make
3559            a request to media.soundcloud.com/crossdomain.xml. Then
3560            the media can be grabbed by requesting from an url composed
3561            of the stream token and uid
3562          """
3563
3564         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3565         IE_NAME = u'soundcloud'
3566
	def __init__(self, downloader=None):
		# Nothing extractor-specific to initialize; delegate to the base class.
		InfoExtractor.__init__(self, downloader)
3569
3570         def report_webpage(self, video_id):
3571                 """Report information extraction."""
3572                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3573
3574         def report_extraction(self, video_id):
3575                 """Report information extraction."""
3576                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3577
3578         def _real_extract(self, url):
3579                 htmlParser = HTMLParser.HTMLParser()
3580
3581                 mobj = re.match(self._VALID_URL, url)
3582                 if mobj is None:
3583                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3584                         return
3585
3586                 # extract uploader (which is in the url)
3587                 uploader = mobj.group(1).decode('utf-8')
3588                 # extract simple title (uploader + slug of song title)
3589                 slug_title =  mobj.group(2).decode('utf-8')
3590                 simple_title = uploader + '-' + slug_title
3591
3592                 self.report_webpage('%s/%s' % (uploader, slug_title))
3593
3594                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3595                 try:
3596                         webpage = urllib2.urlopen(request).read()
3597                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3598                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3599                         return
3600
3601                 self.report_extraction('%s/%s' % (uploader, slug_title))
3602
3603                 # extract uid and stream token that soundcloud hands out for access
3604                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3605                 if mobj:
3606                         video_id = mobj.group(1)
3607                         stream_token = mobj.group(2)
3608
3609                 # extract unsimplified title
3610                 mobj = re.search('"title":"(.*?)",', webpage)
3611                 if mobj:
3612                         title = mobj.group(1)
3613
3614                 # construct media url (with uid/token)
3615                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3616                 mediaURL = mediaURL % (video_id, stream_token)
3617
3618                 # description
3619                 description = u'No description available'
3620                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3621                 if mobj:
3622                         description = mobj.group(1)
3623                 
3624                 # upload date
3625                 upload_date = None
3626                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3627                 if mobj:
3628                         try:
3629                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3630                         except Exception, e:
3631                                 print str(e)
3632
3633                 # for soundcloud, a request to a cross domain is required for cookies
3634                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3635
3636                 try:
3637                         self._downloader.process_info({
3638                                 'id':           video_id.decode('utf-8'),
3639                                 'url':          mediaURL,
3640                                 'uploader':     uploader.decode('utf-8'),
3641                                 'upload_date':  upload_date,
3642                                 'title':        simple_title.decode('utf-8'),
3643                                 'stitle':       simple_title.decode('utf-8'),
3644                                 'ext':          u'mp3',
3645                                 'format':       u'NA',
3646                                 'player_url':   None,
3647                                 'description': description.decode('utf-8')
3648                         })
3649                 except UnavailableVideoError:
3650                         self._downloader.trouble(u'\nERROR: unable to download video')
3651
3652
3653 class InfoQIE(InfoExtractor):
3654         """Information extractor for infoq.com"""
3655
3656         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3657         IE_NAME = u'infoq'
3658
3659         def report_webpage(self, video_id):
3660                 """Report information extraction."""
3661                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3662
3663         def report_extraction(self, video_id):
3664                 """Report information extraction."""
3665                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3666
3667         def _real_extract(self, url):
3668                 htmlParser = HTMLParser.HTMLParser()
3669
3670                 mobj = re.match(self._VALID_URL, url)
3671                 if mobj is None:
3672                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3673                         return
3674
3675                 self.report_webpage(url)
3676
3677                 request = urllib2.Request(url)
3678                 try:
3679                         webpage = urllib2.urlopen(request).read()
3680                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3681                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3682                         return
3683
3684                 self.report_extraction(url)
3685
3686
3687                 # Extract video URL
3688                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3689                 if mobj is None:
3690                         self._downloader.trouble(u'ERROR: unable to extract video url')
3691                         return
3692                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3693
3694
3695                 # Extract title
3696                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3697                 if mobj is None:
3698                         self._downloader.trouble(u'ERROR: unable to extract video title')
3699                         return
3700                 video_title = mobj.group(1).decode('utf-8')
3701
3702                 # Extract description
3703                 video_description = u'No description available.'
3704                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3705                 if mobj is not None:
3706                         video_description = mobj.group(1).decode('utf-8')
3707
3708                 video_filename = video_url.split('/')[-1]
3709                 video_id, extension = video_filename.split('.')
3710
3711                 self._downloader.increment_downloads()
3712                 info = {
3713                         'id': video_id,
3714                         'url': video_url,
3715                         'uploader': None,
3716                         'upload_date': None,
3717                         'title': video_title,
3718                         'stitle': _simplify_title(video_title),
3719                         'ext': extension,
3720                         'format': extension, # Extension is always(?) mp4, but seems to be flv
3721                         'thumbnail': None,
3722                         'description': video_description,
3723                         'player_url': None,
3724                 }
3725
3726                 try:
3727                         self._downloader.process_info(info)
3728                 except UnavailableVideoError, err:
3729                         self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3730
3731 class MixcloudIE(InfoExtractor):
3732         """Information extractor for www.mixcloud.com"""
3733         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3734         IE_NAME = u'mixcloud'
3735
3736         def __init__(self, downloader=None):
3737                 InfoExtractor.__init__(self, downloader)
3738
3739         def report_download_json(self, file_id):
3740                 """Report JSON download."""
3741                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3742
3743         def report_extraction(self, file_id):
3744                 """Report information extraction."""
3745                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3746
3747         def get_urls(self, jsonData, fmt, bitrate='best'):
3748                 """Get urls from 'audio_formats' section in json"""
3749                 file_url = None
3750                 try:
3751                         bitrate_list = jsonData[fmt]
3752                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3753                                 bitrate = max(bitrate_list) # select highest
3754
3755                         url_list = jsonData[fmt][bitrate]
3756                 except TypeError: # we have no bitrate info.
3757                         url_list = jsonData[fmt]
3758                                 
3759                 return url_list
3760
3761         def check_urls(self, url_list):
3762                 """Returns 1st active url from list"""
3763                 for url in url_list:
3764                         try:
3765                                 urllib2.urlopen(url)
3766                                 return url
3767                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3768                                 url = None
3769
3770                 return None
3771
3772         def _print_formats(self, formats):
3773                 print 'Available formats:'
3774                 for fmt in formats.keys():
3775                         for b in formats[fmt]:
3776                                 try:
3777                                         ext = formats[fmt][b][0]
3778                                         print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3779                                 except TypeError: # we have no bitrate info
3780                                         ext = formats[fmt][0]
3781                                         print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3782                                         break
3783
3784         def _real_extract(self, url):
3785                 mobj = re.match(self._VALID_URL, url)
3786                 if mobj is None:
3787                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3788                         return
3789                 # extract uploader & filename from url
3790                 uploader = mobj.group(1).decode('utf-8')
3791                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3792
3793                 # construct API request
3794                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3795                 # retrieve .json file with links to files
3796                 request = urllib2.Request(file_url)
3797                 try:
3798                         self.report_download_json(file_url)
3799                         jsonData = urllib2.urlopen(request).read()
3800                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3801                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3802                         return
3803
3804                 # parse JSON
3805                 json_data = json.loads(jsonData)
3806                 player_url = json_data['player_swf_url']
3807                 formats = dict(json_data['audio_formats'])
3808
3809                 req_format = self._downloader.params.get('format', None)
3810                 bitrate = None
3811
3812                 if self._downloader.params.get('listformats', None):
3813                         self._print_formats(formats)
3814                         return
3815
3816                 if req_format is None or req_format == 'best':
3817                         for format_param in formats.keys():
3818                                 url_list = self.get_urls(formats, format_param)
3819                                 # check urls
3820                                 file_url = self.check_urls(url_list)
3821                                 if file_url is not None:
3822                                         break # got it!
3823                 else:
3824                         if req_format not in formats.keys():
3825                                 self._downloader.trouble(u'ERROR: format is not available')
3826                                 return
3827
3828                         url_list = self.get_urls(formats, req_format)
3829                         file_url = self.check_urls(url_list)
3830                         format_param = req_format
3831
3832                 # We have audio
3833                 self._downloader.increment_downloads()
3834                 try:
3835                         # Process file information
3836                         self._downloader.process_info({
3837                                 'id': file_id.decode('utf-8'),
3838                                 'url': file_url.decode('utf-8'),
3839                                 'uploader':     uploader.decode('utf-8'),
3840                                 'upload_date': u'NA',
3841                                 'title': json_data['name'],
3842                                 'stitle': _simplify_title(json_data['name']),
3843                                 'ext': file_url.split('.')[-1].decode('utf-8'),
3844                                 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3845                                 'thumbnail': json_data['thumbnail_url'],
3846                                 'description': json_data['description'],
3847                                 'player_url': player_url.decode('utf-8'),
3848                         })
3849                 except UnavailableVideoError, err:
3850                         self._downloader.trouble(u'ERROR: unable to download file')
3851
class StanfordOpenClassroomIE(InfoExtractor):
	"""Information extractor for Stanford's Open ClassRoom

	Handles three url shapes, dispatched on the named regex groups:
	  - course + video present: a single video page; downloads that video
	  - course only: a course page; recursively extracts each VideoPage link
	  - neither: the site root; recursively extracts each CoursePage link
	"""

	_VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
	IE_NAME = u'stanfordoc'

	def report_download_webpage(self, objid):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		"""Extract a single video, or recurse over a course/root playlist."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		if mobj.group('course') and mobj.group('video'): # A specific video
			course = mobj.group('course')
			video = mobj.group('video')
			info = {
				'id': _simplify_title(course + '_' + video),
			}

			self.report_extraction(info['id'])
			# Per-video metadata lives in an XML file next to the media
			baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
			xmlUrl = baseUrl + video + '.xml'
			try:
				metaXml = urllib2.urlopen(xmlUrl).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
				return
			mdoc = xml.etree.ElementTree.fromstring(metaXml)
			try:
				# IndexError from findall(...)[0] means the XML lacks the tag
				info['title'] = mdoc.findall('./title')[0].text
				info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
			except IndexError:
				self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
				return
			info['stitle'] = _simplify_title(info['title'])
			# extension = everything after the last dot of the media url
			info['ext'] = info['url'].rpartition('.')[2]
			info['format'] = info['ext']
			self._downloader.increment_downloads()
			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
		elif mobj.group('course'): # A course page
			unescapeHTML = HTMLParser.HTMLParser().unescape

			course = mobj.group('course')
			info = {
				'id': _simplify_title(course),
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			try:
				coursepage = urllib2.urlopen(url).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			# Course title; fall back to the course id when missing
			m = re.search('<h1>([^<]+)</h1>', coursepage)
			if m:
				info['title'] = unescapeHTML(m.group(1))
			else:
				info['title'] = info['id']
			info['stitle'] = _simplify_title(info['title'])

			m = re.search('<description>([^<]+)</description>', coursepage)
			if m:
				info['description'] = unescapeHTML(m.group(1))

			# Each VideoPage link becomes a 'reference' entry that is
			# re-dispatched through self.extract() below
			links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
				}
					for vpage in links]

			for entry in info['list']:
				assert entry['type'] == 'reference'
				self.extract(entry['url'])
		else: # Root page
			unescapeHTML = HTMLParser.HTMLParser().unescape

			info = {
				'id': 'Stanford OpenClassroom',
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
			try:
				rootpage = urllib2.urlopen(rootURL).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			info['title'] = info['id']
			info['stitle'] = _simplify_title(info['title'])

			# Each CoursePage link is recursively handled by the
			# course-page branch above via self.extract()
			links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
				}
					for cpage in links]

			for entry in info['list']:
				assert entry['type'] == 'reference'
				self.extract(entry['url'])
3970
3971 class MTVIE(InfoExtractor):
3972         """Information extractor for MTV.com"""
3973
3974         _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3975         IE_NAME = u'mtv'
3976
3977         def report_webpage(self, video_id):
3978                 """Report information extraction."""
3979                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3980
3981         def report_extraction(self, video_id):
3982                 """Report information extraction."""
3983                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3984
3985         def _real_extract(self, url):
3986                 mobj = re.match(self._VALID_URL, url)
3987                 if mobj is None:
3988                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3989                         return
3990                 if not mobj.group('proto'):
3991                         url = 'http://' + url
3992                 video_id = mobj.group('videoid')
3993                 self.report_webpage(video_id)
3994
3995                 request = urllib2.Request(url)
3996                 try:
3997                         webpage = urllib2.urlopen(request).read()
3998                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3999                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
4000                         return
4001
4002                 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
4003                 if mobj is None:
4004                         self._downloader.trouble(u'ERROR: unable to extract song name')
4005                         return
4006                 song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4007                 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
4008                 if mobj is None:
4009                         self._downloader.trouble(u'ERROR: unable to extract performer')
4010                         return
4011                 performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4012                 video_title = performer + ' - ' + song_name 
4013
4014                 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
4015                 if mobj is None:
4016                         self._downloader.trouble(u'ERROR: unable to mtvn_uri')
4017                         return
4018                 mtvn_uri = mobj.group(1)
4019
4020                 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
4021                 if mobj is None:
4022                         self._downloader.trouble(u'ERROR: unable to extract content id')
4023                         return
4024                 content_id = mobj.group(1)
4025
4026                 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
4027                 self.report_extraction(video_id)
4028                 request = urllib2.Request(videogen_url)
4029                 try:
4030                         metadataXml = urllib2.urlopen(request).read()
4031                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4032                         self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
4033                         return
4034
4035                 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
4036                 renditions = mdoc.findall('.//rendition')
4037
4038                 # For now, always pick the highest quality.
4039                 rendition = renditions[-1]
4040
4041                 try:
4042                         _,_,ext = rendition.attrib['type'].partition('/')
4043                         format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
4044                         video_url = rendition.find('./src').text
4045                 except KeyError:
4046                         self._downloader.trouble('Invalid rendition field.')
4047                         return
4048
4049                 self._downloader.increment_downloads()
4050                 info = {
4051                         'id': video_id,
4052                         'url': video_url,
4053                         'uploader': performer,
4054                         'title': video_title,
4055                         'stitle': _simplify_title(video_title),
4056                         'ext': ext,
4057                         'format': format,
4058                 }
4059
4060                 try:
4061                         self._downloader.process_info(info)
4062                 except UnavailableVideoError, err:
4063                         self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
4064
4065
class PostProcessor(object):
	"""Base class for post-download processing steps.

	Instances are registered on a downloader via its
	add_post_processor() method. After each successful download the
	downloader walks its PostProcessor chain, feeding run() the info
	dictionary and passing each return value on to the next processor.
	Returning None stops the chain; reaching the end also stops it.

	The downloader/processor pair register with each other, mirroring
	the mutual-registration scheme used by InfoExtractor objects.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file; return info for the next PP.

		'information' is an InfoExtractor-style dictionary augmented
		with a 'filepath' key naming the downloaded file. Returning
		None halts the postprocessing chain; returning a (possibly
		modified) dictionary passes it along. Implementations may also
		raise PostProcessingError, which the owning downloader handles.

		The base implementation is a no-op that forwards its input.
		"""
		return information
4111
class AudioConversionError(Exception):
	"""Raised when ffmpeg/ffprobe fails to convert an audio file.

	Derives from Exception rather than BaseException so that generic
	``except Exception`` handlers see it (BaseException is reserved for
	exit-style exceptions like KeyboardInterrupt). Callers that catch
	AudioConversionError by name are unaffected.
	"""
	def __init__(self, message):
		# populate args so str(err) yields the message
		Exception.__init__(self, message)
		# keep the legacy attribute used by existing callers
		self.message = message
4115
4116 class FFmpegExtractAudioPP(PostProcessor):
4117
4118         def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
4119                 PostProcessor.__init__(self, downloader)
4120                 if preferredcodec is None:
4121                         preferredcodec = 'best'
4122                 self._preferredcodec = preferredcodec
4123                 self._preferredquality = preferredquality
4124                 self._keepvideo = keepvideo
4125
	@staticmethod
	def get_audio_codec(path):
		"""Return the audio codec name of the file at path via ffprobe.

		Returns None when ffprobe is missing, exits non-zero, or no
		audio stream is found. In the ffprobe stream dump codec_name
		precedes codec_type, which is why the last seen name is kept
		until a 'codec_type=audio' line confirms it.
		"""
		try:
			cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
			# NOTE(review): the devnull handle opened here is never
			# explicitly closed -- relies on GC
			handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
			output = handle.communicate()[0]
			if handle.wait() != 0:
				return None
		except (IOError, OSError):
			# ffprobe binary missing or not executable
			return None
		audio_codec = None
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				return audio_codec
		return None
4143
	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Transcode path to out_path with ffmpeg, audio-only (-vn).

		codec is an ffmpeg audio codec name (None lets ffmpeg choose);
		more_opts are extra command-line options. Raises
		AudioConversionError carrying the last stderr line on failure,
		or a 'not found' message if the ffmpeg binary is missing.
		"""
		if codec is None:
			acodec_opts = []
		else:
			acodec_opts = ['-acodec', codec]
		cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
		try:
			p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			stdout,stderr = p.communicate()
		except (IOError, OSError):
			e = sys.exc_info()[1]
			# errno 2 (ENOENT): the ffmpeg executable itself is absent
			if isinstance(e, OSError) and e.errno == 2:
				raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
			else:
				raise e
		if p.returncode != 0:
			# ffmpeg reports the actual failure on its last stderr line
			msg = stderr.strip().split('\n')[-1]
			raise AudioConversionError(msg)
4163
	def run(self, information):
		"""Post-process a downloaded video: extract/convert its audio track.

		information is the info dict produced by the downloader; its
		'filepath' entry names the downloaded file. On success the dict is
		returned with 'filepath' pointing at the new audio file; on failure
		None is returned (the caller treats that as a non-fatal error).
		"""
		path = information['filepath']

		# Probe the source file's audio codec; without it we cannot decide
		# between stream-copying and re-encoding.
		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		# First branch: the requested codec already matches the source (or
		# 'best' was asked for, or aac only needs an m4a re-container), so
		# prefer a lossless stream copy where the container allows it.
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
			if self._preferredcodec == 'm4a' and filecodec == 'aac':
				# Lossless, but in another container
				acodec = 'copy'
				extension = self._preferredcodec
				# aac_adtstoasc bitstream filter converts ADTS AAC to the
				# format expected inside an MP4/M4A container.
				more_opts = ['-absf', 'aac_adtstoasc']
			elif filecodec in ['aac', 'mp3', 'vorbis']:
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					more_opts = ['-f', 'adts']
				if filecodec == 'vorbis':
					extension = 'ogg'
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = []
				if self._preferredquality is not None:
					more_opts += ['-ab', self._preferredquality]
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = []
			if self._preferredquality is not None:
				more_opts += ['-ab', self._preferredquality]
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']
			if self._preferredcodec == 'm4a':
				more_opts += ['-absf', 'aac_adtstoasc']
			if self._preferredcodec == 'vorbis':
				extension = 'ogg'
			if self._preferredcodec == 'wav':
				extension = 'wav'
				more_opts += ['-f', 'wav']

		prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
		new_path = prefix + sep + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
		try:
			self.run_ffmpeg(path, new_path, acodec, more_opts)
		except:
			# NOTE(review): bare except also swallows KeyboardInterrupt and
			# SystemExit here; e.message is Python-2-only.
			etype,e,tb = sys.exc_info()
			if isinstance(e, AudioConversionError):
				self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
			else:
				self._downloader.to_stderr(u'ERROR: error running ffmpeg')
			return None

		# Try to update the date time for extracted audio file.
		if information.get('filetime') is not None:
			try:
				os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
			except:
				self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

		# Remove the original video unless the user asked to keep it.
		if not self._keepvideo:
			try:
				os.remove(_encodeFilename(path))
			except (IOError, OSError):
				self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
				return None

		information['filepath'] = new_path
		return information
4240
4241
def updateSelf(downloader, filename):
	''' Update the program file with the latest version from the repository '''
	# Note: downloader only used for options
	if not os.access(filename, os.W_OK):
		sys.exit('ERROR: no write permissions on %s' % filename)

	downloader.to_screen(u'Updating to latest version...')

	# Pre-initialize urlh: if urlopen() itself raises, the finally block
	# below would otherwise hit a NameError on the unbound name, which the
	# (IOError, OSError) handler would not catch.
	urlh = None
	try:
		try:
			urlh = urllib.urlopen(UPDATE_URL)
			newcontent = urlh.read()

			# Skip the rewrite when the downloaded script reports the same
			# version as the running one.
			vmatch = re.search("__version__ = '([^']+)'", newcontent)
			if vmatch is not None and vmatch.group(1) == __version__:
				downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
				return
		finally:
			if urlh is not None:
				urlh.close()
	except (IOError, OSError):
		sys.exit('ERROR: unable to download latest version')

	try:
		outf = open(filename, 'wb')
		try:
			outf.write(newcontent)
		finally:
			outf.close()
	except (IOError, OSError):
		sys.exit('ERROR: unable to overwrite current version')

	downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
4274
def parseOpts():
	"""Build the optparse parser, parse config files plus sys.argv[1:],
	and return (parser, opts, args)."""
	def _readOptions(filename_bytes):
		"""Return the shlex-split options from a config file, or [] if the
		file cannot be opened."""
		try:
			optionf = open(filename_bytes)
		except IOError:
			return [] # silently skip if file is not present
		try:
			res = []
			for l in optionf:
				res += shlex.split(l, comments=True)
		finally:
			optionf.close()
		return res

	def _format_option_string(option):
		''' ('-o', '--option') -> -o, --format METAVAR'''

		opts = []

		if option._short_opts: opts.append(option._short_opts[0])
		if option._long_opts: opts.append(option._long_opts[0])
		if len(opts) > 1: opts.insert(1, ', ')

		if option.takes_value(): opts.append(' %s' % option.metavar)

		return "".join(opts)

	def _find_term_columns():
		"""Best-effort terminal width: $COLUMNS first, then `stty size`,
		else None."""
		columns = os.environ.get('COLUMNS', None)
		if columns:
			return int(columns)

		try:
			sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			out,err = sp.communicate()
			return int(out.split()[1])
		except:
			pass
		return None

	max_width = 80
	max_help_position = 80

	# No need to wrap help messages if we're on a wide console
	columns = _find_term_columns()
	if columns: max_width = columns

	fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
	fmt.format_option_strings = _format_option_string

	kw = {
		'version'   : __version__,
		'formatter' : fmt,
		'usage' : '%prog [options] url [url...]',
		'conflict_handler' : 'resolve',
	}

	parser = optparse.OptionParser(**kw)

	# option groups
	general        = optparse.OptionGroup(parser, 'General Options')
	selection      = optparse.OptionGroup(parser, 'Video Selection')
	authentication = optparse.OptionGroup(parser, 'Authentication Options')
	video_format   = optparse.OptionGroup(parser, 'Video Format Options')
	postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
	filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
	verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

	general.add_option('-h', '--help',
			action='help', help='print this help text and exit')
	# NOTE(review): '-v' is also registered for --verbose below; with
	# conflict_handler='resolve' the later --verbose definition wins '-v'.
	general.add_option('-v', '--version',
			action='version', help='print program version and exit')
	general.add_option('-U', '--update',
			action='store_true', dest='update_self', help='update this program to latest version')
	general.add_option('-i', '--ignore-errors',
			action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
	general.add_option('-r', '--rate-limit',
			dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
	general.add_option('-R', '--retries',
			dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
	general.add_option('--dump-user-agent',
			action='store_true', dest='dump_user_agent',
			help='display the current browser identification', default=False)
	general.add_option('--list-extractors',
			action='store_true', dest='list_extractors',
			help='List all supported extractors and the URLs they would handle', default=False)

	selection.add_option('--playlist-start',
			dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
	selection.add_option('--playlist-end',
			dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
	selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
	selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
	selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)

	authentication.add_option('-u', '--username',
			dest='username', metavar='USERNAME', help='account username')
	authentication.add_option('-p', '--password',
			dest='password', metavar='PASSWORD', help='account password')
	authentication.add_option('-n', '--netrc',
			action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)


	video_format.add_option('-f', '--format',
			action='store', dest='format', metavar='FORMAT', help='video format code')
	video_format.add_option('--all-formats',
			action='store_const', dest='format', help='download all available video formats', const='all')
	video_format.add_option('--prefer-free-formats',
			action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
	video_format.add_option('--max-quality',
			action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
	video_format.add_option('-F', '--list-formats',
			action='store_true', dest='listformats', help='list all available formats (currently youtube only)')


	verbosity.add_option('-q', '--quiet',
			action='store_true', dest='quiet', help='activates quiet mode', default=False)
	verbosity.add_option('-s', '--simulate',
			action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
	verbosity.add_option('--skip-download',
			action='store_true', dest='skip_download', help='do not download the video', default=False)
	verbosity.add_option('-g', '--get-url',
			action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
	verbosity.add_option('-e', '--get-title',
			action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
	verbosity.add_option('--get-thumbnail',
			action='store_true', dest='getthumbnail',
			help='simulate, quiet but print thumbnail URL', default=False)
	verbosity.add_option('--get-description',
			action='store_true', dest='getdescription',
			help='simulate, quiet but print video description', default=False)
	verbosity.add_option('--get-filename',
			action='store_true', dest='getfilename',
			help='simulate, quiet but print output filename', default=False)
	verbosity.add_option('--get-format',
			action='store_true', dest='getformat',
			help='simulate, quiet but print output format', default=False)
	verbosity.add_option('--no-progress',
			action='store_true', dest='noprogress', help='do not print progress bar', default=False)
	verbosity.add_option('--console-title',
			action='store_true', dest='consoletitle',
			help='display progress in console titlebar', default=False)
	verbosity.add_option('-v', '--verbose',
			action='store_true', dest='verbose', help='print various debugging information', default=False)


	filesystem.add_option('-t', '--title',
			action='store_true', dest='usetitle', help='use title in file name', default=False)
	filesystem.add_option('-l', '--literal',
			action='store_true', dest='useliteral', help='use literal title in file name', default=False)
	filesystem.add_option('-A', '--auto-number',
			action='store_true', dest='autonumber',
			help='number downloaded files starting from 00000', default=False)
	filesystem.add_option('-o', '--output',
			dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
	filesystem.add_option('-a', '--batch-file',
			dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
	filesystem.add_option('-w', '--no-overwrites',
			action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
	filesystem.add_option('-c', '--continue',
			action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
	filesystem.add_option('--no-continue',
			action='store_false', dest='continue_dl',
			help='do not resume partially downloaded files (restart from beginning)')
	filesystem.add_option('--cookies',
			dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
	filesystem.add_option('--no-part',
			action='store_true', dest='nopart', help='do not use .part files', default=False)
	filesystem.add_option('--no-mtime',
			action='store_false', dest='updatetime',
			help='do not use the Last-modified header to set the file modification time', default=True)
	filesystem.add_option('--write-description',
			action='store_true', dest='writedescription',
			help='write video description to a .description file', default=False)
	filesystem.add_option('--write-info-json',
			action='store_true', dest='writeinfojson',
			help='write video metadata to a .info.json file', default=False)
	filesystem.add_option('--write-srt',
			action='store_true', dest='writesubtitles',
			help='write video subtitles to a .srt file', default=False)


	postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
			help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
	postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
			help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
	postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
			help='ffmpeg audio bitrate specification, 128k by default')
	postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
			help='keeps the video file on disk after the post-processing; the video is erased by default')


	parser.add_option_group(general)
	parser.add_option_group(selection)
	parser.add_option_group(filesystem)
	parser.add_option_group(verbosity)
	parser.add_option_group(video_format)
	parser.add_option_group(authentication)
	parser.add_option_group(postproc)

	# Config precedence: system-wide file, then the per-user file, then the
	# actual command line (later options override earlier ones).
	xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
	if xdg_config_home:
		userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
	else:
		userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
	argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
	opts, args = parser.parse_args(argv)

	return parser, opts, args
4484
def gen_extractors():
	"""Build and return one instance of every supported info extractor.

	The order of the returned list matters: the first extractor whose
	suitable() accepts a URL handles it, so the generic fallback comes
	last. A few extractors share a parent instance (e.g. the search and
	playlist extractors wrap the plain YouTube one).
	"""
	yt = YoutubeIE()
	goog = GoogleIE()
	yah = YahooIE()
	extractors = [
		YoutubePlaylistIE(yt),
		YoutubeUserIE(yt),
		YoutubeSearchIE(yt),
		yt,
		MetacafeIE(yt),
		DailymotionIE(),
		goog,
		GoogleSearchIE(goog),
		PhotobucketIE(),
		yah,
		YahooSearchIE(yah),
		DepositFilesIE(),
		FacebookIE(),
		BlipTVIE(),
		VimeoIE(),
		MyVideoIE(),
		ComedyCentralIE(),
		EscapistIE(),
		CollegeHumorIE(),
		XVideosIE(),
		SoundcloudIE(),
		InfoQIE(),
		MixcloudIE(),
		StanfordOpenClassroomIE(),
		MTVIE(),
		GenericIE(),
	]
	return extractors
4521
def _real_main():
	"""Program core: parse options, configure networking, validate options,
	build the FileDownloader, run the downloads, and exit with a status
	code via sys.exit()."""
	parser, opts, args = parseOpts()

	# Open appropriate CookieJar
	if opts.cookiefile is None:
		jar = cookielib.CookieJar()
	else:
		try:
			jar = cookielib.MozillaCookieJar(opts.cookiefile)
			# Only load if the file already exists and is readable; a
			# missing file is fine (it will be created on save).
			if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
				jar.load()
		except (IOError, OSError), err:
			sys.exit(u'ERROR: unable to open cookie file')

	# Dump user agent
	if opts.dump_user_agent:
		print std_headers['User-Agent']
		sys.exit(0)

	# Batch file verification
	batchurls = []
	if opts.batchfile is not None:
		try:
			if opts.batchfile == '-':
				batchfd = sys.stdin
			else:
				batchfd = open(opts.batchfile, 'r')
			batchurls = batchfd.readlines()
			batchurls = [x.strip() for x in batchurls]
			# Drop blank lines and comment lines starting with #, / or ;
			batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
		except IOError:
			sys.exit(u'ERROR: batch file could not be read')
	all_urls = batchurls + args

	# General configuration
	cookie_processor = urllib2.HTTPCookieProcessor(jar)
	proxy_handler = urllib2.ProxyHandler()
	opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
	urllib2.install_opener(opener)
	socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

	if opts.verbose:
		print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))

	extractors = gen_extractors()

	if opts.list_extractors:
		# Print each extractor name followed by the given URLs it would
		# handle; each URL is claimed by at most one extractor.
		for ie in extractors:
			print(ie.IE_NAME)
			matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
			all_urls = filter(lambda url: url not in matchedUrls, all_urls)
			for mu in matchedUrls:
				print(u'  ' + mu)
		sys.exit(0)

	# Conflicting, missing and erroneous options
	if opts.usenetrc and (opts.username is not None or opts.password is not None):
		parser.error(u'using .netrc conflicts with giving username/password')
	if opts.password is not None and opts.username is None:
		parser.error(u'account username missing')
	if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
		parser.error(u'using output template conflicts with using title, literal title or auto number')
	if opts.usetitle and opts.useliteral:
		parser.error(u'using title conflicts with using literal title')
	if opts.username is not None and opts.password is None:
		opts.password = getpass.getpass(u'Type account password and press return:')
	if opts.ratelimit is not None:
		numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
		if numeric_limit is None:
			parser.error(u'invalid rate limit specified')
		opts.ratelimit = numeric_limit
	if opts.retries is not None:
		try:
			opts.retries = long(opts.retries)
		except (TypeError, ValueError), err:
			parser.error(u'invalid retry count specified')
	try:
		opts.playliststart = int(opts.playliststart)
		if opts.playliststart <= 0:
			raise ValueError(u'Playlist start must be positive')
	except (TypeError, ValueError), err:
		parser.error(u'invalid playlist start number specified')
	try:
		opts.playlistend = int(opts.playlistend)
		# -1 is the sentinel for "until the end of the playlist"
		if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
			raise ValueError(u'Playlist end must be greater than playlist start')
	except (TypeError, ValueError), err:
		parser.error(u'invalid playlist end number specified')
	if opts.extractaudio:
		if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
			parser.error(u'invalid audio format specified')

	# File downloader
	fd = FileDownloader({
		'usenetrc': opts.usenetrc,
		'username': opts.username,
		'password': opts.password,
		# Any of the --get-* options implies quiet, simulate-style operation.
		'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
		'forceurl': opts.geturl,
		'forcetitle': opts.gettitle,
		'forcethumbnail': opts.getthumbnail,
		'forcedescription': opts.getdescription,
		'forcefilename': opts.getfilename,
		'forceformat': opts.getformat,
		'simulate': opts.simulate,
		'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
		'format': opts.format,
		'format_limit': opts.format_limit,
		'listformats': opts.listformats,
		# Pick the output template: an explicit -o wins, otherwise one is
		# derived from the format/title/literal/autonumber option combination.
		'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
			or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
			or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
			or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
			or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
			or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
			or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
			or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
			or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
			or u'%(id)s.%(ext)s'),
		'ignoreerrors': opts.ignoreerrors,
		'ratelimit': opts.ratelimit,
		'nooverwrites': opts.nooverwrites,
		'retries': opts.retries,
		'continuedl': opts.continue_dl,
		'noprogress': opts.noprogress,
		'playliststart': opts.playliststart,
		'playlistend': opts.playlistend,
		# Writing the video to stdout means progress must go to stderr.
		'logtostderr': opts.outtmpl == '-',
		'consoletitle': opts.consoletitle,
		'nopart': opts.nopart,
		'updatetime': opts.updatetime,
		'writedescription': opts.writedescription,
		'writeinfojson': opts.writeinfojson,
		'writesubtitles': opts.writesubtitles,
		'matchtitle': opts.matchtitle,
		'rejecttitle': opts.rejecttitle,
		'max_downloads': opts.max_downloads,
		'prefer_free_formats': opts.prefer_free_formats,
		'verbose': opts.verbose,
		})
	for extractor in extractors:
		fd.add_info_extractor(extractor)

	# PostProcessors
	if opts.extractaudio:
		fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))

	# Update version
	if opts.update_self:
		updateSelf(fd, sys.argv[0])

	# Maybe do nothing
	if len(all_urls) < 1:
		if not opts.update_self:
			parser.error(u'you must provide at least one URL')
		else:
			sys.exit()

	try:
		retcode = fd.download(all_urls)
	except MaxDownloadsReached:
		fd.to_screen(u'--max-download limit reached, aborting.')
		retcode = 101

	# Dump cookie jar if requested
	if opts.cookiefile is not None:
		try:
			jar.save()
		except (IOError, OSError), err:
			sys.exit(u'ERROR: unable to save cookie jar')

	sys.exit(retcode)
4694
def main():
	"""Entry point: run _real_main() and translate known top-level
	exceptions into process exit codes/messages."""
	try:
		_real_main()
	except DownloadError:
		# The downloader has already reported the error; just signal failure.
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')
4704
# Script entry point when executed directly (not imported).
if __name__ == '__main__':
	main()
4707
4708 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: