e7d61e18d7dcffed50defe9797b17827200cc538
[youtube-dl.git] / youtube_dl / __init__.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Credited contributors to youtube-dl.
__authors__  = (
        'Ricardo Garcia Gonzalez',
        'Danny Colligan',
        'Benjamin Johnson',
        'Vasyl\' Vavrychuk',
        'Witold Baryluk',
        'Paweł Paprota',
        'Gergely Imreh',
        'Rogério Brito',
        'Philipp Hagemeister',
        'Sören Schulze',
        'Kevin Ngo',
        'Ori Avtalion',
        'shizeeg',
        )

__license__ = 'Public Domain'
__version__ = '2012.01.25'

# Location of the released single-file script; presumably fetched by a
# self-update routine defined outside this chunk — confirm before relying on it.
UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
24
25
26 import cookielib
27 import datetime
28 import getpass
29 import gzip
30 import htmlentitydefs
31 import HTMLParser
32 import httplib
33 import locale
34 import math
35 import netrc
36 import optparse
37 import os
38 import os.path
39 import re
40 import shlex
41 import socket
42 import string
43 import subprocess
44 import sys
45 import time
46 import urllib
47 import urllib2
48 import warnings
49 import zlib
50
51 if os.name == 'nt':
52         import ctypes
53
54 try:
55         import email.utils
56 except ImportError: # Python 2.4
57         import email.Utils
58 try:
59         import cStringIO as StringIO
60 except ImportError:
61         import StringIO
62
63 # parse_qs was moved from the cgi module to the urlparse module recently.
64 try:
65         from urlparse import parse_qs
66 except ImportError:
67         from cgi import parse_qs
68
69 try:
70         import lxml.etree
71 except ImportError:
72         pass # Handled below
73
74 try:
75         import xml.etree.ElementTree
76 except ImportError: # Python<2.5: Not officially supported, but let it slip
77         warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
78
# Default HTTP headers forced onto every request (see YoutubeDLHandler.http_request).
std_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-us,en;q=0.5',
}
86
try:
        import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
        import re
        class json(object):
                """Minimal drop-in replacement for the stdlib json module.

                Only implements loads(); installed on Python < 2.6 where the
                json module does not exist.
                """
                @staticmethod
                def loads(s):
                        # The input is a UTF-8 byte string; parse it as unicode.
                        s = s.decode('UTF-8')
                        def raiseError(msg, i):
                                # All parse errors carry the offset and the remaining input.
                                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
                        def skipSpace(i, expectMore=True):
                                # Skip JSON whitespace; with expectMore, fail at end of input.
                                while i < len(s) and s[i] in ' \t\r\n':
                                        i += 1
                                if expectMore:
                                        if i >= len(s):
                                                raiseError('Premature end', i)
                                return i
                        def decodeEscape(match):
                                # Translate one backslash escape (match group 1) into its character.
                                esc = match.group(1)
                                _STATIC = {
                                        '"': '"',
                                        '\\': '\\',
                                        '/': '/',
                                        'b': unichr(0x8),
                                        'f': unichr(0xc),
                                        'n': '\n',
                                        'r': '\r',
                                        't': '\t',
                                }
                                if esc in _STATIC:
                                        return _STATIC[esc]
                                if esc[0] == 'u':
                                        if len(esc) == 1+4:
                                                # Plain \uXXXX escape.
                                                return unichr(int(esc[1:5], 16))
                                        if len(esc) == 5+6 and esc[5:7] == '\\u':
                                                # UTF-16 surrogate pair: \uD8xx\uDCxx -> astral code point.
                                                hi = int(esc[1:5], 16)
                                                low = int(esc[7:11], 16)
                                                return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                                raise ValueError('Unknown escape ' + str(esc))
                        def parseString(i):
                                # i points at the opening quote; returns (next_index, text).
                                i += 1
                                e = i
                                while True:
                                        e = s.index('"', e)
                                        # Count backslashes immediately before the quote; an odd
                                        # count means the quote itself is escaped, keep scanning.
                                        bslashes = 0
                                        while s[e-bslashes-1] == '\\':
                                                bslashes += 1
                                        if bslashes % 2 == 1:
                                                e += 1
                                                continue
                                        break
                                # Resolve escapes: surrogate pairs first, then \uXXXX, then single chars.
                                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                                stri = rexp.sub(decodeEscape, s[i:e])
                                return (e+1,stri)
                        def parseObj(i):
                                # i points at '{'; returns (next_index, dict).
                                i += 1
                                res = {}
                                i = skipSpace(i)
                                if s[i] == '}': # Empty dictionary
                                        return (i+1,res)
                                while True:
                                        if s[i] != '"':
                                                raiseError('Expected a string object key', i)
                                        i,key = parseString(i)
                                        i = skipSpace(i)
                                        if i >= len(s) or s[i] != ':':
                                                raiseError('Expected a colon', i)
                                        i,val = parse(i+1)
                                        res[key] = val
                                        i = skipSpace(i)
                                        if s[i] == '}':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected comma or closing curly brace', i)
                                        i = skipSpace(i+1)
                        def parseArray(i):
                                # i points at '['; returns (next_index, list).
                                res = []
                                i = skipSpace(i+1)
                                if s[i] == ']': # Empty array
                                        return (i+1,res)
                                while True:
                                        i,val = parse(i)
                                        res.append(val)
                                        i = skipSpace(i) # Raise exception if premature end
                                        if s[i] == ']':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected a comma or closing bracket', i)
                                        i = skipSpace(i+1)
                        def parseDiscrete(i):
                                # Parse the literals true / false / null.
                                for k,v in {'true': True, 'false': False, 'null': None}.items():
                                        if s.startswith(k, i):
                                                return (i+len(k), v)
                                raiseError('Not a boolean (or null)', i)
                        def parseNumber(i):
                                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                                if mobj is None:
                                        raiseError('Not a number', i)
                                nums = mobj.group(1)
                                # A fraction or exponent makes it a float; otherwise an int.
                                if '.' in nums or 'e' in nums or 'E' in nums:
                                        return (i+len(nums), float(nums))
                                return (i+len(nums), int(nums))
                        # Dispatch on the first non-space character; numbers are the fallback.
                        CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
                        def parse(i):
                                i = skipSpace(i)
                                i,res = CHARMAP.get(s[i], parseNumber)(i)
                                i = skipSpace(i, False)
                                return (i,res)
                        i,res = parse(0)
                        if i < len(s):
                                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
                        return res
199
def preferredencoding():
        """Get preferred encoding.

        Returns locale.getpreferredencoding() when it names a usable codec,
        falling back to UTF-8 otherwise. Always returns a str.
        """
        try:
                pref = locale.getpreferredencoding()
                # Verify the reported encoding actually exists as a codec.
                u'TEST'.encode(pref)
        except Exception:
                # The locale may be unset or may name an unknown codec.
                pref = 'UTF-8'
        return pref
215
216
def htmlentity_transform(matchobj):
        """Transforms an HTML entity to a Unicode character.

        This function receives a match object and is intended to be used with
        the re.sub() function.
        """
        entity = matchobj.group(1)

        # Known non-numeric HTML entity
        if entity in htmlentitydefs.name2codepoint:
                return unichr(htmlentitydefs.name2codepoint[entity])

        # Numeric character reference: decimal (&#160;) or hex (&#xA0;).
        # The previous pattern x?\d+ could never match the hex digits a-f.
        mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
        if mobj is not None:
                numstr = mobj.group(1)
                if numstr.startswith(u'x'):
                        base = 16
                        numstr = numstr[1:]
                else:
                        base = 10
                return unichr(long(numstr, base))

        # Unknown entity in name, return its literal representation
        return (u'&%s;' % entity)
242
243
def sanitize_title(utitle):
        """Sanitizes a video title so it could be used as part of a filename."""
        # Resolve HTML entities first, then neutralize path separators.
        decoded = re.compile(u'(?u)&(.+?);').sub(htmlentity_transform, utitle)
        return decoded.replace(unicode(os.sep), u'%')
248
249
def sanitize_open(filename, open_mode):
        """Try to open the given filename, and slightly tweak it if this fails.

        Attempts to open the given filename. If this fails, it tries to change
        the filename slightly, step by step, until it's either able to open it
        or it fails and raises a final exception, like the standard open()
        function.

        It returns the tuple (stream, definitive_file_name).
        """
        try:
                if filename == u'-':
                        # '-' means standard output. On Windows stdout must be
                        # switched to binary mode so the data is not mangled.
                        if sys.platform == 'win32':
                                import msvcrt
                                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
                        return (sys.stdout, filename)
                stream = open(_encodeFilename(filename), open_mode)
                return (stream, filename)
        except (IOError, OSError), err:
                # In case of error, try to remove win32 forbidden chars
                filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

                # An exception here should be caught in the caller
                stream = open(_encodeFilename(filename), open_mode)
                return (stream, filename)
275
276
def timeconvert(timestr):
        """Convert RFC 2822 defined time string into system timestamp.

        Returns None when the string cannot be parsed.
        """
        parsed = email.utils.parsedate_tz(timestr)
        if parsed is None:
                return None
        return email.utils.mktime_tz(parsed)
284
285 def _simplify_title(title):
286         expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
287         return expr.sub(u'_', title).strip(u'_')
288
289 def _orderedSet(iterable):
290         """ Remove all duplicates from the input iterable """
291         res = []
292         for el in iterable:
293                 if el not in res:
294                         res.append(el)
295         return res
296
def _unescapeHTML(s):
        """
        Resolve HTML entities in s.

        @param s a string (of type unicode)
        """
        assert type(s) == type(u'')
        return HTMLParser.HTMLParser().unescape(s)
305
306 def _encodeFilename(s):
307         """
308         @param s The name of the file (of type unicode)
309         """
310
311         assert type(s) == type(u'')
312
313         if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
314                 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
315                 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
316                 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
317                 return s
318         else:
319                 return s.encode(sys.getfilesystemencoding(), 'ignore')
320
class DownloadError(Exception):
        """Download Error exception.

        Raised by FileDownloader objects that are not configured to continue
        on errors; carries the corresponding error message.
        """
329
330
class SameFileError(Exception):
        """Same File exception.

        Raised by FileDownloader objects when they detect that multiple
        downloads would be written to the same file on disk.
        """
338
339
class PostProcessingError(Exception):
        """Post Processing exception.

        May be raised by a PostProcessor's .run() method to signal an error in
        the postprocessing task.
        """
347
class MaxDownloadsReached(Exception):
        """Raised once the --max-downloads limit has been reached."""
351
352
class UnavailableVideoError(Exception):
        """Unavailable Format exception.

        Raised when a video is requested in a format that is not available for
        that video.
        """
360
361
class ContentTooShortError(Exception):
        """Content Too Short exception.

        Raised by FileDownloader objects when a downloaded file is smaller
        than the size the server announced, which indicates the connection was
        probably interrupted.
        """
        # Both counters are in bytes.
        downloaded = None
        expected = None

        def __init__(self, downloaded, expected):
                self.downloaded = downloaded
                self.expected = expected
376
377
class YoutubeDLHandler(urllib2.HTTPHandler):
        """Handler for HTTP requests and responses.

        This class, when installed with an OpenerDirector, automatically adds
        the standard headers to every HTTP request and handles gzipped and
        deflated responses from web servers. If compression is to be avoided in
        a particular request, the original request in the program code only has
        to include the HTTP header "Youtubedl-No-Compression", which will be
        removed before making the real request.

        Part of this code was copied from:

        http://techknack.net/python-urllib2-handlers/

        Andrew Rowls, the author of that code, agreed to release it to the
        public domain.
        """

        @staticmethod
        def deflate(data):
                # Decompress a raw deflate stream (negative wbits means no zlib
                # header); fall back to a regular zlib stream on error.
                try:
                        return zlib.decompress(data, -zlib.MAX_WBITS)
                except zlib.error:
                        return zlib.decompress(data)

        @staticmethod
        def addinfourl_wrapper(stream, headers, url, code):
                # Build an addinfourl carrying the HTTP status code; older
                # Pythons take no code argument, so set the attribute afterwards.
                if hasattr(urllib2.addinfourl, 'getcode'):
                        return urllib2.addinfourl(stream, headers, url, code)
                ret = urllib2.addinfourl(stream, headers, url)
                ret.code = code
                return ret

        def http_request(self, req):
                # Force the standard headers, replacing any same-named header
                # already present on the request.
                for h in std_headers:
                        if h in req.headers:
                                del req.headers[h]
                        req.add_header(h, std_headers[h])
                # The marker header disables compression for this request; the
                # odd casing matches urllib2's capitalize()d header storage.
                if 'Youtubedl-no-compression' in req.headers:
                        if 'Accept-encoding' in req.headers:
                                del req.headers['Accept-encoding']
                        del req.headers['Youtubedl-no-compression']
                return req

        def http_response(self, req, resp):
                # Transparently decompress the body, preserving headers, URL,
                # status code and message of the original response.
                old_resp = resp
                # gzip
                if resp.headers.get('Content-encoding', '') == 'gzip':
                        gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
                        resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
                        resp.msg = old_resp.msg
                # deflate
                if resp.headers.get('Content-encoding', '') == 'deflate':
                        gz = StringIO.StringIO(self.deflate(resp.read()))
                        resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
                        resp.msg = old_resp.msg
                return resp
436
437 class FileDownloader(object):
438         """File Downloader class.
439
440         File downloader objects are the ones responsible of downloading the
441         actual video file and writing it to disk if the user has requested
442         it, among some other tasks. In most cases there should be one per
443         program. As, given a video URL, the downloader doesn't know how to
444         extract all the needed information, task that InfoExtractors do, it
445         has to pass the URL to one of them.
446
447         For this, file downloader objects have a method that allows
448         InfoExtractors to be registered in a given order. When it is passed
449         a URL, the file downloader handles it to the first InfoExtractor it
450         finds that reports being able to handle it. The InfoExtractor extracts
451         all the information about the video or videos the URL refers to, and
452         asks the FileDownloader to process the video information, possibly
453         downloading the video.
454
455         File downloaders accept a lot of parameters. In order not to saturate
456         the object constructor with arguments, it receives a dictionary of
457         options instead. These options are available through the params
458         attribute for the InfoExtractors to use. The FileDownloader also
459         registers itself as the downloader in charge for the InfoExtractors
460         that are added to it, so this is a "mutual registration".
461
462         Available options:
463
464         username:         Username for authentication purposes.
465         password:         Password for authentication purposes.
466         usenetrc:         Use netrc for authentication instead.
467         quiet:            Do not print messages to stdout.
468         forceurl:         Force printing final URL.
469         forcetitle:       Force printing title.
470         forcethumbnail:   Force printing thumbnail URL.
471         forcedescription: Force printing description.
472         forcefilename:    Force printing final filename.
473         simulate:         Do not download the video files.
474         format:           Video format code.
475         format_limit:     Highest quality format to try.
476         outtmpl:          Template for output names.
477         ignoreerrors:     Do not stop on download errors.
478         ratelimit:        Download speed limit, in bytes/sec.
479         nooverwrites:     Prevent overwriting files.
480         retries:          Number of times to retry for HTTP error 5xx
481         continuedl:       Try to continue downloads if possible.
482         noprogress:       Do not print the progress bar.
483         playliststart:    Playlist item to start at.
484         playlistend:      Playlist item to end at.
485         matchtitle:       Download only matching titles.
486         rejecttitle:      Reject downloads for matching titles.
487         logtostderr:      Log messages to stderr instead of stdout.
488         consoletitle:     Display progress in console window's titlebar.
489         nopart:           Do not use temporary .part files.
490         updatetime:       Use the Last-modified header to set output file timestamps.
491         writedescription: Write the video description to a .description file
492         writeinfojson:    Write the video description to a .info.json file
493         """
494
495         params = None
496         _ies = []
497         _pps = []
498         _download_retcode = None
499         _num_downloads = None
500         _screen_file = None
501
502         def __init__(self, params):
503                 """Create a FileDownloader object with the given options."""
504                 self._ies = []
505                 self._pps = []
506                 self._download_retcode = 0
507                 self._num_downloads = 0
508                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
509                 self.params = params
510
511         @staticmethod
512         def format_bytes(bytes):
513                 if bytes is None:
514                         return 'N/A'
515                 if type(bytes) is str:
516                         bytes = float(bytes)
517                 if bytes == 0.0:
518                         exponent = 0
519                 else:
520                         exponent = long(math.log(bytes, 1024.0))
521                 suffix = 'bkMGTPEZY'[exponent]
522                 converted = float(bytes) / float(1024 ** exponent)
523                 return '%.2f%s' % (converted, suffix)
524
525         @staticmethod
526         def calc_percent(byte_counter, data_len):
527                 if data_len is None:
528                         return '---.-%'
529                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
530
531         @staticmethod
532         def calc_eta(start, now, total, current):
533                 if total is None:
534                         return '--:--'
535                 dif = now - start
536                 if current == 0 or dif < 0.001: # One millisecond
537                         return '--:--'
538                 rate = float(current) / dif
539                 eta = long((float(total) - float(current)) / rate)
540                 (eta_mins, eta_secs) = divmod(eta, 60)
541                 if eta_mins > 99:
542                         return '--:--'
543                 return '%02d:%02d' % (eta_mins, eta_secs)
544
545         @staticmethod
546         def calc_speed(start, now, bytes):
547                 dif = now - start
548                 if bytes == 0 or dif < 0.001: # One millisecond
549                         return '%10s' % '---b/s'
550                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
551
552         @staticmethod
553         def best_block_size(elapsed_time, bytes):
554                 new_min = max(bytes / 2.0, 1.0)
555                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
556                 if elapsed_time < 0.001:
557                         return long(new_max)
558                 rate = bytes / elapsed_time
559                 if rate > new_max:
560                         return long(new_max)
561                 if rate < new_min:
562                         return long(new_min)
563                 return long(rate)
564
565         @staticmethod
566         def parse_bytes(bytestr):
567                 """Parse a string indicating a byte quantity into a long integer."""
568                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
569                 if matchobj is None:
570                         return None
571                 number = float(matchobj.group(1))
572                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
573                 return long(round(number * multiplier))
574
575         def add_info_extractor(self, ie):
576                 """Add an InfoExtractor object to the end of the list."""
577                 self._ies.append(ie)
578                 ie.set_downloader(self)
579
580         def add_post_processor(self, pp):
581                 """Add a PostProcessor object to the end of the chain."""
582                 self._pps.append(pp)
583                 pp.set_downloader(self)
584
        def to_screen(self, message, skip_eol=False):
                """Print message to the screen file (stdout or stderr) unless quiet.

                message must be unicode; skip_eol suppresses the trailing newline
                (used to redraw the progress line in place).
                """
                assert type(message) == type(u'')
                if not self.params.get('quiet', False):
                        terminator = [u'\n', u''][skip_eol]
                        output = message + terminator

                        if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
                                output = output.encode(preferredencoding(), 'ignore')
                        self._screen_file.write(output)
                        self._screen_file.flush()
596
597         def to_stderr(self, message):
598                 """Print message to stderr."""
599                 print >>sys.stderr, message.encode(preferredencoding())
600
        def to_cons_title(self, message):
                """Set console/terminal window title to message (if 'consoletitle' is set)."""
                if not self.params.get('consoletitle', False):
                        return
                if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
                        # c_wchar_p() might not be necessary if `message` is
                        # already of type unicode()
                        ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
                elif 'TERM' in os.environ:
                        # xterm-compatible escape sequence: OSC 0 ; <title> BEL
                        sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
611
612         def fixed_template(self):
613                 """Checks if the output template is fixed."""
614                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
615
616         def trouble(self, message=None):
617                 """Determine action to take when a download problem appears.
618
619                 Depending on if the downloader has been configured to ignore
620                 download errors or not, this method may throw an exception or
621                 not when errors are found, after printing the message.
622                 """
623                 if message is not None:
624                         self.to_stderr(message)
625                 if not self.params.get('ignoreerrors', False):
626                         raise DownloadError(message)
627                 self._download_retcode = 1
628
629         def slow_down(self, start_time, byte_counter):
630                 """Sleep if the download speed is over the rate limit."""
631                 rate_limit = self.params.get('ratelimit', None)
632                 if rate_limit is None or byte_counter == 0:
633                         return
634                 now = time.time()
635                 elapsed = now - start_time
636                 if elapsed <= 0.0:
637                         return
638                 speed = float(byte_counter) / elapsed
639                 if speed > rate_limit:
640                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
641
642         def temp_name(self, filename):
643                 """Returns a temporary filename for the given filename."""
644                 if self.params.get('nopart', False) or filename == u'-' or \
645                                 (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
646                         return filename
647                 return filename + u'.part'
648
649         def undo_temp_name(self, filename):
650                 if filename.endswith(u'.part'):
651                         return filename[:-len(u'.part')]
652                 return filename
653
654         def try_rename(self, old_filename, new_filename):
655                 try:
656                         if old_filename == new_filename:
657                                 return
658                         os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
659                 except (IOError, OSError), err:
660                         self.trouble(u'ERROR: unable to rename file')
661
662         def try_utime(self, filename, last_modified_hdr):
663                 """Try to set the last-modified time of the given file."""
664                 if last_modified_hdr is None:
665                         return
666                 if not os.path.isfile(_encodeFilename(filename)):
667                         return
668                 timestr = last_modified_hdr
669                 if timestr is None:
670                         return
671                 filetime = timeconvert(timestr)
672                 if filetime is None:
673                         return filetime
674                 try:
675                         os.utime(filename, (time.time(), filetime))
676                 except:
677                         pass
678                 return filetime
679
680         def report_writedescription(self, descfn):
681                 """ Report that the description file is being written """
682                 self.to_screen(u'[info] Writing video description to: ' + descfn)
683
684         def report_writeinfojson(self, infofn):
685                 """ Report that the metadata file has been written """
686                 self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
687
688         def report_destination(self, filename):
689                 """Report destination filename."""
690                 self.to_screen(u'[download] Destination: ' + filename)
691
692         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
693                 """Report download progress."""
694                 if self.params.get('noprogress', False):
695                         return
696                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
697                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
698                 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
699                                 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
700
701         def report_resuming_byte(self, resume_len):
702                 """Report attempt to resume at given byte."""
703                 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
704
705         def report_retry(self, count, retries):
706                 """Report retry in case of HTTP error 5xx"""
707                 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
708
709         def report_file_already_downloaded(self, file_name):
710                 """Report file has already been fully downloaded."""
711                 try:
712                         self.to_screen(u'[download] %s has already been downloaded' % file_name)
713                 except (UnicodeEncodeError), err:
714                         self.to_screen(u'[download] The file has already been downloaded')
715
716         def report_unable_to_resume(self):
717                 """Report it was impossible to resume download."""
718                 self.to_screen(u'[download] Unable to resume')
719
720         def report_finish(self):
721                 """Report download finished."""
722                 if self.params.get('noprogress', False):
723                         self.to_screen(u'[download] Download completed')
724                 else:
725                         self.to_screen(u'')
726
727         def increment_downloads(self):
728                 """Increment the ordinal that assigns a number to each file."""
729                 self._num_downloads += 1
730
731         def prepare_filename(self, info_dict):
732                 """Generate the output filename."""
733                 try:
734                         template_dict = dict(info_dict)
735                         template_dict['epoch'] = unicode(long(time.time()))
736                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
737                         filename = self.params['outtmpl'] % template_dict
738                         return filename
739                 except (ValueError, KeyError), err:
740                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
741                         return None
742
743         def _match_entry(self, info_dict):
744                 """ Returns None iff the file should be downloaded """
745
746                 title = info_dict['title']
747                 matchtitle = self.params.get('matchtitle', False)
748                 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
749                         return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
750                 rejecttitle = self.params.get('rejecttitle', False)
751                 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
752                         return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
753                 return None
754
	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor.

		In order: applies title match/reject filters, enforces the
		'max_downloads' limit, handles the forced --get-* printings and
		simulate mode, writes the optional description and .info.json
		sidecar files, downloads the media (unless 'skip_download') and
		finally runs the postprocessing chain.
		"""

		# Title match/reject filters; a non-None reason means "skip".
		reason = self._match_entry(info_dict)
		if reason is not None:
			self.to_screen(u'[download] ' + reason)
			return

		# Abort the whole run once the download quota is exceeded.
		max_downloads = self.params.get('max_downloads')
		if max_downloads is not None:
			if self._num_downloads > int(max_downloads):
				raise MaxDownloadsReached()

		filename = self.prepare_filename(info_dict)

		# Forced printings
		if self.params.get('forcetitle', False):
			print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forceurl', False):
			print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
			print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcedescription', False) and 'description' in info_dict:
			print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcefilename', False) and filename is not None:
			print filename.encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forceformat', False):
			print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')

		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			return

		# prepare_filename() reports its own error and returns None.
		if filename is None:
			return

		# Create the destination directory if needed.
		try:
			dn = os.path.dirname(_encodeFilename(filename))
			if dn != '' and not os.path.exists(dn): # dn is already encoded
				os.makedirs(dn)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to create directory ' + unicode(err))
			return

		# Optional sidecar file with the plain-text description.
		if self.params.get('writedescription', False):
			try:
				descfn = filename + u'.description'
				self.report_writedescription(descfn)
				descfile = open(_encodeFilename(descfn), 'wb')
				try:
					descfile.write(info_dict['description'].encode('utf-8'))
				finally:
					descfile.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write description file ' + descfn)
				return

		# Optional sidecar file with the full metadata as JSON.
		if self.params.get('writeinfojson', False):
			infofn = filename + u'.info.json'
			self.report_writeinfojson(infofn)
			# Probe that a usable json module is available (it may be a
			# stub on old Pythons without simplejson).
			try:
				json.dump
			except (NameError,AttributeError):
				self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
				return
			try:
				infof = open(_encodeFilename(infofn), 'wb')
				try:
					# Leave out 'urlhandle' (a live response object) when
					# serializing.
					json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
					json.dump(json_info_dict, infof)
				finally:
					infof.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
				return

		# Download the media itself, honoring --no-overwrites.
		if not self.params.get('skip_download', False):
			if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
				success = True
			else:
				try:
					success = self._do_download(filename, info_dict)
				except (OSError, IOError), err:
					raise UnavailableVideoError
				except (urllib2.URLError, httplib.HTTPException, socket.error), err:
					self.trouble(u'ERROR: unable to download video data: %s' % str(err))
					return
				except (ContentTooShortError, ), err:
					self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
					return

			if success:
				try:
					self.post_process(filename, info_dict)
				except (PostProcessingError), err:
					self.trouble(u'ERROR: postprocessing: %s' % str(err))
					return
852
853         def download(self, url_list):
854                 """Download a given list of URLs."""
855                 if len(url_list) > 1 and self.fixed_template():
856                         raise SameFileError(self.params['outtmpl'])
857
858                 for url in url_list:
859                         suitable_found = False
860                         for ie in self._ies:
861                                 # Go to next InfoExtractor if not suitable
862                                 if not ie.suitable(url):
863                                         continue
864
865                                 # Suitable InfoExtractor found
866                                 suitable_found = True
867
868                                 # Extract information from URL and process it
869                                 ie.extract(url)
870
871                                 # Suitable InfoExtractor had been found; go to next URL
872                                 break
873
874                         if not suitable_found:
875                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
876
877                 return self._download_retcode
878
879         def post_process(self, filename, ie_info):
880                 """Run the postprocessing chain on the given file."""
881                 info = dict(ie_info)
882                 info['filepath'] = filename
883                 for pp in self._pps:
884                         info = pp.run(info)
885                         if info is None:
886                                 break
887
	def _download_with_rtmpdump(self, filename, url, player_url):
		"""Download an RTMP stream by shelling out to the rtmpdump tool.

		Downloads into a temporary file and renames it into place on
		success.  Returns True on success, False otherwise (after the
		failure has been reported).
		"""
		self.report_destination(filename)
		tmpfilename = self.temp_name(filename)

		# Check that rtmpdump is callable at all before starting.
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrupted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		# ([a, b][cond] is a pre-ternary py2 idiom: selects b when cond.)
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
		args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
		# NOTE(review): direct key access -- raises KeyError if 'verbose'
		# is ever missing from params; presumably the option parser always
		# sets it.  Confirm.
		if self.params['verbose']:
			try:
				import pipes
				shell_quote = lambda args: ' '.join(map(pipes.quote, args))
			except ImportError:
				shell_quote = repr
			self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
		retval = subprocess.call(args)
		# Keep restarting rtmpdump while it reports a resumable
		# interruption (2) or an error (1), watching file size growth.
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(_encodeFilename(tmpfilename))
			self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(_encodeFilename(tmpfilename))
			if prevsize == cursize and retval == 1:
				# Error with no progress: give up.
				break
			# Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
			if prevsize == cursize and retval == 2 and cursize > 1024:
				self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
				retval = 0
				break
		if retval == 0:
			self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
			self.try_rename(tmpfilename, filename)
			return True
		else:
			self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
			return False
932
	def _do_download(self, filename, info_dict):
		"""Download info_dict['url'] into filename over HTTP.

		RTMP URLs are delegated to _download_with_rtmpdump().  Supports
		resuming into a temporary file, retries on HTTP 5xx errors,
		adaptive block sizes, rate limiting and progress reporting.
		Returns True on success (including "already downloaded"), False
		on a failure that has already been reported via trouble().
		"""
		url = info_dict['url']
		player_url = info_dict.get('player_url', None)

		# Check file already present
		if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
			self.report_file_already_downloaded(filename)
			return True

		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url, player_url)

		tmpfilename = self.temp_name(filename)
		stream = None

		# Do not include the Accept-Encoding header
		headers = {'Youtubedl-no-compression': 'True'}
		# basic_request stays without a Range header so the full length
		# can be re-probed if a ranged request fails with 416.
		basic_request = urllib2.Request(url, None, headers)
		request = urllib2.Request(url, None, headers)

		# Establish possible resume length
		if os.path.isfile(_encodeFilename(tmpfilename)):
			resume_len = os.path.getsize(_encodeFilename(tmpfilename))
		else:
			resume_len = 0

		open_mode = 'wb'
		if resume_len != 0:
			if self.params.get('continuedl', False):
				self.report_resuming_byte(resume_len)
				request.add_header('Range','bytes=%d-' % resume_len)
				open_mode = 'ab'
			else:
				# Not resuming: ignore the stale partial file.
				resume_len = 0

		count = 0
		retries = self.params.get('retries', 0)
		while count <= retries:
			# Establish connection
			try:
				# NOTE(review): the cached 'urlhandle' is assigned to
				# 'data' but immediately overwritten by the urlopen()
				# call below, so it is never actually used.  Looks
				# unintentional; confirm the intended behavior.
				if count == 0 and 'urlhandle' in info_dict:
					data = info_dict['urlhandle']
				data = urllib2.urlopen(request)
				break
			except (urllib2.HTTPError, ), err:
				if (err.code < 500 or err.code >= 600) and err.code != 416:
					# Unexpected HTTP error
					raise
				elif err.code == 416:
					# Unable to resume (requested range not satisfiable)
					try:
						# Open the connection again without the range header
						data = urllib2.urlopen(basic_request)
						content_length = data.info()['Content-Length']
					except (urllib2.HTTPError, ), err:
						if err.code < 500 or err.code >= 600:
							raise
					else:
						# Examine the reported length
						if (content_length is not None and
								(resume_len - 100 < long(content_length) < resume_len + 100)):
							# The file had already been fully downloaded.
							# Explanation to the above condition: in issue #175 it was revealed that
							# YouTube sometimes adds or removes a few bytes from the end of the file,
							# changing the file size slightly and causing problems for some users. So
							# I decided to implement a suggested change and consider the file
							# completely downloaded if the file size differs less than 100 bytes from
							# the one in the hard drive.
							self.report_file_already_downloaded(filename)
							self.try_rename(tmpfilename, filename)
							return True
						else:
							# The length does not match, we start the download over
							self.report_unable_to_resume()
							open_mode = 'wb'
							break
			# Retry
			count += 1
			if count <= retries:
				self.report_retry(count, retries)

		if count > retries:
			self.trouble(u'ERROR: giving up after %s retries' % retries)
			return False

		data_len = data.info().get('Content-length', None)
		if data_len is not None:
			# When resuming, Content-Length only covers the remainder.
			data_len = long(data_len) + resume_len
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0 + resume_len
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			if len(data_block) == 0:
				break
			byte_counter += len(data_block)

			# Open file just in time
			if stream is None:
				try:
					(stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
					assert stream is not None
					filename = self.undo_temp_name(tmpfilename)
					self.report_destination(filename)
				except (OSError, IOError), err:
					self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
					return False
			try:
				stream.write(data_block)
			except (IOError, OSError), err:
				self.trouble(u'\nERROR: unable to write data: %s' % str(err))
				return False
			# Adapt the next read size to the observed throughput.
			block_size = self.best_block_size(after - before, len(data_block))

			# Progress message
			speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
			if data_len is None:
				self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
			else:
				percent_str = self.calc_percent(byte_counter, data_len)
				eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
				self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter - resume_len)

		if stream is None:
			self.trouble(u'\nERROR: Did not get any data blocks')
			return False
		stream.close()
		self.report_finish()
		if data_len is not None and byte_counter != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		self.try_rename(tmpfilename, filename)

		# Update file modification time
		if self.params.get('updatetime', True):
			info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))

		return True
1078
1079
class InfoExtractor(object):
	"""Base class for all information extractors.

	An information extractor receives a URL and extracts information
	about the video (or videos) it refers to: the real video URL, the
	title, a simplified title, the uploader and more.  The result is a
	dictionary that is passed to the FileDownloader, which may then
	download the video file, among other outcomes.  Required fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	Optional fields, used only by the forced-printing functions (their
	primary purpose is to let youtube-dl back a video search front end
	such as youtube2mp3):

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Subclasses should redefine _real_initialize() and _real_extract(),
	define a _VALID_URL regexp, and usually be added to the list of
	extractors.
	"""

	# Lazily-set state: authentication etc. happens on first use.
	_ready = False
	_downloader = None

	def __init__(self, downloader=None):
		"""Create the extractor; optionally attach a downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def set_downloader(self, downloader):
		"""Attach the FileDownloader this extractor reports to."""
		self._downloader = downloader

	def suitable(self, url):
		"""Return True if this extractor can handle the given URL."""
		return bool(re.match(self._VALID_URL, url))

	def initialize(self):
		"""Perform one-time initialization (authentication, etc)."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extract URL information and return it in a list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
1148
1149
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Accepts bare video IDs as well as watch/watch_popup/embed/e/v-style
	# URLs on youtube.com, youtube-nocookie.com and youtu.be; playlist,
	# artist and view_play_list pages are excluded.  The second group
	# captures the video ID.
	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
	# URL fetched to set the site language (hl=en).
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	# Machine name used to look up credentials in ~/.netrc.
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
	# Same, but ranking the free (WebM) formats above proprietary ones.
	_available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
	# Map of format code -> container/extension.
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'44': 'webm',
		'45': 'webm',
	}
	# Map of format code -> display dimensions ('???' when unknown).
	_video_dimensions = {
		'5': '240x400',
		'6': '???',
		'13': '???',
		'17': '144x176',
		'18': '360x640',
		'22': '720x1280',
		'34': '360x640',
		'35': '480x854',
		'37': '1080x1920',
		'38': '3072x4096',
		'43': '360x640',
		'44': '480x854',
		'45': '720x1280',
	}
	IE_NAME = u'youtube'
1188
1189         def report_lang(self):
1190                 """Report attempt to set language."""
1191                 self._downloader.to_screen(u'[youtube] Setting language')
1192
1193         def report_login(self):
1194                 """Report attempt to log in."""
1195                 self._downloader.to_screen(u'[youtube] Logging in')
1196
1197         def report_age_confirmation(self):
1198                 """Report attempt to confirm age."""
1199                 self._downloader.to_screen(u'[youtube] Confirming age')
1200
1201         def report_video_webpage_download(self, video_id):
1202                 """Report attempt to download video webpage."""
1203                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1204
1205         def report_video_info_webpage_download(self, video_id):
1206                 """Report attempt to download video info webpage."""
1207                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1208
1209         def report_information_extraction(self, video_id):
1210                 """Report attempt to extract video information."""
1211                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1212
1213         def report_unavailable_format(self, video_id, format):
1214                 """Report extracted video URL."""
1215                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1216
1217         def report_rtmp_download(self):
1218                 """Indicate the download will use the RTMP protocol."""
1219                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1220
1221         def _print_formats(self, formats):
1222                 print 'Available formats:'
1223                 for x in formats:
1224                         print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1225
	def _real_initialize(self):
		"""Prepare the YouTube session.

		Sets the site language to English, then — if credentials are
		available from the downloader params or ~/.netrc — logs in and
		confirms the age gate.  Failures are reported through the
		downloader (warnings, or trouble() for age confirmation); nothing
		is raised from here.
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				# A broken or missing .netrc only yields a warning.
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':		'/',
				'action_login':	'Log In',
				'username':	username,
				'password':	password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the login form is still present in the response, the
			# credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':		'/',
				'action_confirm':	'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			# Unlike the warnings above, a failed age confirmation is
			# reported as an error.
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return
1294
1295         def _real_extract(self, url):
1296                 # Extract video id from URL
1297                 mobj = re.match(self._VALID_URL, url)
1298                 if mobj is None:
1299                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1300                         return
1301                 video_id = mobj.group(2)
1302
1303                 # Get video webpage
1304                 self.report_video_webpage_download(video_id)
1305                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1306                 try:
1307                         video_webpage = urllib2.urlopen(request).read()
1308                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1309                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1310                         return
1311
1312                 # Attempt to extract SWF player URL
1313                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1314                 if mobj is not None:
1315                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1316                 else:
1317                         player_url = None
1318
1319                 # Get video info
1320                 self.report_video_info_webpage_download(video_id)
1321                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1322                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1323                                         % (video_id, el_type))
1324                         request = urllib2.Request(video_info_url)
1325                         try:
1326                                 video_info_webpage = urllib2.urlopen(request).read()
1327                                 video_info = parse_qs(video_info_webpage)
1328                                 if 'token' in video_info:
1329                                         break
1330                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1331                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1332                                 return
1333                 if 'token' not in video_info:
1334                         if 'reason' in video_info:
1335                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1336                         else:
1337                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1338                         return
1339
1340                 # Start extracting information
1341                 self.report_information_extraction(video_id)
1342
1343                 # uploader
1344                 if 'author' not in video_info:
1345                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1346                         return
1347                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1348
1349                 # title
1350                 if 'title' not in video_info:
1351                         self._downloader.trouble(u'ERROR: unable to extract video title')
1352                         return
1353                 video_title = urllib.unquote_plus(video_info['title'][0])
1354                 video_title = video_title.decode('utf-8')
1355                 video_title = sanitize_title(video_title)
1356
1357                 # simplified title
1358                 simple_title = _simplify_title(video_title)
1359
1360                 # thumbnail image
1361                 if 'thumbnail_url' not in video_info:
1362                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1363                         video_thumbnail = ''
1364                 else:   # don't panic if we can't find it
1365                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1366
1367                 # upload date
1368                 upload_date = u'NA'
1369                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1370                 if mobj is not None:
1371                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1372                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1373                         for expression in format_expressions:
1374                                 try:
1375                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1376                                 except:
1377                                         pass
1378
1379                 # description
1380                 try:
1381                         lxml.etree
1382                 except NameError:
1383                         video_description = u'No description available.'
1384                         if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1385                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1386                                 if mobj is not None:
1387                                         video_description = mobj.group(1).decode('utf-8')
1388                 else:
1389                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1390                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1391                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1392                         # TODO use another parser
1393
1394                 # token
1395                 video_token = urllib.unquote_plus(video_info['token'][0])
1396
1397                 # Decide which formats to download
1398                 req_format = self._downloader.params.get('format', None)
1399
1400                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1401                         self.report_rtmp_download()
1402                         video_url_list = [(None, video_info['conn'][0])]
1403                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1404                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1405                         url_data = [parse_qs(uds) for uds in url_data_strs]
1406                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1407                         url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1408
1409                         format_limit = self._downloader.params.get('format_limit', None)
1410                         available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1411                         if format_limit is not None and format_limit in available_formats:
1412                                 format_list = available_formats[available_formats.index(format_limit):]
1413                         else:
1414                                 format_list = available_formats
1415                         existing_formats = [x for x in format_list if x in url_map]
1416                         if len(existing_formats) == 0:
1417                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1418                                 return
1419                         if self._downloader.params.get('listformats', None):
1420                                 self._print_formats(existing_formats)
1421                                 return
1422                         if req_format is None or req_format == 'best':
1423                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1424                         elif req_format == 'worst':
1425                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1426                         elif req_format in ('-1', 'all'):
1427                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1428                         else:
1429                                 # Specific formats. We pick the first in a slash-delimeted sequence.
1430                                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1431                                 req_formats = req_format.split('/')
1432                                 video_url_list = None
1433                                 for rf in req_formats:
1434                                         if rf in url_map:
1435                                                 video_url_list = [(rf, url_map[rf])]
1436                                                 break
1437                                 if video_url_list is None:
1438                                         self._downloader.trouble(u'ERROR: requested format not available')
1439                                         return
1440                 else:
1441                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1442                         return
1443
1444                 for format_param, video_real_url in video_url_list:
1445                         # At this point we have a new video
1446                         self._downloader.increment_downloads()
1447
1448                         # Extension
1449                         video_extension = self._video_extensions.get(format_param, 'flv')
1450
1451                         try:
1452                                 # Process video information
1453                                 self._downloader.process_info({
1454                                         'id':           video_id.decode('utf-8'),
1455                                         'url':          video_real_url.decode('utf-8'),
1456                                         'uploader':     video_uploader.decode('utf-8'),
1457                                         'upload_date':  upload_date,
1458                                         'title':        video_title,
1459                                         'stitle':       simple_title,
1460                                         'ext':          video_extension.decode('utf-8'),
1461                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1462                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1463                                         'description':  video_description,
1464                                         'player_url':   player_url,
1465                                 })
1466                         except UnavailableVideoError, err:
1467                                 self._downloader.trouble(u'\nERROR: unable to download video')
1468
1469
1470 class MetacafeIE(InfoExtractor):
1471         """Information Extractor for metacafe.com."""
1472
1473         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1474         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1475         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1476         _youtube_ie = None
1477         IE_NAME = u'metacafe'
1478
1479         def __init__(self, youtube_ie, downloader=None):
1480                 InfoExtractor.__init__(self, downloader)
1481                 self._youtube_ie = youtube_ie
1482
1483         def report_disclaimer(self):
1484                 """Report disclaimer retrieval."""
1485                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1486
1487         def report_age_confirmation(self):
1488                 """Report attempt to confirm age."""
1489                 self._downloader.to_screen(u'[metacafe] Confirming age')
1490
1491         def report_download_webpage(self, video_id):
1492                 """Report webpage download."""
1493                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1494
1495         def report_extraction(self, video_id):
1496                 """Report information extraction."""
1497                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1498
1499         def _real_initialize(self):
1500                 # Retrieve disclaimer
1501                 request = urllib2.Request(self._DISCLAIMER)
1502                 try:
1503                         self.report_disclaimer()
1504                         disclaimer = urllib2.urlopen(request).read()
1505                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1506                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1507                         return
1508
1509                 # Confirm age
1510                 disclaimer_form = {
1511                         'filters': '0',
1512                         'submit': "Continue - I'm over 18",
1513                         }
1514                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1515                 try:
1516                         self.report_age_confirmation()
1517                         disclaimer = urllib2.urlopen(request).read()
1518                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1519                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1520                         return
1521
1522         def _real_extract(self, url):
1523                 # Extract id and simplified title from URL
1524                 mobj = re.match(self._VALID_URL, url)
1525                 if mobj is None:
1526                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1527                         return
1528
1529                 video_id = mobj.group(1)
1530
1531                 # Check if video comes from YouTube
1532                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1533                 if mobj2 is not None:
1534                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1535                         return
1536
1537                 # At this point we have a new video
1538                 self._downloader.increment_downloads()
1539
1540                 simple_title = mobj.group(2).decode('utf-8')
1541
1542                 # Retrieve video webpage to extract further information
1543                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1544                 try:
1545                         self.report_download_webpage(video_id)
1546                         webpage = urllib2.urlopen(request).read()
1547                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1548                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1549                         return
1550
1551                 # Extract URL, uploader and title from webpage
1552                 self.report_extraction(video_id)
1553                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1554                 if mobj is not None:
1555                         mediaURL = urllib.unquote(mobj.group(1))
1556                         video_extension = mediaURL[-3:]
1557
1558                         # Extract gdaKey if available
1559                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1560                         if mobj is None:
1561                                 video_url = mediaURL
1562                         else:
1563                                 gdaKey = mobj.group(1)
1564                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1565                 else:
1566                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1567                         if mobj is None:
1568                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1569                                 return
1570                         vardict = parse_qs(mobj.group(1))
1571                         if 'mediaData' not in vardict:
1572                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1573                                 return
1574                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1575                         if mobj is None:
1576                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1577                                 return
1578                         mediaURL = mobj.group(1).replace('\\/', '/')
1579                         video_extension = mediaURL[-3:]
1580                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1581
1582                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1583                 if mobj is None:
1584                         self._downloader.trouble(u'ERROR: unable to extract title')
1585                         return
1586                 video_title = mobj.group(1).decode('utf-8')
1587                 video_title = sanitize_title(video_title)
1588
1589                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1590                 if mobj is None:
1591                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1592                         return
1593                 video_uploader = mobj.group(1)
1594
1595                 try:
1596                         # Process video information
1597                         self._downloader.process_info({
1598                                 'id':           video_id.decode('utf-8'),
1599                                 'url':          video_url.decode('utf-8'),
1600                                 'uploader':     video_uploader.decode('utf-8'),
1601                                 'upload_date':  u'NA',
1602                                 'title':        video_title,
1603                                 'stitle':       simple_title,
1604                                 'ext':          video_extension.decode('utf-8'),
1605                                 'format':       u'NA',
1606                                 'player_url':   None,
1607                         })
1608                 except UnavailableVideoError:
1609                         self._downloader.trouble(u'\nERROR: unable to download video')
1610
1611
1612 class DailymotionIE(InfoExtractor):
1613         """Information Extractor for Dailymotion"""
1614
1615         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1616         IE_NAME = u'dailymotion'
1617
1618         def __init__(self, downloader=None):
1619                 InfoExtractor.__init__(self, downloader)
1620
1621         def report_download_webpage(self, video_id):
1622                 """Report webpage download."""
1623                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1624
1625         def report_extraction(self, video_id):
1626                 """Report information extraction."""
1627                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1628
1629         def _real_extract(self, url):
1630                 # Extract id and simplified title from URL
1631                 mobj = re.match(self._VALID_URL, url)
1632                 if mobj is None:
1633                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1634                         return
1635
1636                 # At this point we have a new video
1637                 self._downloader.increment_downloads()
1638                 video_id = mobj.group(1)
1639
1640                 video_extension = 'flv'
1641
1642                 # Retrieve video webpage to extract further information
1643                 request = urllib2.Request(url)
1644                 request.add_header('Cookie', 'family_filter=off')
1645                 try:
1646                         self.report_download_webpage(video_id)
1647                         webpage = urllib2.urlopen(request).read()
1648                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1649                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1650                         return
1651
1652                 # Extract URL, uploader and title from webpage
1653                 self.report_extraction(video_id)
1654                 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1655                 if mobj is None:
1656                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1657                         return
1658                 sequence = urllib.unquote(mobj.group(1))
1659                 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1660                 if mobj is None:
1661                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1662                         return
1663                 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1664
1665                 # if needed add http://www.dailymotion.com/ if relative URL
1666
1667                 video_url = mediaURL
1668
1669                 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
1670                 if mobj is None:
1671                         self._downloader.trouble(u'ERROR: unable to extract title')
1672                         return
1673                 video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
1674                 video_title = sanitize_title(video_title)
1675                 simple_title = _simplify_title(video_title)
1676
1677                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1678                 if mobj is None:
1679                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1680                         return
1681                 video_uploader = mobj.group(1)
1682
1683                 try:
1684                         # Process video information
1685                         self._downloader.process_info({
1686                                 'id':           video_id.decode('utf-8'),
1687                                 'url':          video_url.decode('utf-8'),
1688                                 'uploader':     video_uploader.decode('utf-8'),
1689                                 'upload_date':  u'NA',
1690                                 'title':        video_title,
1691                                 'stitle':       simple_title,
1692                                 'ext':          video_extension.decode('utf-8'),
1693                                 'format':       u'NA',
1694                                 'player_url':   None,
1695                         })
1696                 except UnavailableVideoError:
1697                         self._downloader.trouble(u'\nERROR: unable to download video')
1698
1699
1700 class GoogleIE(InfoExtractor):
1701         """Information extractor for video.google.com."""
1702
1703         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1704         IE_NAME = u'video.google'
1705
1706         def __init__(self, downloader=None):
1707                 InfoExtractor.__init__(self, downloader)
1708
1709         def report_download_webpage(self, video_id):
1710                 """Report webpage download."""
1711                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1712
1713         def report_extraction(self, video_id):
1714                 """Report information extraction."""
1715                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1716
1717         def _real_extract(self, url):
1718                 # Extract id from URL
1719                 mobj = re.match(self._VALID_URL, url)
1720                 if mobj is None:
1721                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1722                         return
1723
1724                 # At this point we have a new video
1725                 self._downloader.increment_downloads()
1726                 video_id = mobj.group(1)
1727
1728                 video_extension = 'mp4'
1729
1730                 # Retrieve video webpage to extract further information
1731                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1732                 try:
1733                         self.report_download_webpage(video_id)
1734                         webpage = urllib2.urlopen(request).read()
1735                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1736                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1737                         return
1738
1739                 # Extract URL, uploader, and title from webpage
1740                 self.report_extraction(video_id)
1741                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1742                 if mobj is None:
1743                         video_extension = 'flv'
1744                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1745                 if mobj is None:
1746                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1747                         return
1748                 mediaURL = urllib.unquote(mobj.group(1))
1749                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1750                 mediaURL = mediaURL.replace('\\x26', '\x26')
1751
1752                 video_url = mediaURL
1753
1754                 mobj = re.search(r'<title>(.*)</title>', webpage)
1755                 if mobj is None:
1756                         self._downloader.trouble(u'ERROR: unable to extract title')
1757                         return
1758                 video_title = mobj.group(1).decode('utf-8')
1759                 video_title = sanitize_title(video_title)
1760                 simple_title = _simplify_title(video_title)
1761
1762                 # Extract video description
1763                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1764                 if mobj is None:
1765                         self._downloader.trouble(u'ERROR: unable to extract video description')
1766                         return
1767                 video_description = mobj.group(1).decode('utf-8')
1768                 if not video_description:
1769                         video_description = 'No description available.'
1770
1771                 # Extract video thumbnail
1772                 if self._downloader.params.get('forcethumbnail', False):
1773                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1774                         try:
1775                                 webpage = urllib2.urlopen(request).read()
1776                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1777                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1778                                 return
1779                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1780                         if mobj is None:
1781                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1782                                 return
1783                         video_thumbnail = mobj.group(1)
1784                 else:   # we need something to pass to process_info
1785                         video_thumbnail = ''
1786
1787                 try:
1788                         # Process video information
1789                         self._downloader.process_info({
1790                                 'id':           video_id.decode('utf-8'),
1791                                 'url':          video_url.decode('utf-8'),
1792                                 'uploader':     u'NA',
1793                                 'upload_date':  u'NA',
1794                                 'title':        video_title,
1795                                 'stitle':       simple_title,
1796                                 'ext':          video_extension.decode('utf-8'),
1797                                 'format':       u'NA',
1798                                 'player_url':   None,
1799                         })
1800                 except UnavailableVideoError:
1801                         self._downloader.trouble(u'\nERROR: unable to download video')
1802
1803
1804 class PhotobucketIE(InfoExtractor):
1805         """Information extractor for photobucket.com."""
1806
1807         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1808         IE_NAME = u'photobucket'
1809
1810         def __init__(self, downloader=None):
1811                 InfoExtractor.__init__(self, downloader)
1812
1813         def report_download_webpage(self, video_id):
1814                 """Report webpage download."""
1815                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1816
1817         def report_extraction(self, video_id):
1818                 """Report information extraction."""
1819                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1820
1821         def _real_extract(self, url):
1822                 # Extract id from URL
1823                 mobj = re.match(self._VALID_URL, url)
1824                 if mobj is None:
1825                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1826                         return
1827
1828                 # At this point we have a new video
1829                 self._downloader.increment_downloads()
1830                 video_id = mobj.group(1)
1831
1832                 video_extension = 'flv'
1833
1834                 # Retrieve video webpage to extract further information
1835                 request = urllib2.Request(url)
1836                 try:
1837                         self.report_download_webpage(video_id)
1838                         webpage = urllib2.urlopen(request).read()
1839                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1840                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1841                         return
1842
1843                 # Extract URL, uploader, and title from webpage
1844                 self.report_extraction(video_id)
1845                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1846                 if mobj is None:
1847                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1848                         return
1849                 mediaURL = urllib.unquote(mobj.group(1))
1850
1851                 video_url = mediaURL
1852
1853                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1854                 if mobj is None:
1855                         self._downloader.trouble(u'ERROR: unable to extract title')
1856                         return
1857                 video_title = mobj.group(1).decode('utf-8')
1858                 video_title = sanitize_title(video_title)
1859                 simple_title = _simplify_title(vide_title)
1860
1861                 video_uploader = mobj.group(2).decode('utf-8')
1862
1863                 try:
1864                         # Process video information
1865                         self._downloader.process_info({
1866                                 'id':           video_id.decode('utf-8'),
1867                                 'url':          video_url.decode('utf-8'),
1868                                 'uploader':     video_uploader,
1869                                 'upload_date':  u'NA',
1870                                 'title':        video_title,
1871                                 'stitle':       simple_title,
1872                                 'ext':          video_extension.decode('utf-8'),
1873                                 'format':       u'NA',
1874                                 'player_url':   None,
1875                         })
1876                 except UnavailableVideoError:
1877                         self._downloader.trouble(u'\nERROR: unable to download video')
1878
1879
1880 class YahooIE(InfoExtractor):
1881         """Information extractor for video.yahoo.com."""
1882
1883         # _VALID_URL matches all Yahoo! Video URLs
1884         # _VPAGE_URL matches only the extractable '/watch/' URLs
1885         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1886         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1887         IE_NAME = u'video.yahoo'
1888
1889         def __init__(self, downloader=None):
1890                 InfoExtractor.__init__(self, downloader)
1891
1892         def report_download_webpage(self, video_id):
1893                 """Report webpage download."""
1894                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1895
1896         def report_extraction(self, video_id):
1897                 """Report information extraction."""
1898                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1899
1900         def _real_extract(self, url, new_video=True):
1901                 # Extract ID from URL
1902                 mobj = re.match(self._VALID_URL, url)
1903                 if mobj is None:
1904                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1905                         return
1906
1907                 # At this point we have a new video
1908                 self._downloader.increment_downloads()
1909                 video_id = mobj.group(2)
1910                 video_extension = 'flv'
1911
1912                 # Rewrite valid but non-extractable URLs as
1913                 # extractable English language /watch/ URLs
1914                 if re.match(self._VPAGE_URL, url) is None:
1915                         request = urllib2.Request(url)
1916                         try:
1917                                 webpage = urllib2.urlopen(request).read()
1918                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1919                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1920                                 return
1921
1922                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1923                         if mobj is None:
1924                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1925                                 return
1926                         yahoo_id = mobj.group(1)
1927
1928                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1929                         if mobj is None:
1930                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1931                                 return
1932                         yahoo_vid = mobj.group(1)
1933
1934                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1935                         return self._real_extract(url, new_video=False)
1936
1937                 # Retrieve video webpage to extract further information
1938                 request = urllib2.Request(url)
1939                 try:
1940                         self.report_download_webpage(video_id)
1941                         webpage = urllib2.urlopen(request).read()
1942                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1943                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1944                         return
1945
1946                 # Extract uploader and title from webpage
1947                 self.report_extraction(video_id)
1948                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1949                 if mobj is None:
1950                         self._downloader.trouble(u'ERROR: unable to extract video title')
1951                         return
1952                 video_title = mobj.group(1).decode('utf-8')
1953                 simple_title = _simplify_title(video_title)
1954
1955                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1956                 if mobj is None:
1957                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1958                         return
1959                 video_uploader = mobj.group(1).decode('utf-8')
1960
1961                 # Extract video thumbnail
1962                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1963                 if mobj is None:
1964                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1965                         return
1966                 video_thumbnail = mobj.group(1).decode('utf-8')
1967
1968                 # Extract video description
1969                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1970                 if mobj is None:
1971                         self._downloader.trouble(u'ERROR: unable to extract video description')
1972                         return
1973                 video_description = mobj.group(1).decode('utf-8')
1974                 if not video_description:
1975                         video_description = 'No description available.'
1976
1977                 # Extract video height and width
1978                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1979                 if mobj is None:
1980                         self._downloader.trouble(u'ERROR: unable to extract video height')
1981                         return
1982                 yv_video_height = mobj.group(1)
1983
1984                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1985                 if mobj is None:
1986                         self._downloader.trouble(u'ERROR: unable to extract video width')
1987                         return
1988                 yv_video_width = mobj.group(1)
1989
1990                 # Retrieve video playlist to extract media URL
1991                 # I'm not completely sure what all these options are, but we
1992                 # seem to need most of them, otherwise the server sends a 401.
1993                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1994                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1995                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1996                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1997                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1998                 try:
1999                         self.report_download_webpage(video_id)
2000                         webpage = urllib2.urlopen(request).read()
2001                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2002                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2003                         return
2004
2005                 # Extract media URL from playlist XML
2006                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
2007                 if mobj is None:
2008                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
2009                         return
2010                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
2011                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
2012
2013                 try:
2014                         # Process video information
2015                         self._downloader.process_info({
2016                                 'id':           video_id.decode('utf-8'),
2017                                 'url':          video_url,
2018                                 'uploader':     video_uploader,
2019                                 'upload_date':  u'NA',
2020                                 'title':        video_title,
2021                                 'stitle':       simple_title,
2022                                 'ext':          video_extension.decode('utf-8'),
2023                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2024                                 'description':  video_description,
2025                                 'thumbnail':    video_thumbnail,
2026                                 'player_url':   None,
2027                         })
2028                 except UnavailableVideoError:
2029                         self._downloader.trouble(u'\nERROR: unable to download video')
2030
2031
2032 class VimeoIE(InfoExtractor):
2033         """Information extractor for vimeo.com."""
2034
2035         # _VALID_URL matches Vimeo URLs
2036         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2037         IE_NAME = u'vimeo'
2038
2039         def __init__(self, downloader=None):
2040                 InfoExtractor.__init__(self, downloader)
2041
2042         def report_download_webpage(self, video_id):
2043                 """Report webpage download."""
2044                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2045
2046         def report_extraction(self, video_id):
2047                 """Report information extraction."""
2048                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
2049
2050         def _real_extract(self, url, new_video=True):
2051                 # Extract ID from URL
2052                 mobj = re.match(self._VALID_URL, url)
2053                 if mobj is None:
2054                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2055                         return
2056
2057                 # At this point we have a new video
2058                 self._downloader.increment_downloads()
2059                 video_id = mobj.group(1)
2060
2061                 # Retrieve video webpage to extract further information
2062                 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
2063                 try:
2064                         self.report_download_webpage(video_id)
2065                         webpage = urllib2.urlopen(request).read()
2066                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2067                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2068                         return
2069
2070                 # Now we begin extracting as much information as we can from what we
2071                 # retrieved. First we extract the information common to all extractors,
2072                 # and latter we extract those that are Vimeo specific.
2073                 self.report_extraction(video_id)
2074
2075                 # Extract title
2076                 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2077                 if mobj is None:
2078                         self._downloader.trouble(u'ERROR: unable to extract video title')
2079                         return
2080                 video_title = mobj.group(1).decode('utf-8')
2081                 simple_title = _simplify_title(video_title)
2082
2083                 # Extract uploader
2084                 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2085                 if mobj is None:
2086                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
2087                         return
2088                 video_uploader = mobj.group(1).decode('utf-8')
2089
2090                 # Extract video thumbnail
2091                 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2092                 if mobj is None:
2093                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2094                         return
2095                 video_thumbnail = mobj.group(1).decode('utf-8')
2096
2097                 # # Extract video description
2098                 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2099                 # if mobj is None:
2100                 #       self._downloader.trouble(u'ERROR: unable to extract video description')
2101                 #       return
2102                 # video_description = mobj.group(1).decode('utf-8')
2103                 # if not video_description: video_description = 'No description available.'
2104                 video_description = 'Foo.'
2105
2106                 # Vimeo specific: extract request signature
2107                 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2108                 if mobj is None:
2109                         self._downloader.trouble(u'ERROR: unable to extract request signature')
2110                         return
2111                 sig = mobj.group(1).decode('utf-8')
2112
2113                 # Vimeo specific: extract video quality information
2114                 mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)
2115                 if mobj is None:
2116                         self._downloader.trouble(u'ERROR: unable to extract video quality information')
2117                         return
2118                 quality = mobj.group(1).decode('utf-8')
2119
2120                 if int(quality) == 1:
2121                         quality = 'hd'
2122                 else:
2123                         quality = 'sd'
2124
2125                 # Vimeo specific: Extract request signature expiration
2126                 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2127                 if mobj is None:
2128                         self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2129                         return
2130                 sig_exp = mobj.group(1).decode('utf-8')
2131
2132                 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)
2133
2134                 try:
2135                         # Process video information
2136                         self._downloader.process_info({
2137                                 'id':           video_id.decode('utf-8'),
2138                                 'url':          video_url,
2139                                 'uploader':     video_uploader,
2140                                 'upload_date':  u'NA',
2141                                 'title':        video_title,
2142                                 'stitle':       simple_title,
2143                                 'ext':          u'mp4',
2144                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2145                                 'description':  video_description,
2146                                 'thumbnail':    video_thumbnail,
2147                                 'description':  video_description,
2148                                 'player_url':   None,
2149                         })
2150                 except UnavailableVideoError:
2151                         self._downloader.trouble(u'ERROR: unable to download video')
2152
2153
2154 class GenericIE(InfoExtractor):
2155         """Generic last-resort information extractor."""
2156
2157         _VALID_URL = r'.*'
2158         IE_NAME = u'generic'
2159
2160         def __init__(self, downloader=None):
2161                 InfoExtractor.__init__(self, downloader)
2162
2163         def report_download_webpage(self, video_id):
2164                 """Report webpage download."""
2165                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2166                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2167
2168         def report_extraction(self, video_id):
2169                 """Report information extraction."""
2170                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2171
2172         def _real_extract(self, url):
2173                 # At this point we have a new video
2174                 self._downloader.increment_downloads()
2175
2176                 video_id = url.split('/')[-1]
2177                 request = urllib2.Request(url)
2178                 try:
2179                         self.report_download_webpage(video_id)
2180                         webpage = urllib2.urlopen(request).read()
2181                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2182                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2183                         return
2184                 except ValueError, err:
2185                         # since this is the last-resort InfoExtractor, if
2186                         # this error is thrown, it'll be thrown here
2187                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2188                         return
2189
2190                 self.report_extraction(video_id)
2191                 # Start with something easy: JW Player in SWFObject
2192                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2193                 if mobj is None:
2194                         # Broaden the search a little bit
2195                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2196                 if mobj is None:
2197                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2198                         return
2199
2200                 # It's possible that one of the regexes
2201                 # matched, but returned an empty group:
2202                 if mobj.group(1) is None:
2203                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2204                         return
2205
2206                 video_url = urllib.unquote(mobj.group(1))
2207                 video_id = os.path.basename(video_url)
2208
2209                 # here's a fun little line of code for you:
2210                 video_extension = os.path.splitext(video_id)[1][1:]
2211                 video_id = os.path.splitext(video_id)[0]
2212
2213                 # it's tempting to parse this further, but you would
2214                 # have to take into account all the variations like
2215                 #   Video Title - Site Name
2216                 #   Site Name | Video Title
2217                 #   Video Title - Tagline | Site Name
2218                 # and so on and so forth; it's just not practical
2219                 mobj = re.search(r'<title>(.*)</title>', webpage)
2220                 if mobj is None:
2221                         self._downloader.trouble(u'ERROR: unable to extract title')
2222                         return
2223                 video_title = mobj.group(1).decode('utf-8')
2224                 video_title = sanitize_title(video_title)
2225                 simple_title = _simplify_title(video_title)
2226
2227                 # video uploader is domain name
2228                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2229                 if mobj is None:
2230                         self._downloader.trouble(u'ERROR: unable to extract title')
2231                         return
2232                 video_uploader = mobj.group(1).decode('utf-8')
2233
2234                 try:
2235                         # Process video information
2236                         self._downloader.process_info({
2237                                 'id':           video_id.decode('utf-8'),
2238                                 'url':          video_url.decode('utf-8'),
2239                                 'uploader':     video_uploader,
2240                                 'upload_date':  u'NA',
2241                                 'title':        video_title,
2242                                 'stitle':       simple_title,
2243                                 'ext':          video_extension.decode('utf-8'),
2244                                 'format':       u'NA',
2245                                 'player_url':   None,
2246                         })
2247                 except UnavailableVideoError, err:
2248                         self._downloader.trouble(u'\nERROR: unable to download video')
2249
2250
2251 class YoutubeSearchIE(InfoExtractor):
2252         """Information Extractor for YouTube search queries."""
2253         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2254         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2255         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2256         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2257         _youtube_ie = None
2258         _max_youtube_results = 1000
2259         IE_NAME = u'youtube:search'
2260
2261         def __init__(self, youtube_ie, downloader=None):
2262                 InfoExtractor.__init__(self, downloader)
2263                 self._youtube_ie = youtube_ie
2264
2265         def report_download_page(self, query, pagenum):
2266                 """Report attempt to download playlist page with given number."""
2267                 query = query.decode(preferredencoding())
2268                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2269
2270         def _real_initialize(self):
2271                 self._youtube_ie.initialize()
2272
2273         def _real_extract(self, query):
2274                 mobj = re.match(self._VALID_URL, query)
2275                 if mobj is None:
2276                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2277                         return
2278
2279                 prefix, query = query.split(':')
2280                 prefix = prefix[8:]
2281                 query = query.encode('utf-8')
2282                 if prefix == '':
2283                         self._download_n_results(query, 1)
2284                         return
2285                 elif prefix == 'all':
2286                         self._download_n_results(query, self._max_youtube_results)
2287                         return
2288                 else:
2289                         try:
2290                                 n = long(prefix)
2291                                 if n <= 0:
2292                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2293                                         return
2294                                 elif n > self._max_youtube_results:
2295                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2296                                         n = self._max_youtube_results
2297                                 self._download_n_results(query, n)
2298                                 return
2299                         except ValueError: # parsing prefix as integer fails
2300                                 self._download_n_results(query, 1)
2301                                 return
2302
2303         def _download_n_results(self, query, n):
2304                 """Downloads a specified number of results for a query"""
2305
2306                 video_ids = []
2307                 already_seen = set()
2308                 pagenum = 1
2309
2310                 while True:
2311                         self.report_download_page(query, pagenum)
2312                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2313                         request = urllib2.Request(result_url)
2314                         try:
2315                                 page = urllib2.urlopen(request).read()
2316                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2317                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2318                                 return
2319
2320                         # Extract video identifiers
2321                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2322                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2323                                 if video_id not in already_seen:
2324                                         video_ids.append(video_id)
2325                                         already_seen.add(video_id)
2326                                         if len(video_ids) == n:
2327                                                 # Specified n videos reached
2328                                                 for id in video_ids:
2329                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2330                                                 return
2331
2332                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2333                                 for id in video_ids:
2334                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2335                                 return
2336
2337                         pagenum = pagenum + 1
2338
2339
2340 class GoogleSearchIE(InfoExtractor):
2341         """Information Extractor for Google Video search queries."""
2342         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2343         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2344         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2345         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2346         _google_ie = None
2347         _max_google_results = 1000
2348         IE_NAME = u'video.google:search'
2349
2350         def __init__(self, google_ie, downloader=None):
2351                 InfoExtractor.__init__(self, downloader)
2352                 self._google_ie = google_ie
2353
2354         def report_download_page(self, query, pagenum):
2355                 """Report attempt to download playlist page with given number."""
2356                 query = query.decode(preferredencoding())
2357                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2358
2359         def _real_initialize(self):
2360                 self._google_ie.initialize()
2361
2362         def _real_extract(self, query):
2363                 mobj = re.match(self._VALID_URL, query)
2364                 if mobj is None:
2365                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2366                         return
2367
2368                 prefix, query = query.split(':')
2369                 prefix = prefix[8:]
2370                 query = query.encode('utf-8')
2371                 if prefix == '':
2372                         self._download_n_results(query, 1)
2373                         return
2374                 elif prefix == 'all':
2375                         self._download_n_results(query, self._max_google_results)
2376                         return
2377                 else:
2378                         try:
2379                                 n = long(prefix)
2380                                 if n <= 0:
2381                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2382                                         return
2383                                 elif n > self._max_google_results:
2384                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2385                                         n = self._max_google_results
2386                                 self._download_n_results(query, n)
2387                                 return
2388                         except ValueError: # parsing prefix as integer fails
2389                                 self._download_n_results(query, 1)
2390                                 return
2391
	def _download_n_results(self, query, n):
		"""Downloads a specified number of results for a query.

		Fetches successive result pages, collecting unique video ids until
		either n ids have been gathered or no further pages exist, then
		hands every collected id to the wrapped GoogleIE.
		"""

		video_ids = []
		already_seen = set()  # fast membership test; video_ids keeps order
		pagenum = 1

		while True:
			self.report_download_page(query, pagenum)
			result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
			request = urllib2.Request(result_url)
			try:
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
				return

			# Extract video identifiers
			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				video_id = mobj.group(1)
				if video_id not in already_seen:
					video_ids.append(video_id)
					already_seen.add(video_id)
					if len(video_ids) == n:
						# Specified n videos reached: extract them all and stop
						for id in video_ids:
							self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
						return

			# No "next page" link: extract whatever was collected so far
			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
				for id in video_ids:
					self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
				return

			pagenum = pagenum + 1
2427
2428
2429 class YahooSearchIE(InfoExtractor):
2430         """Information Extractor for Yahoo! Video search queries."""
2431         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2432         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2433         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2434         _MORE_PAGES_INDICATOR = r'\s*Next'
2435         _yahoo_ie = None
2436         _max_yahoo_results = 1000
2437         IE_NAME = u'video.yahoo:search'
2438
2439         def __init__(self, yahoo_ie, downloader=None):
2440                 InfoExtractor.__init__(self, downloader)
2441                 self._yahoo_ie = yahoo_ie
2442
2443         def report_download_page(self, query, pagenum):
2444                 """Report attempt to download playlist page with given number."""
2445                 query = query.decode(preferredencoding())
2446                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2447
2448         def _real_initialize(self):
2449                 self._yahoo_ie.initialize()
2450
2451         def _real_extract(self, query):
2452                 mobj = re.match(self._VALID_URL, query)
2453                 if mobj is None:
2454                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2455                         return
2456
2457                 prefix, query = query.split(':')
2458                 prefix = prefix[8:]
2459                 query = query.encode('utf-8')
2460                 if prefix == '':
2461                         self._download_n_results(query, 1)
2462                         return
2463                 elif prefix == 'all':
2464                         self._download_n_results(query, self._max_yahoo_results)
2465                         return
2466                 else:
2467                         try:
2468                                 n = long(prefix)
2469                                 if n <= 0:
2470                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2471                                         return
2472                                 elif n > self._max_yahoo_results:
2473                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2474                                         n = self._max_yahoo_results
2475                                 self._download_n_results(query, n)
2476                                 return
2477                         except ValueError: # parsing prefix as integer fails
2478                                 self._download_n_results(query, 1)
2479                                 return
2480
2481         def _download_n_results(self, query, n):
2482                 """Downloads a specified number of results for a query"""
2483
2484                 video_ids = []
2485                 already_seen = set()
2486                 pagenum = 1
2487
2488                 while True:
2489                         self.report_download_page(query, pagenum)
2490                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2491                         request = urllib2.Request(result_url)
2492                         try:
2493                                 page = urllib2.urlopen(request).read()
2494                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2495                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2496                                 return
2497
2498                         # Extract video identifiers
2499                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2500                                 video_id = mobj.group(1)
2501                                 if video_id not in already_seen:
2502                                         video_ids.append(video_id)
2503                                         already_seen.add(video_id)
2504                                         if len(video_ids) == n:
2505                                                 # Specified n videos reached
2506                                                 for id in video_ids:
2507                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2508                                                 return
2509
2510                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2511                                 for id in video_ids:
2512                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2513                                 return
2514
2515                         pagenum = pagenum + 1
2516
2517
2518 class YoutubePlaylistIE(InfoExtractor):
2519         """Information Extractor for YouTube playlists."""
2520
2521         _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2522         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2523         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2524         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2525         _youtube_ie = None
2526         IE_NAME = u'youtube:playlist'
2527
2528         def __init__(self, youtube_ie, downloader=None):
2529                 InfoExtractor.__init__(self, downloader)
2530                 self._youtube_ie = youtube_ie
2531
2532         def report_download_page(self, playlist_id, pagenum):
2533                 """Report attempt to download playlist page with given number."""
2534                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2535
2536         def _real_initialize(self):
2537                 self._youtube_ie.initialize()
2538
2539         def _real_extract(self, url):
2540                 # Extract playlist id
2541                 mobj = re.match(self._VALID_URL, url)
2542                 if mobj is None:
2543                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2544                         return
2545
2546                 # Single video case
2547                 if mobj.group(3) is not None:
2548                         self._youtube_ie.extract(mobj.group(3))
2549                         return
2550
2551                 # Download playlist pages
2552                 # prefix is 'p' as default for playlists but there are other types that need extra care
2553                 playlist_prefix = mobj.group(1)
2554                 if playlist_prefix == 'a':
2555                         playlist_access = 'artist'
2556                 else:
2557                         playlist_prefix = 'p'
2558                         playlist_access = 'view_play_list'
2559                 playlist_id = mobj.group(2)
2560                 video_ids = []
2561                 pagenum = 1
2562
2563                 while True:
2564                         self.report_download_page(playlist_id, pagenum)
2565                         url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2566                         request = urllib2.Request(url)
2567                         try:
2568                                 page = urllib2.urlopen(request).read()
2569                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2570                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2571                                 return
2572
2573                         # Extract video identifiers
2574                         ids_in_page = []
2575                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2576                                 if mobj.group(1) not in ids_in_page:
2577                                         ids_in_page.append(mobj.group(1))
2578                         video_ids.extend(ids_in_page)
2579
2580                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2581                                 break
2582                         pagenum = pagenum + 1
2583
2584                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2585                 playlistend = self._downloader.params.get('playlistend', -1)
2586                 video_ids = video_ids[playliststart:playlistend]
2587
2588                 for id in video_ids:
2589                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2590                 return
2591
2592
2593 class YoutubeUserIE(InfoExtractor):
2594         """Information Extractor for YouTube users."""
2595
2596         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2597         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2598         _GDATA_PAGE_SIZE = 50
2599         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2600         _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2601         _youtube_ie = None
2602         IE_NAME = u'youtube:user'
2603
2604         def __init__(self, youtube_ie, downloader=None):
2605                 InfoExtractor.__init__(self, downloader)
2606                 self._youtube_ie = youtube_ie
2607
2608         def report_download_page(self, username, start_index):
2609                 """Report attempt to download user page."""
2610                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2611                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2612
2613         def _real_initialize(self):
2614                 self._youtube_ie.initialize()
2615
2616         def _real_extract(self, url):
2617                 # Extract username
2618                 mobj = re.match(self._VALID_URL, url)
2619                 if mobj is None:
2620                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2621                         return
2622
2623                 username = mobj.group(1)
2624
2625                 # Download video ids using YouTube Data API. Result size per
2626                 # query is limited (currently to 50 videos) so we need to query
2627                 # page by page until there are no video ids - it means we got
2628                 # all of them.
2629
2630                 video_ids = []
2631                 pagenum = 0
2632
2633                 while True:
2634                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2635                         self.report_download_page(username, start_index)
2636
2637                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2638
2639                         try:
2640                                 page = urllib2.urlopen(request).read()
2641                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2642                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2643                                 return
2644
2645                         # Extract video identifiers
2646                         ids_in_page = []
2647
2648                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2649                                 if mobj.group(1) not in ids_in_page:
2650                                         ids_in_page.append(mobj.group(1))
2651
2652                         video_ids.extend(ids_in_page)
2653
2654                         # A little optimization - if current page is not
2655                         # "full", ie. does not contain PAGE_SIZE video ids then
2656                         # we can assume that this page is the last one - there
2657                         # are no more ids on further pages - no need to query
2658                         # again.
2659
2660                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2661                                 break
2662
2663                         pagenum += 1
2664
2665                 all_ids_count = len(video_ids)
2666                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2667                 playlistend = self._downloader.params.get('playlistend', -1)
2668
2669                 if playlistend == -1:
2670                         video_ids = video_ids[playliststart:]
2671                 else:
2672                         video_ids = video_ids[playliststart:playlistend]
2673
2674                 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2675                                 (username, all_ids_count, len(video_ids)))
2676
2677                 for video_id in video_ids:
2678                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2679
2680
2681 class DepositFilesIE(InfoExtractor):
2682         """Information extractor for depositfiles.com"""
2683
2684         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2685         IE_NAME = u'DepositFiles'
2686
2687         def __init__(self, downloader=None):
2688                 InfoExtractor.__init__(self, downloader)
2689
2690         def report_download_webpage(self, file_id):
2691                 """Report webpage download."""
2692                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2693
2694         def report_extraction(self, file_id):
2695                 """Report information extraction."""
2696                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2697
2698         def _real_extract(self, url):
2699                 # At this point we have a new file
2700                 self._downloader.increment_downloads()
2701
2702                 file_id = url.split('/')[-1]
2703                 # Rebuild url in english locale
2704                 url = 'http://depositfiles.com/en/files/' + file_id
2705
2706                 # Retrieve file webpage with 'Free download' button pressed
2707                 free_download_indication = { 'gateway_result' : '1' }
2708                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2709                 try:
2710                         self.report_download_webpage(file_id)
2711                         webpage = urllib2.urlopen(request).read()
2712                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2713                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2714                         return
2715
2716                 # Search for the real file URL
2717                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2718                 if (mobj is None) or (mobj.group(1) is None):
2719                         # Try to figure out reason of the error.
2720                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2721                         if (mobj is not None) and (mobj.group(1) is not None):
2722                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2723                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2724                         else:
2725                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2726                         return
2727
2728                 file_url = mobj.group(1)
2729                 file_extension = os.path.splitext(file_url)[1][1:]
2730
2731                 # Search for file title
2732                 mobj = re.search(r'<b title="(.*?)">', webpage)
2733                 if mobj is None:
2734                         self._downloader.trouble(u'ERROR: unable to extract title')
2735                         return
2736                 file_title = mobj.group(1).decode('utf-8')
2737
2738                 try:
2739                         # Process file information
2740                         self._downloader.process_info({
2741                                 'id':           file_id.decode('utf-8'),
2742                                 'url':          file_url.decode('utf-8'),
2743                                 'uploader':     u'NA',
2744                                 'upload_date':  u'NA',
2745                                 'title':        file_title,
2746                                 'stitle':       file_title,
2747                                 'ext':          file_extension.decode('utf-8'),
2748                                 'format':       u'NA',
2749                                 'player_url':   None,
2750                         })
2751                 except UnavailableVideoError, err:
2752                         self._downloader.trouble(u'ERROR: unable to download file')
2753
2754
2755 class FacebookIE(InfoExtractor):
2756         """Information Extractor for Facebook"""
2757
2758         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2759         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2760         _NETRC_MACHINE = 'facebook'
2761         _available_formats = ['video', 'highqual', 'lowqual']
2762         _video_extensions = {
2763                 'video': 'mp4',
2764                 'highqual': 'mp4',
2765                 'lowqual': 'mp4',
2766         }
2767         IE_NAME = u'facebook'
2768
2769         def __init__(self, downloader=None):
2770                 InfoExtractor.__init__(self, downloader)
2771
2772         def _reporter(self, message):
2773                 """Add header and report message."""
2774                 self._downloader.to_screen(u'[facebook] %s' % message)
2775
2776         def report_login(self):
2777                 """Report attempt to log in."""
2778                 self._reporter(u'Logging in')
2779
2780         def report_video_webpage_download(self, video_id):
2781                 """Report attempt to download video webpage."""
2782                 self._reporter(u'%s: Downloading video webpage' % video_id)
2783
2784         def report_information_extraction(self, video_id):
2785                 """Report attempt to extract video information."""
2786                 self._reporter(u'%s: Extracting video information' % video_id)
2787
2788         def _parse_page(self, video_webpage):
2789                 """Extract video information from page"""
2790                 # General data
2791                 data = {'title': r'\("video_title", "(.*?)"\)',
2792                         'description': r'<div class="datawrap">(.*?)</div>',
2793                         'owner': r'\("video_owner_name", "(.*?)"\)',
2794                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2795                         }
2796                 video_info = {}
2797                 for piece in data.keys():
2798                         mobj = re.search(data[piece], video_webpage)
2799                         if mobj is not None:
2800                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2801
2802                 # Video urls
2803                 video_urls = {}
2804                 for fmt in self._available_formats:
2805                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2806                         if mobj is not None:
2807                                 # URL is in a Javascript segment inside an escaped Unicode format within
2808                                 # the generally utf-8 page
2809                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2810                 video_info['video_urls'] = video_urls
2811
2812                 return video_info
2813
2814         def _real_initialize(self):
2815                 if self._downloader is None:
2816                         return
2817
2818                 useremail = None
2819                 password = None
2820                 downloader_params = self._downloader.params
2821
2822                 # Attempt to use provided username and password or .netrc data
2823                 if downloader_params.get('username', None) is not None:
2824                         useremail = downloader_params['username']
2825                         password = downloader_params['password']
2826                 elif downloader_params.get('usenetrc', False):
2827                         try:
2828                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2829                                 if info is not None:
2830                                         useremail = info[0]
2831                                         password = info[2]
2832                                 else:
2833                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2834                         except (IOError, netrc.NetrcParseError), err:
2835                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2836                                 return
2837
2838                 if useremail is None:
2839                         return
2840
2841                 # Log in
2842                 login_form = {
2843                         'email': useremail,
2844                         'pass': password,
2845                         'login': 'Log+In'
2846                         }
2847                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2848                 try:
2849                         self.report_login()
2850                         login_results = urllib2.urlopen(request).read()
2851                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2852                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2853                                 return
2854                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2855                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2856                         return
2857
2858         def _real_extract(self, url):
2859                 mobj = re.match(self._VALID_URL, url)
2860                 if mobj is None:
2861                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2862                         return
2863                 video_id = mobj.group('ID')
2864
2865                 # Get video webpage
2866                 self.report_video_webpage_download(video_id)
2867                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2868                 try:
2869                         page = urllib2.urlopen(request)
2870                         video_webpage = page.read()
2871                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2872                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2873                         return
2874
2875                 # Start extracting information
2876                 self.report_information_extraction(video_id)
2877
2878                 # Extract information
2879                 video_info = self._parse_page(video_webpage)
2880
2881                 # uploader
2882                 if 'owner' not in video_info:
2883                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2884                         return
2885                 video_uploader = video_info['owner']
2886
2887                 # title
2888                 if 'title' not in video_info:
2889                         self._downloader.trouble(u'ERROR: unable to extract video title')
2890                         return
2891                 video_title = video_info['title']
2892                 video_title = video_title.decode('utf-8')
2893                 video_title = sanitize_title(video_title)
2894
2895                 simple_title = _simplify_title(video_title)
2896
2897                 # thumbnail image
2898                 if 'thumbnail' not in video_info:
2899                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2900                         video_thumbnail = ''
2901                 else:
2902                         video_thumbnail = video_info['thumbnail']
2903
2904                 # upload date
2905                 upload_date = u'NA'
2906                 if 'upload_date' in video_info:
2907                         upload_time = video_info['upload_date']
2908                         timetuple = email.utils.parsedate_tz(upload_time)
2909                         if timetuple is not None:
2910                                 try:
2911                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2912                                 except:
2913                                         pass
2914
2915                 # description
2916                 video_description = video_info.get('description', 'No description available.')
2917
2918                 url_map = video_info['video_urls']
2919                 if len(url_map.keys()) > 0:
2920                         # Decide which formats to download
2921                         req_format = self._downloader.params.get('format', None)
2922                         format_limit = self._downloader.params.get('format_limit', None)
2923
2924                         if format_limit is not None and format_limit in self._available_formats:
2925                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2926                         else:
2927                                 format_list = self._available_formats
2928                         existing_formats = [x for x in format_list if x in url_map]
2929                         if len(existing_formats) == 0:
2930                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2931                                 return
2932                         if req_format is None:
2933                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2934                         elif req_format == 'worst':
2935                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2936                         elif req_format == '-1':
2937                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2938                         else:
2939                                 # Specific format
2940                                 if req_format not in url_map:
2941                                         self._downloader.trouble(u'ERROR: requested format not available')
2942                                         return
2943                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2944
2945                 for format_param, video_real_url in video_url_list:
2946
2947                         # At this point we have a new video
2948                         self._downloader.increment_downloads()
2949
2950                         # Extension
2951                         video_extension = self._video_extensions.get(format_param, 'mp4')
2952
2953                         try:
2954                                 # Process video information
2955                                 self._downloader.process_info({
2956                                         'id':           video_id.decode('utf-8'),
2957                                         'url':          video_real_url.decode('utf-8'),
2958                                         'uploader':     video_uploader.decode('utf-8'),
2959                                         'upload_date':  upload_date,
2960                                         'title':        video_title,
2961                                         'stitle':       simple_title,
2962                                         'ext':          video_extension.decode('utf-8'),
2963                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2964                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2965                                         'description':  video_description.decode('utf-8'),
2966                                         'player_url':   None,
2967                                 })
2968                         except UnavailableVideoError, err:
2969                                 self._downloader.trouble(u'\nERROR: unable to download video')
2970
class BlipTVIE(InfoExtractor):
	"""Information extractor for blip.tv"""

	# Any blip.tv URL; group 1 captures the path after the host.
	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
	# Pulls the filename extension off the end of a media URL.
	_URL_EXT = r'^.*\.([a-z0-9]+)$'
	IE_NAME = u'blip.tv'

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

	def report_direct_download(self, title):
		"""Report that the URL serves the media file directly (no JSON step)."""
		self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

	def _real_extract(self, url):
		"""Extract video information from a blip.tv URL.

		Appends skin=json&version=2&no_wrap=1 to the page URL to request
		JSON metadata. If the server instead answers with the media file
		itself (Content-Type video/*), treats it as a direct download and
		hands the open urlhandle to the downloader.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Append the JSON query, respecting any existing query string.
		if '?' in url:
			cchar = '&'
		else:
			cchar = '?'
		json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
		request = urllib2.Request(json_url)
		self.report_extraction(mobj.group(1))
		info = None
		try:
			urlh = urllib2.urlopen(request)
			if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
				basename = url.split('/')[-1]
				title,ext = os.path.splitext(basename)
				title = title.decode('UTF-8')
				ext = ext.replace('.', '')
				self.report_direct_download(title)
				# 'urlhandle' lets the downloader reuse this already-open
				# connection instead of re-requesting the media.
				info = {
					'id': title,
					'url': url,
					'title': title,
					'stitle': _simplify_title(title),
					'ext': ext,
					'urlhandle': urlh
				}
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
			return
		if info is None: # Regular URL
			# NOTE: reuses urlh bound in the try above; this branch is only
			# reached when urlopen() succeeded, so urlh is always bound here.
			try:
				json_code = urlh.read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
				return

			try:
				# The payload may be wrapped in a top-level 'Post' object.
				json_data = json.loads(json_code)
				if 'Post' in json_data:
					data = json_data['Post']
				else:
					data = json_data

				# Convert e.g. '08-15-11 09:30PM' into 'YYYYMMDD' form.
				upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
				video_url = data['media']['url']
				umobj = re.match(self._URL_EXT, video_url)
				if umobj is None:
					raise ValueError('Can not determine filename extension')
				ext = umobj.group(1)

				info = {
					'id': data['item_id'],
					'url': video_url,
					'uploader': data['display_name'],
					'upload_date': upload_date,
					'title': data['title'],
					'stitle': _simplify_title(data['title']),
					'ext': ext,
					'format': data['media']['mimeType'],
					'thumbnail': data['thumbnailUrl'],
					'description': data['description'],
					'player_url': data['embedUrl']
				}
			except (ValueError,KeyError), err:
				self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
				return

		self._downloader.increment_downloads()

		try:
			self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download video')
3063
3064
3065 class MyVideoIE(InfoExtractor):
3066         """Information Extractor for myvideo.de."""
3067
3068         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3069         IE_NAME = u'myvideo'
3070
3071         def __init__(self, downloader=None):
3072                 InfoExtractor.__init__(self, downloader)
3073         
3074         def report_download_webpage(self, video_id):
3075                 """Report webpage download."""
3076                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3077
3078         def report_extraction(self, video_id):
3079                 """Report information extraction."""
3080                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3081
3082         def _real_extract(self,url):
3083                 mobj = re.match(self._VALID_URL, url)
3084                 if mobj is None:
3085                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
3086                         return
3087
3088                 video_id = mobj.group(1)
3089
3090                 # Get video webpage
3091                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3092                 try:
3093                         self.report_download_webpage(video_id)
3094                         webpage = urllib2.urlopen(request).read()
3095                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3096                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3097                         return
3098
3099                 self.report_extraction(video_id)
3100                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3101                                  webpage)
3102                 if mobj is None:
3103                         self._downloader.trouble(u'ERROR: unable to extract media URL')
3104                         return
3105                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3106
3107                 mobj = re.search('<title>([^<]+)</title>', webpage)
3108                 if mobj is None:
3109                         self._downloader.trouble(u'ERROR: unable to extract title')
3110                         return
3111
3112                 video_title = mobj.group(1)
3113                 video_title = sanitize_title(video_title)
3114
3115                 simple_title = _simplify_title(video_title)
3116
3117                 try:
3118                         self._downloader.process_info({
3119                                 'id':           video_id,
3120                                 'url':          video_url,
3121                                 'uploader':     u'NA',
3122                                 'upload_date':  u'NA',
3123                                 'title':        video_title,
3124                                 'stitle':       simple_title,
3125                                 'ext':          u'flv',
3126                                 'format':       u'NA',
3127                                 'player_url':   None,
3128                         })
3129                 except UnavailableVideoError:
3130                         self._downloader.trouble(u'\nERROR: Unable to download video')
3131
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Accepts either a short alias prefixed with ':' (e.g. ':tds') or a
	# full-episodes URL on thedailyshow.com / colbertnation.com.
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	def report_extraction(self, episode_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
	
	def report_config_download(self, episode_id):
		"""Report the download of the per-media player configuration."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		"""Report the download of the episode's MRSS index feed."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		"""Report resolving the Flash player URL via redirects."""
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _real_extract(self, url):
		"""Extract all media acts of an episode.

		Flow: resolve short aliases to a full-episodes URL, follow the
		redirect to a specific episode when needed, find the Flash URL in
		the page, download the MRSS index, then fetch the per-item config
		XML and download the highest-bitrate rendition of each item.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Translate a short alias into the show's full-episodes URL and
		# re-match so the named groups below are populated.
		if mobj.group('shortname'):
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = u'http://www.thedailyshow.com/full-episodes/'
			else:
				url = u'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# No episode in the URL means "download the newest episode" — the
		# site redirects the bare full-episodes URL to it.
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			return
		if dlNewest:
			# Recover the episode slug from the redirected URL.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		# Each match yields (full player URL, mgid-style URI after the host).
		mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
			return

		# Follow redirects once so process_info gets the final player URL.
		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
			return

		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
			return

		# One <item> per act of the episode; download each in turn.
		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			# guid looks like 'mgid:...:<show>.com:<shortMediaId>'.
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
						urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
				return

			# Collect (bitrate, url) pairs for every available rendition.
			cdoc = xml.etree.ElementTree.fromstring(configXml)
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			# For now, just pick the highest bitrate
			format,video_url = turls[-1]

			self._downloader.increment_downloads()

			effTitle = showId + u'-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'stitle': _simplify_title(effTitle),
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': playerUrl
			}

			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
				continue
3266
3267
3268 class EscapistIE(InfoExtractor):
3269         """Information extractor for The Escapist """
3270
3271         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3272         IE_NAME = u'escapist'
3273
3274         def report_extraction(self, showName):
3275                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3276
3277         def report_config_download(self, showName):
3278                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3279
3280         def _real_extract(self, url):
3281                 htmlParser = HTMLParser.HTMLParser()
3282
3283                 mobj = re.match(self._VALID_URL, url)
3284                 if mobj is None:
3285                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3286                         return
3287                 showName = mobj.group('showname')
3288                 videoId = mobj.group('episode')
3289
3290                 self.report_extraction(showName)
3291                 try:
3292                         webPage = urllib2.urlopen(url).read()
3293                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3294                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3295                         return
3296
3297                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3298                 description = htmlParser.unescape(descMatch.group(1))
3299                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3300                 imgUrl = htmlParser.unescape(imgMatch.group(1))
3301                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3302                 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3303                 configUrlMatch = re.search('config=(.*)$', playerUrl)
3304                 configUrl = urllib2.unquote(configUrlMatch.group(1))
3305
3306                 self.report_config_download(showName)
3307                 try:
3308                         configJSON = urllib2.urlopen(configUrl).read()
3309                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3310                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3311                         return
3312
3313                 # Technically, it's JavaScript, not JSON
3314                 configJSON = configJSON.replace("'", '"')
3315
3316                 try:
3317                         config = json.loads(configJSON)
3318                 except (ValueError,), err:
3319                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3320                         return
3321
3322                 playlist = config['playlist']
3323                 videoUrl = playlist[1]['url']
3324
3325                 self._downloader.increment_downloads()
3326                 info = {
3327                         'id': videoId,
3328                         'url': videoUrl,
3329                         'uploader': showName,
3330                         'upload_date': None,
3331                         'title': showName,
3332                         'stitle': _simplify_title(showName),
3333                         'ext': 'flv',
3334                         'format': 'flv',
3335                         'thumbnail': imgUrl,
3336                         'description': description,
3337                         'player_url': playerUrl,
3338                 }
3339
3340                 try:
3341                         self._downloader.process_info(info)
3342                 except UnavailableVideoError, err:
3343                         self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3344
3345
3346 class CollegeHumorIE(InfoExtractor):
3347         """Information extractor for collegehumor.com"""
3348
3349         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3350         IE_NAME = u'collegehumor'
3351
3352         def report_webpage(self, video_id):
3353                 """Report information extraction."""
3354                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3355
3356         def report_extraction(self, video_id):
3357                 """Report information extraction."""
3358                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3359
3360         def _real_extract(self, url):
3361                 htmlParser = HTMLParser.HTMLParser()
3362
3363                 mobj = re.match(self._VALID_URL, url)
3364                 if mobj is None:
3365                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3366                         return
3367                 video_id = mobj.group('videoid')
3368
3369                 self.report_webpage(video_id)
3370                 request = urllib2.Request(url)
3371                 try:
3372                         webpage = urllib2.urlopen(request).read()
3373                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3374                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3375                         return
3376
3377                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3378                 if m is None:
3379                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3380                         return
3381                 internal_video_id = m.group('internalvideoid')
3382
3383                 info = {
3384                         'id': video_id,
3385                         'internal_id': internal_video_id,
3386                 }
3387
3388                 self.report_extraction(video_id)
3389                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3390                 try:
3391                         metaXml = urllib2.urlopen(xmlUrl).read()
3392                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3393                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3394                         return
3395
3396                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3397                 try:
3398                         videoNode = mdoc.findall('./video')[0]
3399                         info['description'] = videoNode.findall('./description')[0].text
3400                         info['title'] = videoNode.findall('./caption')[0].text
3401                         info['stitle'] = _simplify_title(info['title'])
3402                         info['url'] = videoNode.findall('./file')[0].text
3403                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3404                         info['ext'] = info['url'].rpartition('.')[2]
3405                         info['format'] = info['ext']
3406                 except IndexError:
3407                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3408                         return
3409
3410                 self._downloader.increment_downloads()
3411
3412                 try:
3413                         self._downloader.process_info(info)
3414                 except UnavailableVideoError, err:
3415                         self._downloader.trouble(u'\nERROR: unable to download video')
3416
3417
3418 class XVideosIE(InfoExtractor):
3419         """Information extractor for xvideos.com"""
3420
3421         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3422         IE_NAME = u'xvideos'
3423
3424         def report_webpage(self, video_id):
3425                 """Report information extraction."""
3426                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3427
3428         def report_extraction(self, video_id):
3429                 """Report information extraction."""
3430                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3431
3432         def _real_extract(self, url):
3433                 htmlParser = HTMLParser.HTMLParser()
3434
3435                 mobj = re.match(self._VALID_URL, url)
3436                 if mobj is None:
3437                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3438                         return
3439                 video_id = mobj.group(1).decode('utf-8')
3440
3441                 self.report_webpage(video_id)
3442
3443                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3444                 try:
3445                         webpage = urllib2.urlopen(request).read()
3446                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3447                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3448                         return
3449
3450                 self.report_extraction(video_id)
3451
3452
3453                 # Extract video URL
3454                 mobj = re.search(r'flv_url=(.+?)&', webpage)
3455                 if mobj is None:
3456                         self._downloader.trouble(u'ERROR: unable to extract video url')
3457                         return
3458                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3459
3460
3461                 # Extract title
3462                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3463                 if mobj is None:
3464                         self._downloader.trouble(u'ERROR: unable to extract video title')
3465                         return
3466                 video_title = mobj.group(1).decode('utf-8')
3467
3468
3469                 # Extract video thumbnail
3470                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3471                 if mobj is None:
3472                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3473                         return
3474                 video_thumbnail = mobj.group(1).decode('utf-8')
3475
3476
3477
3478                 self._downloader.increment_downloads()
3479                 info = {
3480                         'id': video_id,
3481                         'url': video_url,
3482                         'uploader': None,
3483                         'upload_date': None,
3484                         'title': video_title,
3485                         'stitle': _simplify_title(video_title),
3486                         'ext': 'flv',
3487                         'format': 'flv',
3488                         'thumbnail': video_thumbnail,
3489                         'description': None,
3490                         'player_url': None,
3491                 }
3492
3493                 try:
3494                         self._downloader.process_info(info)
3495                 except UnavailableVideoError, err:
3496                         self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3497
3498
3499 class SoundcloudIE(InfoExtractor):
3500         """Information extractor for soundcloud.com
3501            To access the media, the uid of the song and a stream token
3502            must be extracted from the page source and the script must make
3503            a request to media.soundcloud.com/crossdomain.xml. Then
3504            the media can be grabbed by requesting from an url composed
3505            of the stream token and uid
3506          """
3507
3508         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3509         IE_NAME = u'soundcloud'
3510
3511         def __init__(self, downloader=None):
3512                 InfoExtractor.__init__(self, downloader)
3513
3514         def report_webpage(self, video_id):
3515                 """Report information extraction."""
3516                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3517
3518         def report_extraction(self, video_id):
3519                 """Report information extraction."""
3520                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3521
3522         def _real_extract(self, url):
3523                 htmlParser = HTMLParser.HTMLParser()
3524
3525                 mobj = re.match(self._VALID_URL, url)
3526                 if mobj is None:
3527                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3528                         return
3529
3530                 # extract uploader (which is in the url)
3531                 uploader = mobj.group(1).decode('utf-8')
3532                 # extract simple title (uploader + slug of song title)
3533                 slug_title =  mobj.group(2).decode('utf-8')
3534                 simple_title = uploader + '-' + slug_title
3535
3536                 self.report_webpage('%s/%s' % (uploader, slug_title))
3537
3538                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3539                 try:
3540                         webpage = urllib2.urlopen(request).read()
3541                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3542                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3543                         return
3544
3545                 self.report_extraction('%s/%s' % (uploader, slug_title))
3546
3547                 # extract uid and stream token that soundcloud hands out for access
3548                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3549                 if mobj:
3550                         video_id = mobj.group(1)
3551                         stream_token = mobj.group(2)
3552
3553                 # extract unsimplified title
3554                 mobj = re.search('"title":"(.*?)",', webpage)
3555                 if mobj:
3556                         title = mobj.group(1)
3557
3558                 # construct media url (with uid/token)
3559                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3560                 mediaURL = mediaURL % (video_id, stream_token)
3561
3562                 # description
3563                 description = u'No description available'
3564                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3565                 if mobj:
3566                         description = mobj.group(1)
3567                 
3568                 # upload date
3569                 upload_date = None
3570                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3571                 if mobj:
3572                         try:
3573                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3574                         except Exception, e:
3575                                 print str(e)
3576
3577                 # for soundcloud, a request to a cross domain is required for cookies
3578                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3579
3580                 try:
3581                         self._downloader.process_info({
3582                                 'id':           video_id.decode('utf-8'),
3583                                 'url':          mediaURL,
3584                                 'uploader':     uploader.decode('utf-8'),
3585                                 'upload_date':  upload_date,
3586                                 'title':        simple_title.decode('utf-8'),
3587                                 'stitle':       simple_title.decode('utf-8'),
3588                                 'ext':          u'mp3',
3589                                 'format':       u'NA',
3590                                 'player_url':   None,
3591                                 'description': description.decode('utf-8')
3592                         })
3593                 except UnavailableVideoError:
3594                         self._downloader.trouble(u'\nERROR: unable to download video')
3595
3596
3597 class InfoQIE(InfoExtractor):
3598         """Information extractor for infoq.com"""
3599
3600         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3601         IE_NAME = u'infoq'
3602
3603         def report_webpage(self, video_id):
3604                 """Report information extraction."""
3605                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3606
3607         def report_extraction(self, video_id):
3608                 """Report information extraction."""
3609                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3610
3611         def _real_extract(self, url):
3612                 htmlParser = HTMLParser.HTMLParser()
3613
3614                 mobj = re.match(self._VALID_URL, url)
3615                 if mobj is None:
3616                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3617                         return
3618
3619                 self.report_webpage(url)
3620
3621                 request = urllib2.Request(url)
3622                 try:
3623                         webpage = urllib2.urlopen(request).read()
3624                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3625                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3626                         return
3627
3628                 self.report_extraction(url)
3629
3630
3631                 # Extract video URL
3632                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3633                 if mobj is None:
3634                         self._downloader.trouble(u'ERROR: unable to extract video url')
3635                         return
3636                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3637
3638
3639                 # Extract title
3640                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3641                 if mobj is None:
3642                         self._downloader.trouble(u'ERROR: unable to extract video title')
3643                         return
3644                 video_title = mobj.group(1).decode('utf-8')
3645
3646                 # Extract description
3647                 video_description = u'No description available.'
3648                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3649                 if mobj is not None:
3650                         video_description = mobj.group(1).decode('utf-8')
3651
3652                 video_filename = video_url.split('/')[-1]
3653                 video_id, extension = video_filename.split('.')
3654
3655                 self._downloader.increment_downloads()
3656                 info = {
3657                         'id': video_id,
3658                         'url': video_url,
3659                         'uploader': None,
3660                         'upload_date': None,
3661                         'title': video_title,
3662                         'stitle': _simplify_title(video_title),
3663                         'ext': extension,
3664                         'format': extension, # Extension is always(?) mp4, but seems to be flv
3665                         'thumbnail': None,
3666                         'description': video_description,
3667                         'player_url': None,
3668                 }
3669
3670                 try:
3671                         self._downloader.process_info(info)
3672                 except UnavailableVideoError, err:
3673                         self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3674
3675 class MixcloudIE(InfoExtractor):
3676         """Information extractor for www.mixcloud.com"""
3677         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3678         IE_NAME = u'mixcloud'
3679
3680         def __init__(self, downloader=None):
3681                 InfoExtractor.__init__(self, downloader)
3682
3683         def report_download_json(self, file_id):
3684                 """Report JSON download."""
3685                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3686
3687         def report_extraction(self, file_id):
3688                 """Report information extraction."""
3689                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3690
3691         def get_urls(self, jsonData, fmt, bitrate='best'):
3692                 """Get urls from 'audio_formats' section in json"""
3693                 file_url = None
3694                 try:
3695                         bitrate_list = jsonData[fmt]
3696                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3697                                 bitrate = max(bitrate_list) # select highest
3698
3699                         url_list = jsonData[fmt][bitrate]
3700                 except TypeError: # we have no bitrate info.
3701                         url_list = jsonData[fmt]
3702                                 
3703                 return url_list
3704
3705         def check_urls(self, url_list):
3706                 """Returns 1st active url from list"""
3707                 for url in url_list:
3708                         try:
3709                                 urllib2.urlopen(url)
3710                                 return url
3711                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3712                                 url = None
3713
3714                 return None
3715
3716         def _print_formats(self, formats):
3717                 print 'Available formats:'
3718                 for fmt in formats.keys():
3719                         for b in formats[fmt]:
3720                                 try:
3721                                         ext = formats[fmt][b][0]
3722                                         print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3723                                 except TypeError: # we have no bitrate info
3724                                         ext = formats[fmt][0]
3725                                         print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3726                                         break
3727
3728         def _real_extract(self, url):
3729                 mobj = re.match(self._VALID_URL, url)
3730                 if mobj is None:
3731                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3732                         return
3733                 # extract uploader & filename from url
3734                 uploader = mobj.group(1).decode('utf-8')
3735                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3736
3737                 # construct API request
3738                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3739                 # retrieve .json file with links to files
3740                 request = urllib2.Request(file_url)
3741                 try:
3742                         self.report_download_json(file_url)
3743                         jsonData = urllib2.urlopen(request).read()
3744                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3745                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3746                         return
3747
3748                 # parse JSON
3749                 json_data = json.loads(jsonData)
3750                 player_url = json_data['player_swf_url']
3751                 formats = dict(json_data['audio_formats'])
3752
3753                 req_format = self._downloader.params.get('format', None)
3754                 bitrate = None
3755
3756                 if self._downloader.params.get('listformats', None):
3757                         self._print_formats(formats)
3758                         return
3759
3760                 if req_format is None or req_format == 'best':
3761                         for format_param in formats.keys():
3762                                 url_list = self.get_urls(formats, format_param)
3763                                 # check urls
3764                                 file_url = self.check_urls(url_list)
3765                                 if file_url is not None:
3766                                         break # got it!
3767                 else:
3768                         if req_format not in formats.keys():
3769                                 self._downloader.trouble(u'ERROR: format is not available')
3770                                 return
3771
3772                         url_list = self.get_urls(formats, req_format)
3773                         file_url = self.check_urls(url_list)
3774                         format_param = req_format
3775
3776                 # We have audio
3777                 self._downloader.increment_downloads()
3778                 try:
3779                         # Process file information
3780                         self._downloader.process_info({
3781                                 'id': file_id.decode('utf-8'),
3782                                 'url': file_url.decode('utf-8'),
3783                                 'uploader':     uploader.decode('utf-8'),
3784                                 'upload_date': u'NA',
3785                                 'title': json_data['name'],
3786                                 'stitle': _simplify_title(json_data['name']),
3787                                 'ext': file_url.split('.')[-1].decode('utf-8'),
3788                                 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3789                                 'thumbnail': json_data['thumbnail_url'],
3790                                 'description': json_data['description'],
3791                                 'player_url': player_url.decode('utf-8'),
3792                         })
3793                 except UnavailableVideoError, err:
3794                         self._downloader.trouble(u'ERROR: unable to download file')
3795
class StanfordOpenClassroomIE(InfoExtractor):
	"""Information extractor for Stanford's Open ClassRoom"""

	# Matches the site root, a CoursePage, or a VideoPage; the named
	# groups 'course' and 'video' decide which branch _real_extract takes.
	_VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
	IE_NAME = u'stanfordoc'

	def report_download_webpage(self, objid):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		"""Dispatch on URL type: single video, course page, or site root.

		Course and root pages are handled as playlists: each link found
		is fed back through self.extract() as a 'reference' entry.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		if mobj.group('course') and mobj.group('video'): # A specific video
			course = mobj.group('course')
			video = mobj.group('video')
			info = {
				'id': _simplify_title(course + '_' + video),
			}

			self.report_extraction(info['id'])
			# Metadata for each video lives in a per-video XML file
			baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
			xmlUrl = baseUrl + video + '.xml'
			try:
				metaXml = urllib2.urlopen(xmlUrl).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
				return
			mdoc = xml.etree.ElementTree.fromstring(metaXml)
			try:
				info['title'] = mdoc.findall('./title')[0].text
				info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
			except IndexError:
				self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
				return
			info['stitle'] = _simplify_title(info['title'])
			info['ext'] = info['url'].rpartition('.')[2]
			info['format'] = info['ext']
			self._downloader.increment_downloads()
			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
		elif mobj.group('course'): # A course page
			unescapeHTML = HTMLParser.HTMLParser().unescape

			course = mobj.group('course')
			info = {
				'id': _simplify_title(course),
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			try:
				coursepage = urllib2.urlopen(url).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			# Course title; fall back to the simplified id when absent
			m = re.search('<h1>([^<]+)</h1>', coursepage)
			if m:
				info['title'] = unescapeHTML(m.group(1))
			else:
				info['title'] = info['id']
			info['stitle'] = _simplify_title(info['title'])

			m = re.search('<description>([^<]+)</description>', coursepage)
			if m:
				info['description'] = unescapeHTML(m.group(1))

			# Collect (deduplicated, order-preserving) links to each video page
			links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
				}
					for vpage in links]

			# Recurse: each video page goes through the single-video branch
			for entry in info['list']:
				assert entry['type'] == 'reference'
				self.extract(entry['url'])
		else: # Root page
			unescapeHTML = HTMLParser.HTMLParser().unescape

			info = {
				'id': 'Stanford OpenClassroom',
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
			try:
				rootpage = urllib2.urlopen(rootURL).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			info['title'] = info['id']
			info['stitle'] = _simplify_title(info['title'])

			# Collect every course page linked from the home page
			links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
				}
					for cpage in links]

			# Recurse: each course page goes through the course branch above
			for entry in info['list']:
				assert entry['type'] == 'reference'
				self.extract(entry['url'])
3914
3915 class MTVIE(InfoExtractor):
3916         """Information extractor for MTV.com"""
3917
3918         _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3919         IE_NAME = u'mtv'
3920
3921         def report_webpage(self, video_id):
3922                 """Report information extraction."""
3923                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3924
3925         def report_extraction(self, video_id):
3926                 """Report information extraction."""
3927                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3928
3929         def _real_extract(self, url):
3930                 mobj = re.match(self._VALID_URL, url)
3931                 if mobj is None:
3932                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3933                         return
3934                 if not mobj.group('proto'):
3935                         url = 'http://' + url
3936                 video_id = mobj.group('videoid')
3937                 self.report_webpage(video_id)
3938
3939                 request = urllib2.Request(url)
3940                 try:
3941                         webpage = urllib2.urlopen(request).read()
3942                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3943                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3944                         return
3945
3946                 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3947                 if mobj is None:
3948                         self._downloader.trouble(u'ERROR: unable to extract song name')
3949                         return
3950                 song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3951                 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3952                 if mobj is None:
3953                         self._downloader.trouble(u'ERROR: unable to extract performer')
3954                         return
3955                 performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3956                 video_title = performer + ' - ' + song_name 
3957
3958                 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3959                 if mobj is None:
3960                         self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3961                         return
3962                 mtvn_uri = mobj.group(1)
3963
3964                 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3965                 if mobj is None:
3966                         self._downloader.trouble(u'ERROR: unable to extract content id')
3967                         return
3968                 content_id = mobj.group(1)
3969
3970                 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3971                 self.report_extraction(video_id)
3972                 request = urllib2.Request(videogen_url)
3973                 try:
3974                         metadataXml = urllib2.urlopen(request).read()
3975                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3976                         self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
3977                         return
3978
3979                 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3980                 renditions = mdoc.findall('.//rendition')
3981
3982                 # For now, always pick the highest quality.
3983                 rendition = renditions[-1]
3984
3985                 try:
3986                         _,_,ext = rendition.attrib['type'].partition('/')
3987                         format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3988                         video_url = rendition.find('./src').text
3989                 except KeyError:
3990                         self._downloader.trouble('Invalid rendition field.')
3991                         return
3992
3993                 self._downloader.increment_downloads()
3994                 info = {
3995                         'id': video_id,
3996                         'url': video_url,
3997                         'uploader': performer,
3998                         'title': video_title,
3999                         'stitle': _simplify_title(video_title),
4000                         'ext': ext,
4001                         'format': format,
4002                 }
4003
4004                 try:
4005                         self._downloader.process_info(info)
4006                 except UnavailableVideoError, err:
4007                         self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
4008
4009
class PostProcessor(object):
	"""Base class for post processors.

	A PostProcessor is registered with a downloader via the
	downloader's add_post_processor() method. After each successful
	download, the downloader walks its chain of post processors and
	calls run() on each, feeding the dictionary returned by one
	processor into the next.

	The chain stops as soon as a processor returns None, or when the
	last processor has run.

	Post processors and downloaders reference each other in the same
	"mutual registration" fashion as InfoExtractor objects.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach this post processor to the given downloader."""
		self._downloader = downloader

	def run(self, information):
		"""Process one finished download.

		The "information" dictionary has the same shape as the ones
		built by InfoExtractors, plus a "filepath" key naming the
		downloaded file on disk.

		Return None to stop the postprocessing chain, or an info
		dictionary (possibly the same one, possibly modified) to pass
		along to the next processor. Implementations may also raise
		PostProcessingError, which the calling downloader handles.
		"""
		# Default behaviour: hand the information through untouched.
		return information
4055
class AudioConversionError(BaseException):
	"""Raised when ffmpeg/ffprobe fails while extracting audio."""

	def __init__(self, message):
		# Human-readable reason, shown to the user by the caller.
		self.message = message
4059
class FFmpegExtractAudioPP(PostProcessor):
	"""Post processor that extracts the audio track of a downloaded
	video using ffmpeg/ffprobe, optionally transcoding to a preferred
	codec/quality, and (unless keepvideo) removes the original file."""

	def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		# target codec: 'best', 'mp3', 'aac', 'm4a', 'vorbis' or 'wav'
		self._preferredcodec = preferredcodec
		# value passed to ffmpeg's -ab option; may be None
		self._preferredquality = preferredquality
		# when True, keep the source video after extracting audio
		self._keepvideo = keepvideo

	@staticmethod
	def get_audio_codec(path):
		"""Return the audio codec name of the file at path via ffprobe,
		or None if ffprobe is missing, fails, or finds no audio stream."""
		try:
			cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
			handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
			output = handle.communicate()[0]
			if handle.wait() != 0:
				return None
		except (IOError, OSError):
			return None
		audio_codec = None
		# ffprobe prints codec_name= before codec_type= for each stream,
		# so remember the last codec_name and return it once we see an
		# audio stream.
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Run ffmpeg to convert path to out_path with the given audio
		codec (None lets ffmpeg pick) and extra options.

		Raises AudioConversionError when ffmpeg is absent or exits
		non-zero (carrying ffmpeg's last stderr line as the message)."""
		if codec is None:
			acodec_opts = []
		else:
			acodec_opts = ['-acodec', codec]
		cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
		try:
			p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			stdout,stderr = p.communicate()
		except (IOError, OSError):
			e = sys.exc_info()[1]
			# errno 2 == ENOENT: the ffmpeg binary itself is missing
			if isinstance(e, OSError) and e.errno == 2:
				raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
			else:
				raise e
		if p.returncode != 0:
			msg = stderr.strip().split('\n')[-1]
			raise AudioConversionError(msg)

	def run(self, information):
		"""Extract/convert the audio of information['filepath'].

		Returns the information dict with 'filepath' updated to the new
		audio file, or None on failure (which stops the PP chain)."""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		# Decide target codec/extension. Prefer a lossless stream copy
		# whenever the source already matches the requested codec.
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
			if self._preferredcodec == 'm4a' and filecodec == 'aac':
				# Lossless, but in another container
				acodec = 'copy'
				extension = self._preferredcodec
				more_opts = ['-absf', 'aac_adtstoasc']
			elif filecodec in ['aac', 'mp3', 'vorbis']:
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					more_opts = ['-f', 'adts']
				if filecodec == 'vorbis':
					extension = 'ogg'
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = []
				if self._preferredquality is not None:
					more_opts += ['-ab', self._preferredquality]
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = []
			if self._preferredquality is not None:
				more_opts += ['-ab', self._preferredquality]
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']
			if self._preferredcodec == 'm4a':
				more_opts += ['-absf', 'aac_adtstoasc']
			if self._preferredcodec == 'vorbis':
				extension = 'ogg'
			if self._preferredcodec == 'wav':
				extension = 'wav'
				more_opts += ['-f', 'wav']

		prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
		new_path = prefix + sep + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
		try:
			self.run_ffmpeg(path, new_path, acodec, more_opts)
		except:
			etype,e,tb = sys.exc_info()
			if isinstance(e, AudioConversionError):
				self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
			else:
				self._downloader.to_stderr(u'ERROR: error running ffmpeg')
			return None

		# Try to update the date time for extracted audio file.
		if information.get('filetime') is not None:
			try:
				os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
			except:
				self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

		if not self._keepvideo:
			try:
				os.remove(_encodeFilename(path))
			except (IOError, OSError):
				self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
				return None

		information['filepath'] = new_path
		return information
4184
4185
4186 def updateSelf(downloader, filename):
4187         ''' Update the program file with the latest version from the repository '''
4188         # Note: downloader only used for options
4189         if not os.access(filename, os.W_OK):
4190                 sys.exit('ERROR: no write permissions on %s' % filename)
4191
4192         downloader.to_screen(u'Updating to latest version...')
4193
4194         try:
4195                 try:
4196                         urlh = urllib.urlopen(UPDATE_URL)
4197                         newcontent = urlh.read()
4198                         
4199                         vmatch = re.search("__version__ = '([^']+)'", newcontent)
4200                         if vmatch is not None and vmatch.group(1) == __version__:
4201                                 downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
4202                                 return
4203                 finally:
4204                         urlh.close()
4205         except (IOError, OSError), err:
4206                 sys.exit('ERROR: unable to download latest version')
4207
4208         try:
4209                 outf = open(filename, 'wb')
4210                 try:
4211                         outf.write(newcontent)
4212                 finally:
4213                         outf.close()
4214         except (IOError, OSError), err:
4215                 sys.exit('ERROR: unable to overwrite current version')
4216
4217         downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
4218
def parseOpts():
	"""Build the optparse parser, merge config files with argv and parse.

	Option sources, in increasing priority: /etc/youtube-dl.conf, the
	per-user config file ($XDG_CONFIG_HOME/youtube-dl.conf or
	~/.config/youtube-dl.conf), then sys.argv[1:].

	Returns the (parser, opts, args) triple.
	"""
	def _readOptions(filename_bytes):
		# Return the option tokens found in a config file, or [] if the
		# file cannot be opened.
		try:
			optionf = open(filename_bytes)
		except IOError:
			return [] # silently skip if file is not present
		try:
			res = []
			for l in optionf:
				res += shlex.split(l, comments=True)
		finally:
			optionf.close()
		return res

	def _format_option_string(option):
		''' ('-o', '--option') -> -o, --format METAVAR'''

		opts = []

		if option._short_opts: opts.append(option._short_opts[0])
		if option._long_opts: opts.append(option._long_opts[0])
		if len(opts) > 1: opts.insert(1, ', ')

		if option.takes_value(): opts.append(' %s' % option.metavar)

		return "".join(opts)

	def _find_term_columns():
		# Best-effort detection of the terminal width; returns None when it
		# cannot be determined.
		columns = os.environ.get('COLUMNS', None)
		if columns:
			try:
				return int(columns)
			except ValueError:
				# A non-numeric COLUMNS value used to crash startup with an
				# uncaught ValueError; fall through to stty instead.
				pass

		try:
			sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			out,err = sp.communicate()
			return int(out.split()[1])
		except:
			pass
		return None

	max_width = 80
	max_help_position = 80

	# No need to wrap help messages if we're on a wide console
	columns = _find_term_columns()
	if columns: max_width = columns

	fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
	fmt.format_option_strings = _format_option_string

	# conflict_handler='resolve' lets a later add_option() take over an
	# already-used option string: per the optparse docs this is how the
	# verbosity group's -v/--verbose below overrides the general -v/--version.
	kw = {
		'version'   : __version__,
		'formatter' : fmt,
		'usage' : '%prog [options] url [url...]',
		'conflict_handler' : 'resolve',
	}

	parser = optparse.OptionParser(**kw)

	# option groups
	general        = optparse.OptionGroup(parser, 'General Options')
	selection      = optparse.OptionGroup(parser, 'Video Selection')
	authentication = optparse.OptionGroup(parser, 'Authentication Options')
	video_format   = optparse.OptionGroup(parser, 'Video Format Options')
	postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
	filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
	verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

	general.add_option('-h', '--help',
			action='help', help='print this help text and exit')
	general.add_option('-v', '--version',
			action='version', help='print program version and exit')
	general.add_option('-U', '--update',
			action='store_true', dest='update_self', help='update this program to latest version')
	general.add_option('-i', '--ignore-errors',
			action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
	general.add_option('-r', '--rate-limit',
			dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
	general.add_option('-R', '--retries',
			dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
	general.add_option('--dump-user-agent',
			action='store_true', dest='dump_user_agent',
			help='display the current browser identification', default=False)
	general.add_option('--list-extractors',
			action='store_true', dest='list_extractors',
			help='List all supported extractors and the URLs they would handle', default=False)

	selection.add_option('--playlist-start',
			dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
	selection.add_option('--playlist-end',
			dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
	selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
	selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
	selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)

	authentication.add_option('-u', '--username',
			dest='username', metavar='USERNAME', help='account username')
	authentication.add_option('-p', '--password',
			dest='password', metavar='PASSWORD', help='account password')
	authentication.add_option('-n', '--netrc',
			action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)


	video_format.add_option('-f', '--format',
			action='store', dest='format', metavar='FORMAT', help='video format code')
	video_format.add_option('--all-formats',
			action='store_const', dest='format', help='download all available video formats', const='all')
	video_format.add_option('--prefer-free-formats',
			action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
	video_format.add_option('--max-quality',
			action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
	video_format.add_option('-F', '--list-formats',
			action='store_true', dest='listformats', help='list all available formats (currently youtube only)')


	verbosity.add_option('-q', '--quiet',
			action='store_true', dest='quiet', help='activates quiet mode', default=False)
	verbosity.add_option('-s', '--simulate',
			action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
	verbosity.add_option('--skip-download',
			action='store_true', dest='skip_download', help='do not download the video', default=False)
	verbosity.add_option('-g', '--get-url',
			action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
	verbosity.add_option('-e', '--get-title',
			action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
	verbosity.add_option('--get-thumbnail',
			action='store_true', dest='getthumbnail',
			help='simulate, quiet but print thumbnail URL', default=False)
	verbosity.add_option('--get-description',
			action='store_true', dest='getdescription',
			help='simulate, quiet but print video description', default=False)
	verbosity.add_option('--get-filename',
			action='store_true', dest='getfilename',
			help='simulate, quiet but print output filename', default=False)
	verbosity.add_option('--get-format',
			action='store_true', dest='getformat',
			help='simulate, quiet but print output format', default=False)
	verbosity.add_option('--no-progress',
			action='store_true', dest='noprogress', help='do not print progress bar', default=False)
	verbosity.add_option('--console-title',
			action='store_true', dest='consoletitle',
			help='display progress in console titlebar', default=False)
	verbosity.add_option('-v', '--verbose',
			action='store_true', dest='verbose', help='print various debugging information', default=False)


	filesystem.add_option('-t', '--title',
			action='store_true', dest='usetitle', help='use title in file name', default=False)
	filesystem.add_option('-l', '--literal',
			action='store_true', dest='useliteral', help='use literal title in file name', default=False)
	filesystem.add_option('-A', '--auto-number',
			action='store_true', dest='autonumber',
			help='number downloaded files starting from 00000', default=False)
	filesystem.add_option('-o', '--output',
			dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
	filesystem.add_option('-a', '--batch-file',
			dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
	filesystem.add_option('-w', '--no-overwrites',
			action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
	filesystem.add_option('-c', '--continue',
			action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
	filesystem.add_option('--no-continue',
			action='store_false', dest='continue_dl',
			help='do not resume partially downloaded files (restart from beginning)')
	filesystem.add_option('--cookies',
			dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
	filesystem.add_option('--no-part',
			action='store_true', dest='nopart', help='do not use .part files', default=False)
	filesystem.add_option('--no-mtime',
			action='store_false', dest='updatetime',
			help='do not use the Last-modified header to set the file modification time', default=True)
	filesystem.add_option('--write-description',
			action='store_true', dest='writedescription',
			help='write video description to a .description file', default=False)
	filesystem.add_option('--write-info-json',
			action='store_true', dest='writeinfojson',
			help='write video metadata to a .info.json file', default=False)


	postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
			help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
	postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
			help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
	postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
			help='ffmpeg audio bitrate specification, 128k by default')
	postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
			help='keeps the video file on disk after the post-processing; the video is erased by default')


	parser.add_option_group(general)
	parser.add_option_group(selection)
	parser.add_option_group(filesystem)
	parser.add_option_group(verbosity)
	parser.add_option_group(video_format)
	parser.add_option_group(authentication)
	parser.add_option_group(postproc)

	# Config files are prepended to argv so command-line options win.
	xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
	if xdg_config_home:
		userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
	else:
		userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
	argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
	opts, args = parser.parse_args(argv)

	return parser, opts, args
4425
def gen_extractors():
	""" Return a list of an instance of every supported extractor.
	The order does matter; the first extractor matched is the one handling the URL.
	"""
	# These three are shared: their playlist/user/search wrappers delegate
	# single-video extraction to the same instance.
	yt_ie = YoutubeIE()
	goog_ie = GoogleIE()
	yah_ie = YahooIE()

	ies = [
		YoutubePlaylistIE(yt_ie),
		YoutubeUserIE(yt_ie),
		YoutubeSearchIE(yt_ie),
		yt_ie,
		MetacafeIE(yt_ie),
		DailymotionIE(),
		goog_ie,
		GoogleSearchIE(goog_ie),
		PhotobucketIE(),
		yah_ie,
		YahooSearchIE(yah_ie),
		DepositFilesIE(),
		FacebookIE(),
		BlipTVIE(),
		VimeoIE(),
		MyVideoIE(),
		ComedyCentralIE(),
		EscapistIE(),
		CollegeHumorIE(),
		XVideosIE(),
		SoundcloudIE(),
		InfoQIE(),
		MixcloudIE(),
		StanfordOpenClassroomIE(),
		MTVIE(),
	]
	# The generic fallback must come last, since it matches almost any URL.
	ies.append(GenericIE())
	return ies
4462
def _real_main():
	"""Program body: parse options, validate them, build the FileDownloader
	and run the download. Always terminates via sys.exit(); the exit status
	is the downloader's return code (101 when --max-downloads aborts)."""
	parser, opts, args = parseOpts()

	# Open appropriate CookieJar
	if opts.cookiefile is None:
		jar = cookielib.CookieJar()
	else:
		try:
			jar = cookielib.MozillaCookieJar(opts.cookiefile)
			# A missing cookie file is fine (it is created on save); only
			# load when it exists and is readable.
			if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
				jar.load()
		except (IOError, OSError), err:
			sys.exit(u'ERROR: unable to open cookie file')

	# Dump user agent
	if opts.dump_user_agent:
		print std_headers['User-Agent']
		sys.exit(0)

	# Batch file verification
	batchurls = []
	if opts.batchfile is not None:
		try:
			if opts.batchfile == '-':
				batchfd = sys.stdin
			else:
				batchfd = open(opts.batchfile, 'r')
			batchurls = batchfd.readlines()
			batchurls = [x.strip() for x in batchurls]
			# Skip blank lines and comment lines starting with '#', '/' or ';'.
			batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
		except IOError:
			sys.exit(u'ERROR: batch file could not be read')
	all_urls = batchurls + args

	# General configuration
	# Install a global urllib2 opener so every extractor shares the cookie
	# jar, the environment proxy settings and the gzip/deflate handler.
	cookie_processor = urllib2.HTTPCookieProcessor(jar)
	proxy_handler = urllib2.ProxyHandler()
	opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
	urllib2.install_opener(opener)
	socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

	if opts.verbose:
		print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))

	extractors = gen_extractors()

	if opts.list_extractors:
		# For each extractor, also show which of the given URLs it would
		# handle; each URL is claimed by the first matching extractor only.
		for ie in extractors:
			print(ie.IE_NAME)
			matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
			all_urls = filter(lambda url: url not in matchedUrls, all_urls)
			for mu in matchedUrls:
				print(u'  ' + mu)
		sys.exit(0)

	# Conflicting, missing and erroneous options
	if opts.usenetrc and (opts.username is not None or opts.password is not None):
		parser.error(u'using .netrc conflicts with giving username/password')
	if opts.password is not None and opts.username is None:
		parser.error(u'account username missing')
	if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
		parser.error(u'using output template conflicts with using title, literal title or auto number')
	if opts.usetitle and opts.useliteral:
		parser.error(u'using title conflicts with using literal title')
	if opts.username is not None and opts.password is None:
		opts.password = getpass.getpass(u'Type account password and press return:')
	if opts.ratelimit is not None:
		# Convert e.g. '50k' to a byte count; None signals a parse failure.
		numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
		if numeric_limit is None:
			parser.error(u'invalid rate limit specified')
		opts.ratelimit = numeric_limit
	if opts.retries is not None:
		try:
			opts.retries = long(opts.retries)
		except (TypeError, ValueError), err:
			parser.error(u'invalid retry count specified')
	try:
		opts.playliststart = int(opts.playliststart)
		if opts.playliststart <= 0:
			raise ValueError(u'Playlist start must be positive')
	except (TypeError, ValueError), err:
		parser.error(u'invalid playlist start number specified')
	try:
		# -1 means "until the end of the playlist".
		opts.playlistend = int(opts.playlistend)
		if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
			raise ValueError(u'Playlist end must be greater than playlist start')
	except (TypeError, ValueError), err:
		parser.error(u'invalid playlist end number specified')
	if opts.extractaudio:
		if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
			parser.error(u'invalid audio format specified')

	# File downloader
	fd = FileDownloader({
		'usenetrc': opts.usenetrc,
		'username': opts.username,
		'password': opts.password,
		# Any of the --get-* flags implies quiet mode and skipping the
		# actual download (see 'skip_download' below).
		'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
		'forceurl': opts.geturl,
		'forcetitle': opts.gettitle,
		'forcethumbnail': opts.getthumbnail,
		'forcedescription': opts.getdescription,
		'forcefilename': opts.getfilename,
		'forceformat': opts.getformat,
		'simulate': opts.simulate,
		'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
		'format': opts.format,
		'format_limit': opts.format_limit,
		'listformats': opts.listformats,
		# Output template: an explicit -o wins; otherwise derive one from
		# the format/title/literal/autonumber flags, falling back to
		# '%(id)s.%(ext)s'. The and/or chain picks the first truthy value.
		'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
			or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
			or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
			or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
			or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
			or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
			or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
			or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
			or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
			or u'%(id)s.%(ext)s'),
		'ignoreerrors': opts.ignoreerrors,
		'ratelimit': opts.ratelimit,
		'nooverwrites': opts.nooverwrites,
		'retries': opts.retries,
		'continuedl': opts.continue_dl,
		'noprogress': opts.noprogress,
		'playliststart': opts.playliststart,
		'playlistend': opts.playlistend,
		# Writing the video to stdout means progress must go to stderr.
		'logtostderr': opts.outtmpl == '-',
		'consoletitle': opts.consoletitle,
		'nopart': opts.nopart,
		'updatetime': opts.updatetime,
		'writedescription': opts.writedescription,
		'writeinfojson': opts.writeinfojson,
		'matchtitle': opts.matchtitle,
		'rejecttitle': opts.rejecttitle,
		'max_downloads': opts.max_downloads,
		'prefer_free_formats': opts.prefer_free_formats,
		'verbose': opts.verbose,
		})
	for extractor in extractors:
		fd.add_info_extractor(extractor)

	# PostProcessors
	if opts.extractaudio:
		fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))

	# Update version
	if opts.update_self:
		updateSelf(fd, sys.argv[0])

	# Maybe do nothing
	if len(all_urls) < 1:
		if not opts.update_self:
			parser.error(u'you must provide at least one URL')
		else:
			sys.exit()

	try:
		retcode = fd.download(all_urls)
	except MaxDownloadsReached:
		fd.to_screen(u'--max-download limit reached, aborting.')
		retcode = 101

	# Dump cookie jar if requested
	if opts.cookiefile is not None:
		try:
			jar.save()
		except (IOError, OSError), err:
			sys.exit(u'ERROR: unable to save cookie jar')

	sys.exit(retcode)
4634
def main():
	"""Top-level entry point: run _real_main and translate the known
	terminating exceptions into clean exit codes/messages."""
	try:
		_real_main()
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except DownloadError:
		sys.exit(1)
4644
# Standard script entry point guard.
if __name__ == '__main__':
	main()
4647
4648 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: