release 2012.01.05
[youtube-dl.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
# People credited as authors of this script.
__authors__ = (
        'Ricardo Garcia Gonzalez',
        'Danny Colligan',
        'Benjamin Johnson',
        "Vasyl' Vavrychuk",
        'Witold Baryluk',
        'Paweł Paprota',
        'Gergely Imreh',
        'Rogério Brito',
        'Philipp Hagemeister',
        'Sören Schulze',
        'Kevin Ngo',
        'Ori Avtalion',
        'shizeeg',
)

# Licence and release identifier of this copy of the script.
__license__ = 'Public Domain'
__version__ = '2012.01.05'

# Address of the script's master copy (presumably what the self-update
# feature fetches -- the code using it is not visible from here).
UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
24
25 import cookielib
26 import datetime
27 import gzip
28 import htmlentitydefs
29 import HTMLParser
30 import httplib
31 import locale
32 import math
33 import netrc
34 import os
35 import os.path
36 import re
37 import socket
38 import string
39 import subprocess
40 import sys
41 import time
42 import urllib
43 import urllib2
44 import warnings
45 import zlib
46
47 if os.name == 'nt':
48         import ctypes
49
50 try:
51         import email.utils
52 except ImportError: # Python 2.4
53         import email.Utils
54 try:
55         import cStringIO as StringIO
56 except ImportError:
57         import StringIO
58
59 # parse_qs was moved from the cgi module to the urlparse module recently.
60 try:
61         from urlparse import parse_qs
62 except ImportError:
63         from cgi import parse_qs
64
65 try:
66         import lxml.etree
67 except ImportError:
68         pass # Handled below
69
70 try:
71         import xml.etree.ElementTree
72 except ImportError: # Python<2.5: Not officially supported, but let it slip
73         warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
74
# Browser-like headers that YoutubeDLHandler.http_request() forces onto
# every outgoing HTTP request.
std_headers = {
        # Identify as a desktop Firefox.
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
        # Content negotiation.
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept-Language': 'en-us,en;q=0.5',
        # Compressed bodies are unpacked again in YoutubeDLHandler.http_response().
        'Accept-Encoding': 'gzip, deflate',
}
82
try:
        import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
        import re
        class json(object):
                """Decode-only stand-in used when the standard json module is missing."""

                @staticmethod
                def loads(s):
                        """Parse the JSON document s and return the equivalent Python value.

                        s is either a UTF-8 encoded byte string or a unicode string.
                        Objects become dicts, arrays become lists, strings become
                        unicode strings, numbers become int or float, and
                        true/false/null become True/False/None.

                        Raises ValueError if s is not well-formed JSON.
                        """
                        # Only byte strings need decoding: calling decode() on a unicode
                        # string first encodes it as ASCII and fails on any non-ASCII
                        # character.
                        if not isinstance(s, unicode):
                                s = s.decode('UTF-8')

                        # Patterns are compiled once per document rather than once per
                        # string/number. A high-surrogate escape is only paired with a
                        # following *low*-surrogate escape; otherwise each \uXXXX is
                        # decoded on its own.
                        escapeRe = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[dD][c-fC-F][0-9a-fA-F]{2}|u[0-9a-fA-F]{4}|.|$)')
                        numberRe = re.compile(r'(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)')
                        # Single-character escapes and the characters they stand for.
                        STATIC_ESCAPES = {
                                '"': '"',
                                '\\': '\\',
                                '/': '/',
                                'b': unichr(0x8),
                                'f': unichr(0xc),
                                'n': '\n',
                                'r': '\r',
                                't': '\t',
                        }

                        def raiseError(msg, i):
                                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
                        def skipSpace(i, expectMore=True):
                                """Return the index of the first non-blank character at or after i."""
                                while i < len(s) and s[i] in ' \t\r\n':
                                        i += 1
                                if expectMore:
                                        if i >= len(s):
                                                raiseError('Premature end', i)
                                return i
                        def decodeEscape(match):
                                """Translate one backslash escape (without the backslash) to text."""
                                esc = match.group(1)
                                if esc in STATIC_ESCAPES:
                                        return STATIC_ESCAPES[esc]
                                # esc[:1] rather than esc[0]: an empty escape must end up in
                                # the ValueError below, not in an IndexError.
                                if esc[:1] == 'u':
                                        if len(esc) == 1+4:
                                                return unichr(int(esc[1:5], 16))
                                        if len(esc) == 5+6 and esc[5:7] == '\\u':
                                                hi = int(esc[1:5], 16)
                                                low = int(esc[7:11], 16)
                                                try:
                                                        return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                                                except ValueError:
                                                        # Narrow Python builds cannot represent code points
                                                        # above 0xFFFF in one unichr(); keep the pair.
                                                        return unichr(hi) + unichr(low)
                                raise ValueError('Unknown escape ' + str(esc))
                        def parseString(i):
                                i += 1
                                e = i
                                while True:
                                        e = s.find('"', e)
                                        if e < 0:
                                                raiseError('Unterminated string', i)
                                        # A quote preceded by an odd number of backslashes is
                                        # escaped and does not end the string.
                                        bslashes = 0
                                        while s[e-bslashes-1] == '\\':
                                                bslashes += 1
                                        if bslashes % 2 == 1:
                                                e += 1
                                                continue
                                        break
                                stri = escapeRe.sub(decodeEscape, s[i:e])
                                return (e+1,stri)
                        def parseObj(i):
                                i += 1
                                res = {}
                                i = skipSpace(i)
                                if s[i] == '}': # Empty dictionary
                                        return (i+1,res)
                                while True:
                                        if s[i] != '"':
                                                raiseError('Expected a string object key', i)
                                        i,key = parseString(i)
                                        i = skipSpace(i)
                                        if i >= len(s) or s[i] != ':':
                                                raiseError('Expected a colon', i)
                                        i,val = parse(i+1)
                                        res[key] = val
                                        i = skipSpace(i)
                                        if s[i] == '}':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected comma or closing curly brace', i)
                                        i = skipSpace(i+1)
                        def parseArray(i):
                                res = []
                                i = skipSpace(i+1)
                                if s[i] == ']': # Empty array
                                        return (i+1,res)
                                while True:
                                        i,val = parse(i)
                                        res.append(val)
                                        i = skipSpace(i) # Raise exception if premature end
                                        if s[i] == ']':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected a comma or closing bracket', i)
                                        i = skipSpace(i+1)
                        def parseDiscrete(i):
                                for k,v in {'true': True, 'false': False, 'null': None}.items():
                                        if s.startswith(k, i):
                                                return (i+len(k), v)
                                raiseError('Not a boolean (or null)', i)
                        def parseNumber(i):
                                # match(s, i) anchors at i without copying the rest of the
                                # document the way s[i:] does.
                                mobj = numberRe.match(s, i)
                                if mobj is None:
                                        raiseError('Not a number', i)
                                nums = mobj.group(1)
                                if '.' in nums or 'e' in nums or 'E' in nums:
                                        return (i+len(nums), float(nums))
                                return (i+len(nums), int(nums))
                        # Dispatch on the first character of a value; anything not listed
                        # is tried as a number.
                        CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
                        def parse(i):
                                i = skipSpace(i)
                                i,res = CHARMAP.get(s[i], parseNumber)(i)
                                i = skipSpace(i, False)
                                return (i,res)
                        i,res = parse(0)
                        if i < len(s):
                                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
                        return res
195
def preferredencoding():
        """Get preferred encoding.

        Returns the encoding reported by locale.getpreferredencoding() if it
        can actually be used to encode text, and 'UTF-8' otherwise.
        """
        try:
                pref = locale.getpreferredencoding()
                # Probe the codec: the locale may name an encoding Python
                # does not know (or none at all).
                u'TEST'.encode(pref)
        except Exception:
                # Deliberately broad for any locale/codec failure, but no longer
                # swallows KeyboardInterrupt/SystemExit like a bare except.
                pref = 'UTF-8'
        return pref
211
212
213 def htmlentity_transform(matchobj):
214         """Transforms an HTML entity to a Unicode character.
215
216         This function receives a match object and is intended to be used with
217         the re.sub() function.
218         """
219         entity = matchobj.group(1)
220
221         # Known non-numeric HTML entity
222         if entity in htmlentitydefs.name2codepoint:
223                 return unichr(htmlentitydefs.name2codepoint[entity])
224
225         # Unicode character
226         mobj = re.match(ur'(?u)#(x?\d+)', entity)
227         if mobj is not None:
228                 numstr = mobj.group(1)
229                 if numstr.startswith(u'x'):
230                         base = 16
231                         numstr = u'0%s' % numstr
232                 else:
233                         base = 10
234                 return unichr(long(numstr, base))
235
236         # Unknown entity in name, return its literal representation
237         return (u'&%s;' % entity)
238
239
240 def sanitize_title(utitle):
241         """Sanitizes a video title so it could be used as part of a filename."""
242         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
243         return utitle.replace(unicode(os.sep), u'%')
244
245
246 def sanitize_open(filename, open_mode):
247         """Try to open the given filename, and slightly tweak it if this fails.
248
249         Attempts to open the given filename. If this fails, it tries to change
250         the filename slightly, step by step, until it's either able to open it
251         or it fails and raises a final exception, like the standard open()
252         function.
253
254         It returns the tuple (stream, definitive_file_name).
255         """
256         try:
257                 if filename == u'-':
258                         if sys.platform == 'win32':
259                                 import msvcrt
260                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
261                         return (sys.stdout, filename)
262                 stream = open(filename, open_mode)
263                 return (stream, filename)
264         except (IOError, OSError), err:
265                 # In case of error, try to remove win32 forbidden chars
266                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
267
268                 # An exception here should be caught in the caller
269                 stream = open(filename, open_mode)
270                 return (stream, filename)
271
272
def timeconvert(timestr):
        """Convert an RFC 2822 time string into a system timestamp.

        Returns None when email.utils.parsedate_tz() cannot parse timestr.
        """
        parsed = email.utils.parsedate_tz(timestr)
        if parsed is None:
                return None
        return email.utils.mktime_tz(parsed)
280
281 def _simplify_title(title):
282         expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
283         return expr.sub(u'_', title).strip(u'_')
284
def _orderedSet(iterable):
        """ Return a list of the distinct elements of iterable, in first-seen order """
        unique = []
        for candidate in iterable:
                if candidate in unique:
                        continue
                unique.append(candidate)
        return unique
292
293 def _unescapeHTML(s):
294     """
295     @param s a string (of type unicode)
296     """
297     assert type(s) == type(u'')
298
299     htmlParser = HTMLParser.HTMLParser()
300     return htmlParser.unescape(s)
301
class DownloadError(Exception):
        """Signals that a download failed.

        FileDownloader objects raise it when they run into a download problem
        and have not been configured to continue on errors. The exception
        carries the corresponding error message.
        """
310
311
class SameFileError(Exception):
        """Signals an output file name clash.

        FileDownloader objects raise it when they detect that several
        downloads would have to be written to the same file on disk.
        """
319
320
class PostProcessingError(Exception):
        """Signals a failed postprocessing step.

        A PostProcessor's .run() method may raise it to report that its
        postprocessing task went wrong.
        """
328
class MaxDownloadsReached(Exception):
        """ Signals that the --max-downloads limit has been reached. """
332
333
class UnavailableVideoError(Exception):
        """Signals an unavailable format.

        Raised when a video is requested in a format that does not exist
        for that video.
        """
341
342
class ContentTooShortError(Exception):
        """Content Too Short exception.

        This exception may be raised by FileDownloader objects when a file they
        download is too small for what the server announced first, indicating
        the connection was probably interrupted.

        Attributes:
        downloaded -- amount actually received, in bytes
        expected   -- amount announced by the server, in bytes
        """
        # Both in bytes
        downloaded = None
        expected = None

        def __init__(self, downloaded, expected):
                # Initialise the base class so that args is populated; without
                # it the exception carries no arguments and prints as empty.
                Exception.__init__(self, downloaded, expected)
                self.downloaded = downloaded
                self.expected = expected

        def __str__(self):
                return 'content too short: downloaded %s bytes, expected %s bytes' % (self.downloaded, self.expected)
357
358
359 class YoutubeDLHandler(urllib2.HTTPHandler):
360         """Handler for HTTP requests and responses.
361
362         This class, when installed with an OpenerDirector, automatically adds
363         the standard headers to every HTTP request and handles gzipped and
364         deflated responses from web servers. If compression is to be avoided in
365         a particular request, the original request in the program code only has
366         to include the HTTP header "Youtubedl-No-Compression", which will be
367         removed before making the real request.
368
369         Part of this code was copied from:
370
371         http://techknack.net/python-urllib2-handlers/
372
373         Andrew Rowls, the author of that code, agreed to release it to the
374         public domain.
375         """
376
377         @staticmethod
378         def deflate(data):
379                 try:
380                         return zlib.decompress(data, -zlib.MAX_WBITS)
381                 except zlib.error:
382                         return zlib.decompress(data)
383
384         @staticmethod
385         def addinfourl_wrapper(stream, headers, url, code):
386                 if hasattr(urllib2.addinfourl, 'getcode'):
387                         return urllib2.addinfourl(stream, headers, url, code)
388                 ret = urllib2.addinfourl(stream, headers, url)
389                 ret.code = code
390                 return ret
391
392         def http_request(self, req):
393                 for h in std_headers:
394                         if h in req.headers:
395                                 del req.headers[h]
396                         req.add_header(h, std_headers[h])
397                 if 'Youtubedl-no-compression' in req.headers:
398                         if 'Accept-encoding' in req.headers:
399                                 del req.headers['Accept-encoding']
400                         del req.headers['Youtubedl-no-compression']
401                 return req
402
403         def http_response(self, req, resp):
404                 old_resp = resp
405                 # gzip
406                 if resp.headers.get('Content-encoding', '') == 'gzip':
407                         gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
408                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
409                         resp.msg = old_resp.msg
410                 # deflate
411                 if resp.headers.get('Content-encoding', '') == 'deflate':
412                         gz = StringIO.StringIO(self.deflate(resp.read()))
413                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
414                         resp.msg = old_resp.msg
415                 return resp
416
417
418 class FileDownloader(object):
419         """File Downloader class.
420
421         File downloader objects are the ones responsible for downloading the
422         actual video file and writing it to disk if the user has requested
423         it, among some other tasks. In most cases there should be one per
424         program. As, given a video URL, the downloader doesn't know how to
425         extract all the needed information, task that InfoExtractors do, it
426         has to pass the URL to one of them.
427
428         For this, file downloader objects have a method that allows
429         InfoExtractors to be registered in a given order. When it is passed
430         a URL, the file downloader hands it to the first InfoExtractor it
431         finds that reports being able to handle it. The InfoExtractor extracts
432         all the information about the video or videos the URL refers to, and
433         asks the FileDownloader to process the video information, possibly
434         downloading the video.
435
436         File downloaders accept a lot of parameters. In order not to saturate
437         the object constructor with arguments, it receives a dictionary of
438         options instead. These options are available through the params
439         attribute for the InfoExtractors to use. The FileDownloader also
440         registers itself as the downloader in charge for the InfoExtractors
441         that are added to it, so this is a "mutual registration".
442
443         Available options:
444
445         username:         Username for authentication purposes.
446         password:         Password for authentication purposes.
447         usenetrc:         Use netrc for authentication instead.
448         quiet:            Do not print messages to stdout.
449         forceurl:         Force printing final URL.
450         forcetitle:       Force printing title.
451         forcethumbnail:   Force printing thumbnail URL.
452         forcedescription: Force printing description.
453         forcefilename:    Force printing final filename.
454         simulate:         Do not download the video files.
455         format:           Video format code.
456         format_limit:     Highest quality format to try.
457         outtmpl:          Template for output names.
458         ignoreerrors:     Do not stop on download errors.
459         ratelimit:        Download speed limit, in bytes/sec.
460         nooverwrites:     Prevent overwriting files.
461         retries:          Number of times to retry for HTTP error 5xx
462         continuedl:       Try to continue downloads if possible.
463         noprogress:       Do not print the progress bar.
464         playliststart:    Playlist item to start at.
465         playlistend:      Playlist item to end at.
466         matchtitle:       Download only matching titles.
467         rejecttitle:      Reject downloads for matching titles.
468         logtostderr:      Log messages to stderr instead of stdout.
469         consoletitle:     Display progress in console window's titlebar.
470         nopart:           Do not use temporary .part files.
471         updatetime:       Use the Last-modified header to set output file timestamps.
472         writedescription: Write the video description to a .description file
473         writeinfojson:    Write the video description to a .info.json file
474         """
475
476         params = None
477         _ies = []
478         _pps = []
479         _download_retcode = None
480         _num_downloads = None
481         _screen_file = None
482
483         def __init__(self, params):
484                 """Create a FileDownloader object with the given options."""
485                 self._ies = []
486                 self._pps = []
487                 self._download_retcode = 0
488                 self._num_downloads = 0
489                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
490                 self.params = params
491
492         @staticmethod
493         def format_bytes(bytes):
494                 if bytes is None:
495                         return 'N/A'
496                 if type(bytes) is str:
497                         bytes = float(bytes)
498                 if bytes == 0.0:
499                         exponent = 0
500                 else:
501                         exponent = long(math.log(bytes, 1024.0))
502                 suffix = 'bkMGTPEZY'[exponent]
503                 converted = float(bytes) / float(1024 ** exponent)
504                 return '%.2f%s' % (converted, suffix)
505
506         @staticmethod
507         def calc_percent(byte_counter, data_len):
508                 if data_len is None:
509                         return '---.-%'
510                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
511
512         @staticmethod
513         def calc_eta(start, now, total, current):
514                 if total is None:
515                         return '--:--'
516                 dif = now - start
517                 if current == 0 or dif < 0.001: # One millisecond
518                         return '--:--'
519                 rate = float(current) / dif
520                 eta = long((float(total) - float(current)) / rate)
521                 (eta_mins, eta_secs) = divmod(eta, 60)
522                 if eta_mins > 99:
523                         return '--:--'
524                 return '%02d:%02d' % (eta_mins, eta_secs)
525
526         @staticmethod
527         def calc_speed(start, now, bytes):
528                 dif = now - start
529                 if bytes == 0 or dif < 0.001: # One millisecond
530                         return '%10s' % '---b/s'
531                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
532
533         @staticmethod
534         def best_block_size(elapsed_time, bytes):
535                 new_min = max(bytes / 2.0, 1.0)
536                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
537                 if elapsed_time < 0.001:
538                         return long(new_max)
539                 rate = bytes / elapsed_time
540                 if rate > new_max:
541                         return long(new_max)
542                 if rate < new_min:
543                         return long(new_min)
544                 return long(rate)
545
546         @staticmethod
547         def parse_bytes(bytestr):
548                 """Parse a string indicating a byte quantity into a long integer."""
549                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
550                 if matchobj is None:
551                         return None
552                 number = float(matchobj.group(1))
553                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
554                 return long(round(number * multiplier))
555
556         def add_info_extractor(self, ie):
557                 """Add an InfoExtractor object to the end of the list."""
558                 self._ies.append(ie)
559                 ie.set_downloader(self)
560
561         def add_post_processor(self, pp):
562                 """Add a PostProcessor object to the end of the chain."""
563                 self._pps.append(pp)
564                 pp.set_downloader(self)
565
566         def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
567                 """Print message to stdout if not in quiet mode."""
568                 try:
569                         if not self.params.get('quiet', False):
570                                 terminator = [u'\n', u''][skip_eol]
571                                 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
572                         self._screen_file.flush()
573                 except (UnicodeEncodeError), err:
574                         if not ignore_encoding_errors:
575                                 raise
576
577         def to_stderr(self, message):
578                 """Print message to stderr."""
579                 print >>sys.stderr, message.encode(preferredencoding())
580
581         def to_cons_title(self, message):
582                 """Set console/terminal window title to message."""
583                 if not self.params.get('consoletitle', False):
584                         return
585                 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
586                         # c_wchar_p() might not be necessary if `message` is
587                         # already of type unicode()
588                         ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
589                 elif 'TERM' in os.environ:
590                         sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
591
592         def fixed_template(self):
593                 """Checks if the output template is fixed."""
594                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
595
596         def trouble(self, message=None):
597                 """Determine action to take when a download problem appears.
598
599                 Depending on if the downloader has been configured to ignore
600                 download errors or not, this method may throw an exception or
601                 not when errors are found, after printing the message.
602                 """
603                 if message is not None:
604                         self.to_stderr(message)
605                 if not self.params.get('ignoreerrors', False):
606                         raise DownloadError(message)
607                 self._download_retcode = 1
608
609         def slow_down(self, start_time, byte_counter):
610                 """Sleep if the download speed is over the rate limit."""
611                 rate_limit = self.params.get('ratelimit', None)
612                 if rate_limit is None or byte_counter == 0:
613                         return
614                 now = time.time()
615                 elapsed = now - start_time
616                 if elapsed <= 0.0:
617                         return
618                 speed = float(byte_counter) / elapsed
619                 if speed > rate_limit:
620                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
621
622         def temp_name(self, filename):
623                 """Returns a temporary filename for the given filename."""
624                 if self.params.get('nopart', False) or filename == u'-' or \
625                                 (os.path.exists(filename) and not os.path.isfile(filename)):
626                         return filename
627                 return filename + u'.part'
628
629         def undo_temp_name(self, filename):
630                 if filename.endswith(u'.part'):
631                         return filename[:-len(u'.part')]
632                 return filename
633
634         def try_rename(self, old_filename, new_filename):
635                 try:
636                         if old_filename == new_filename:
637                                 return
638                         os.rename(old_filename, new_filename)
639                 except (IOError, OSError), err:
640                         self.trouble(u'ERROR: unable to rename file')
641
642         def try_utime(self, filename, last_modified_hdr):
643                 """Try to set the last-modified time of the given file."""
644                 if last_modified_hdr is None:
645                         return
646                 if not os.path.isfile(filename):
647                         return
648                 timestr = last_modified_hdr
649                 if timestr is None:
650                         return
651                 filetime = timeconvert(timestr)
652                 if filetime is None:
653                         return filetime
654                 try:
655                         os.utime(filename, (time.time(), filetime))
656                 except:
657                         pass
658                 return filetime
659
660         def report_writedescription(self, descfn):
661                 """ Report that the description file is being written """
662                 self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
663
664         def report_writeinfojson(self, infofn):
665                 """ Report that the metadata file has been written """
666                 self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
667
668         def report_destination(self, filename):
669                 """Report destination filename."""
670                 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
671
672         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
673                 """Report download progress."""
674                 if self.params.get('noprogress', False):
675                         return
676                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
677                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
678                 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
679                                 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
680
681         def report_resuming_byte(self, resume_len):
682                 """Report attempt to resume at given byte."""
683                 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
684
685         def report_retry(self, count, retries):
686                 """Report retry in case of HTTP error 5xx"""
687                 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
688
689         def report_file_already_downloaded(self, file_name):
690                 """Report file has already been fully downloaded."""
691                 try:
692                         self.to_screen(u'[download] %s has already been downloaded' % file_name)
693                 except (UnicodeEncodeError), err:
694                         self.to_screen(u'[download] The file has already been downloaded')
695
696         def report_unable_to_resume(self):
697                 """Report it was impossible to resume download."""
698                 self.to_screen(u'[download] Unable to resume')
699
700         def report_finish(self):
701                 """Report download finished."""
702                 if self.params.get('noprogress', False):
703                         self.to_screen(u'[download] Download completed')
704                 else:
705                         self.to_screen(u'')
706
707         def increment_downloads(self):
708                 """Increment the ordinal that assigns a number to each file."""
709                 self._num_downloads += 1
710
711         def prepare_filename(self, info_dict):
712                 """Generate the output filename."""
713                 try:
714                         template_dict = dict(info_dict)
715                         template_dict['epoch'] = unicode(long(time.time()))
716                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
717                         filename = self.params['outtmpl'] % template_dict
718                         return filename
719                 except (ValueError, KeyError), err:
720                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
721                         return None
722
723         def _match_entry(self, info_dict):
724                 """ Returns None iff the file should be downloaded """
725
726                 title = info_dict['title']
727                 matchtitle = self.params.get('matchtitle', False)
728                 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
729                         return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
730                 rejecttitle = self.params.get('rejecttitle', False)
731                 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
732                         return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
733                 return None
734
735         def process_info(self, info_dict):
736                 """Process a single dictionary returned by an InfoExtractor."""
737
738                 reason = self._match_entry(info_dict)
739                 if reason is not None:
740                         self.to_screen(u'[download] ' + reason)
741                         return
742
743                 max_downloads = self.params.get('max_downloads')
744                 if max_downloads is not None:
745                         if self._num_downloads > int(max_downloads):
746                                 raise MaxDownloadsReached()
747
748                 filename = self.prepare_filename(info_dict)
749                 
750                 # Forced printings
751                 if self.params.get('forcetitle', False):
752                         print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
753                 if self.params.get('forceurl', False):
754                         print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
755                 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
756                         print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
757                 if self.params.get('forcedescription', False) and 'description' in info_dict:
758                         print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
759                 if self.params.get('forcefilename', False) and filename is not None:
760                         print filename.encode(preferredencoding(), 'xmlcharrefreplace')
761                 if self.params.get('forceformat', False):
762                         print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
763
764                 # Do nothing else if in simulate mode
765                 if self.params.get('simulate', False):
766                         return
767
768                 if filename is None:
769                         return
770
771                 try:
772                         dn = os.path.dirname(filename)
773                         if dn != '' and not os.path.exists(dn):
774                                 os.makedirs(dn)
775                 except (OSError, IOError), err:
776                         self.trouble(u'ERROR: unable to create directory ' + unicode(err))
777                         return
778
779                 if self.params.get('writedescription', False):
780                         try:
781                                 descfn = filename + '.description'
782                                 self.report_writedescription(descfn)
783                                 descfile = open(descfn, 'wb')
784                                 try:
785                                         descfile.write(info_dict['description'].encode('utf-8'))
786                                 finally:
787                                         descfile.close()
788                         except (OSError, IOError):
789                                 self.trouble(u'ERROR: Cannot write description file ' + descfn)
790                                 return
791
792                 if self.params.get('writeinfojson', False):
793                         infofn = filename + '.info.json'
794                         self.report_writeinfojson(infofn)
795                         try:
796                                 json.dump
797                         except (NameError,AttributeError):
798                                 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
799                                 return
800                         try:
801                                 infof = open(infofn, 'wb')
802                                 try:
803                                         json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
804                                         json.dump(json_info_dict, infof)
805                                 finally:
806                                         infof.close()
807                         except (OSError, IOError):
808                                 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
809                                 return
810
811                 if not self.params.get('skip_download', False):
812                         if self.params.get('nooverwrites', False) and os.path.exists(filename):
813                                 success = True
814                         else:
815                                 try:
816                                         success = self._do_download(filename, info_dict)
817                                 except (OSError, IOError), err:
818                                         raise UnavailableVideoError
819                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
820                                         self.trouble(u'ERROR: unable to download video data: %s' % str(err))
821                                         return
822                                 except (ContentTooShortError, ), err:
823                                         self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
824                                         return
825         
826                         if success:
827                                 try:
828                                         self.post_process(filename, info_dict)
829                                 except (PostProcessingError), err:
830                                         self.trouble(u'ERROR: postprocessing: %s' % str(err))
831                                         return
832
833         def download(self, url_list):
834                 """Download a given list of URLs."""
835                 if len(url_list) > 1 and self.fixed_template():
836                         raise SameFileError(self.params['outtmpl'])
837
838                 for url in url_list:
839                         suitable_found = False
840                         for ie in self._ies:
841                                 # Go to next InfoExtractor if not suitable
842                                 if not ie.suitable(url):
843                                         continue
844
845                                 # Suitable InfoExtractor found
846                                 suitable_found = True
847
848                                 # Extract information from URL and process it
849                                 ie.extract(url)
850
851                                 # Suitable InfoExtractor had been found; go to next URL
852                                 break
853
854                         if not suitable_found:
855                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
856
857                 return self._download_retcode
858
859         def post_process(self, filename, ie_info):
860                 """Run the postprocessing chain on the given file."""
861                 info = dict(ie_info)
862                 info['filepath'] = filename
863                 for pp in self._pps:
864                         info = pp.run(info)
865                         if info is None:
866                                 break
867
868         def _download_with_rtmpdump(self, filename, url, player_url):
869                 self.report_destination(filename)
870                 tmpfilename = self.temp_name(filename)
871
872                 # Check for rtmpdump first
873                 try:
874                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
875                 except (OSError, IOError):
876                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
877                         return False
878
879                 # Download using rtmpdump. rtmpdump returns exit code 2 when
880                 # the connection was interrumpted and resuming appears to be
881                 # possible. This is part of rtmpdump's normal usage, AFAIK.
882                 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
883                 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
884                 while retval == 2 or retval == 1:
885                         prevsize = os.path.getsize(tmpfilename)
886                         self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
887                         time.sleep(5.0) # This seems to be needed
888                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
889                         cursize = os.path.getsize(tmpfilename)
890                         if prevsize == cursize and retval == 1:
891                                 break
892                          # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
893                         if prevsize == cursize and retval == 2 and cursize > 1024:
894                                 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
895                                 retval = 0
896                                 break
897                 if retval == 0:
898                         self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
899                         self.try_rename(tmpfilename, filename)
900                         return True
901                 else:
902                         self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
903                         return False
904
905         def _do_download(self, filename, info_dict):
906                 url = info_dict['url']
907                 player_url = info_dict.get('player_url', None)
908
909                 # Check file already present
910                 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
911                         self.report_file_already_downloaded(filename)
912                         return True
913
914                 # Attempt to download using rtmpdump
915                 if url.startswith('rtmp'):
916                         return self._download_with_rtmpdump(filename, url, player_url)
917
918                 tmpfilename = self.temp_name(filename)
919                 stream = None
920
921                 # Do not include the Accept-Encoding header
922                 headers = {'Youtubedl-no-compression': 'True'}
923                 basic_request = urllib2.Request(url, None, headers)
924                 request = urllib2.Request(url, None, headers)
925
926                 # Establish possible resume length
927                 if os.path.isfile(tmpfilename):
928                         resume_len = os.path.getsize(tmpfilename)
929                 else:
930                         resume_len = 0
931
932                 open_mode = 'wb'
933                 if resume_len != 0:
934                         if self.params.get('continuedl', False):
935                                 self.report_resuming_byte(resume_len)
936                                 request.add_header('Range','bytes=%d-' % resume_len)
937                                 open_mode = 'ab'
938                         else:
939                                 resume_len = 0
940
941                 count = 0
942                 retries = self.params.get('retries', 0)
943                 while count <= retries:
944                         # Establish connection
945                         try:
946                                 if count == 0 and 'urlhandle' in info_dict:
947                                         data = info_dict['urlhandle']
948                                 data = urllib2.urlopen(request)
949                                 break
950                         except (urllib2.HTTPError, ), err:
951                                 if (err.code < 500 or err.code >= 600) and err.code != 416:
952                                         # Unexpected HTTP error
953                                         raise
954                                 elif err.code == 416:
955                                         # Unable to resume (requested range not satisfiable)
956                                         try:
957                                                 # Open the connection again without the range header
958                                                 data = urllib2.urlopen(basic_request)
959                                                 content_length = data.info()['Content-Length']
960                                         except (urllib2.HTTPError, ), err:
961                                                 if err.code < 500 or err.code >= 600:
962                                                         raise
963                                         else:
964                                                 # Examine the reported length
965                                                 if (content_length is not None and
966                                                                 (resume_len - 100 < long(content_length) < resume_len + 100)):
967                                                         # The file had already been fully downloaded.
968                                                         # Explanation to the above condition: in issue #175 it was revealed that
969                                                         # YouTube sometimes adds or removes a few bytes from the end of the file,
970                                                         # changing the file size slightly and causing problems for some users. So
971                                                         # I decided to implement a suggested change and consider the file
972                                                         # completely downloaded if the file size differs less than 100 bytes from
973                                                         # the one in the hard drive.
974                                                         self.report_file_already_downloaded(filename)
975                                                         self.try_rename(tmpfilename, filename)
976                                                         return True
977                                                 else:
978                                                         # The length does not match, we start the download over
979                                                         self.report_unable_to_resume()
980                                                         open_mode = 'wb'
981                                                         break
982                         # Retry
983                         count += 1
984                         if count <= retries:
985                                 self.report_retry(count, retries)
986
987                 if count > retries:
988                         self.trouble(u'ERROR: giving up after %s retries' % retries)
989                         return False
990
991                 data_len = data.info().get('Content-length', None)
992                 if data_len is not None:
993                         data_len = long(data_len) + resume_len
994                 data_len_str = self.format_bytes(data_len)
995                 byte_counter = 0 + resume_len
996                 block_size = 1024
997                 start = time.time()
998                 while True:
999                         # Download and write
1000                         before = time.time()
1001                         data_block = data.read(block_size)
1002                         after = time.time()
1003                         if len(data_block) == 0:
1004                                 break
1005                         byte_counter += len(data_block)
1006
1007                         # Open file just in time
1008                         if stream is None:
1009                                 try:
1010                                         (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1011                                         assert stream is not None
1012                                         filename = self.undo_temp_name(tmpfilename)
1013                                         self.report_destination(filename)
1014                                 except (OSError, IOError), err:
1015                                         self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1016                                         return False
1017                         try:
1018                                 stream.write(data_block)
1019                         except (IOError, OSError), err:
1020                                 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1021                                 return False
1022                         block_size = self.best_block_size(after - before, len(data_block))
1023
1024                         # Progress message
1025                         speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1026                         if data_len is None:
1027                                 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1028                         else:
1029                                 percent_str = self.calc_percent(byte_counter, data_len)
1030                                 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1031                                 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1032
1033                         # Apply rate limit
1034                         self.slow_down(start, byte_counter - resume_len)
1035
1036                 if stream is None:
1037                         self.trouble(u'\nERROR: Did not get any data blocks')
1038                         return False
1039                 stream.close()
1040                 self.report_finish()
1041                 if data_len is not None and byte_counter != data_len:
1042                         raise ContentTooShortError(byte_counter, long(data_len))
1043                 self.try_rename(tmpfilename, filename)
1044
1045                 # Update file modification time
1046                 if self.params.get('updatetime', True):
1047                         info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1048
1049                 return True
1050
1051
class InfoExtractor(object):
        """Base class of all information extractors.

        An information extractor takes a URL and works out the data of the
        video (or videos) behind it: the real video URL, the title and
        simplified title, the author and so on. That data travels as a
        dictionary to the FileDownloader, which acts on it, possibly by
        downloading the video to the file system. Every dictionary must
        carry these fields:

        id:             Video identifier.
        url:            Final video URL.
        uploader:       Nickname of the video uploader.
        title:          Literal title.
        stitle:         Simplified title.
        ext:            Video filename extension.
        format:         Video format.
        player_url:     SWF Player URL (may be None).

        These fields are optional. They mainly let youtube-dl act as the
        backend of a video search function such as the one in youtube2mp3,
        and are only used when the matching forced printing is requested:

        thumbnail:      Full URL to a video thumbnail image.
        description:    One-line video description.

        A subclass should provide its own _real_initialize() and
        _real_extract() plus a _VALID_URL regexp, and most likely has to be
        added to the list of extractors as well.
        """

        # Becomes True after the first successful initialize()
        _ready = False
        # Downloader the extractor reports to; see set_downloader()
        _downloader = None

        def __init__(self, downloader=None):
                """Create the extractor, optionally attached to a downloader."""
                self._ready = False
                self.set_downloader(downloader)

        def suitable(self, url):
                """Return True if url matches the _VALID_URL pattern of this IE."""
                matched = re.match(self._VALID_URL, url)
                return matched is not None

        def initialize(self):
                """Run the real initialization (authentication, etc) only once."""
                if self._ready:
                        return
                self._real_initialize()
                self._ready = True

        def extract(self, url):
                """Initialize if needed, then return what _real_extract() yields."""
                self.initialize()
                extracted = self._real_extract(url)
                return extracted

        def set_downloader(self, downloader):
                """Attach the downloader this IE works for."""
                self._downloader = downloader

        def _real_initialize(self):
                """Actual initialization. Does nothing here; redefine in subclasses."""

        def _real_extract(self, url):
                """Actual extraction. Does nothing here; redefine in subclasses."""
1120
1121
1122 class YoutubeIE(InfoExtractor):
1123         """Information extractor for youtube.com."""
1124
1125         _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1126         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1127         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1128         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1129         _NETRC_MACHINE = 'youtube'
1130         # Listed in order of quality
1131         _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1132         _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
1133         _video_extensions = {
1134                 '13': '3gp',
1135                 '17': 'mp4',
1136                 '18': 'mp4',
1137                 '22': 'mp4',
1138                 '37': 'mp4',
1139                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1140                 '43': 'webm',
1141                 '44': 'webm',
1142                 '45': 'webm',
1143         }
1144         _video_dimensions = {
1145                 '5': '240x400',
1146                 '6': '???',
1147                 '13': '???',
1148                 '17': '144x176',
1149                 '18': '360x640',
1150                 '22': '720x1280',
1151                 '34': '360x640',
1152                 '35': '480x854',
1153                 '37': '1080x1920',
1154                 '38': '3072x4096',
1155                 '43': '360x640',
1156                 '44': '480x854',
1157                 '45': '720x1280',
1158         }       
1159         IE_NAME = u'youtube'
1160
1161         def report_lang(self):
1162                 """Report attempt to set language."""
1163                 self._downloader.to_screen(u'[youtube] Setting language')
1164
1165         def report_login(self):
1166                 """Report attempt to log in."""
1167                 self._downloader.to_screen(u'[youtube] Logging in')
1168
1169         def report_age_confirmation(self):
1170                 """Report attempt to confirm age."""
1171                 self._downloader.to_screen(u'[youtube] Confirming age')
1172
1173         def report_video_webpage_download(self, video_id):
1174                 """Report attempt to download video webpage."""
1175                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1176
1177         def report_video_info_webpage_download(self, video_id):
1178                 """Report attempt to download video info webpage."""
1179                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1180
1181         def report_information_extraction(self, video_id):
1182                 """Report attempt to extract video information."""
1183                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1184
1185         def report_unavailable_format(self, video_id, format):
1186                 """Report extracted video URL."""
1187                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1188
1189         def report_rtmp_download(self):
1190                 """Indicate the download will use the RTMP protocol."""
1191                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1192
1193         def _print_formats(self, formats):
1194                 print 'Available formats:'
1195                 for x in formats:
1196                         print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1197
1198         def _real_initialize(self):
1199                 if self._downloader is None:
1200                         return
1201
1202                 username = None
1203                 password = None
1204                 downloader_params = self._downloader.params
1205
1206                 # Attempt to use provided username and password or .netrc data
1207                 if downloader_params.get('username', None) is not None:
1208                         username = downloader_params['username']
1209                         password = downloader_params['password']
1210                 elif downloader_params.get('usenetrc', False):
1211                         try:
1212                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1213                                 if info is not None:
1214                                         username = info[0]
1215                                         password = info[2]
1216                                 else:
1217                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1218                         except (IOError, netrc.NetrcParseError), err:
1219                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1220                                 return
1221
1222                 # Set language
1223                 request = urllib2.Request(self._LANG_URL)
1224                 try:
1225                         self.report_lang()
1226                         urllib2.urlopen(request).read()
1227                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1228                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1229                         return
1230
1231                 # No authentication to be performed
1232                 if username is None:
1233                         return
1234
1235                 # Log in
1236                 login_form = {
1237                                 'current_form': 'loginForm',
1238                                 'next':         '/',
1239                                 'action_login': 'Log In',
1240                                 'username':     username,
1241                                 'password':     password,
1242                                 }
1243                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1244                 try:
1245                         self.report_login()
1246                         login_results = urllib2.urlopen(request).read()
1247                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1248                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1249                                 return
1250                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1251                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1252                         return
1253
1254                 # Confirm age
1255                 age_form = {
1256                                 'next_url':             '/',
1257                                 'action_confirm':       'Confirm',
1258                                 }
1259                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1260                 try:
1261                         self.report_age_confirmation()
1262                         age_results = urllib2.urlopen(request).read()
1263                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1264                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1265                         return
1266
1267         def _real_extract(self, url):
1268                 # Extract video id from URL
1269                 mobj = re.match(self._VALID_URL, url)
1270                 if mobj is None:
1271                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1272                         return
1273                 video_id = mobj.group(2)
1274
1275                 # Get video webpage
1276                 self.report_video_webpage_download(video_id)
1277                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1278                 try:
1279                         video_webpage = urllib2.urlopen(request).read()
1280                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1281                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1282                         return
1283
1284                 # Attempt to extract SWF player URL
1285                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1286                 if mobj is not None:
1287                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1288                 else:
1289                         player_url = None
1290
1291                 # Get video info
1292                 self.report_video_info_webpage_download(video_id)
1293                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1294                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1295                                         % (video_id, el_type))
1296                         request = urllib2.Request(video_info_url)
1297                         try:
1298                                 video_info_webpage = urllib2.urlopen(request).read()
1299                                 video_info = parse_qs(video_info_webpage)
1300                                 if 'token' in video_info:
1301                                         break
1302                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1303                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1304                                 return
1305                 if 'token' not in video_info:
1306                         if 'reason' in video_info:
1307                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1308                         else:
1309                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1310                         return
1311
1312                 # Start extracting information
1313                 self.report_information_extraction(video_id)
1314
1315                 # uploader
1316                 if 'author' not in video_info:
1317                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1318                         return
1319                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1320
1321                 # title
1322                 if 'title' not in video_info:
1323                         self._downloader.trouble(u'ERROR: unable to extract video title')
1324                         return
1325                 video_title = urllib.unquote_plus(video_info['title'][0])
1326                 video_title = video_title.decode('utf-8')
1327                 video_title = sanitize_title(video_title)
1328
1329                 # simplified title
1330                 simple_title = _simplify_title(video_title)
1331
1332                 # thumbnail image
1333                 if 'thumbnail_url' not in video_info:
1334                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1335                         video_thumbnail = ''
1336                 else:   # don't panic if we can't find it
1337                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1338
1339                 # upload date
1340                 upload_date = u'NA'
1341                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1342                 if mobj is not None:
1343                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1344                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1345                         for expression in format_expressions:
1346                                 try:
1347                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1348                                 except:
1349                                         pass
1350
1351                 # description
1352                 try:
1353                         lxml.etree
1354                 except NameError:
1355                         video_description = u'No description available.'
1356                         if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1357                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1358                                 if mobj is not None:
1359                                         video_description = mobj.group(1).decode('utf-8')
1360                 else:
1361                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1362                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1363                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1364                         # TODO use another parser
1365
1366                 # token
1367                 video_token = urllib.unquote_plus(video_info['token'][0])
1368
1369                 # Decide which formats to download
1370                 req_format = self._downloader.params.get('format', None)
1371
1372                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1373                         self.report_rtmp_download()
1374                         video_url_list = [(None, video_info['conn'][0])]
1375                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1376                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1377                         url_data = [parse_qs(uds) for uds in url_data_strs]
1378                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1379                         url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1380
1381                         format_limit = self._downloader.params.get('format_limit', None)
1382                         available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1383                         if format_limit is not None and format_limit in available_formats:
1384                                 format_list = available_formats[available_formats.index(format_limit):]
1385                         else:
1386                                 format_list = available_formats
1387                         existing_formats = [x for x in format_list if x in url_map]
1388                         if len(existing_formats) == 0:
1389                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1390                                 return
1391                         if self._downloader.params.get('listformats', None):
1392                                 self._print_formats(existing_formats)
1393                                 return
1394                         if req_format is None or req_format == 'best':
1395                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1396                         elif req_format == 'worst':
1397                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1398                         elif req_format in ('-1', 'all'):
1399                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1400                         else:
1401                                 # Specific formats. We pick the first in a slash-delimeted sequence.
1402                                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1403                                 req_formats = req_format.split('/')
1404                                 video_url_list = None
1405                                 for rf in req_formats:
1406                                         if rf in url_map:
1407                                                 video_url_list = [(rf, url_map[rf])]
1408                                                 break
1409                                 if video_url_list is None:
1410                                         self._downloader.trouble(u'ERROR: requested format not available')
1411                                         return
1412                 else:
1413                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1414                         return
1415
1416                 for format_param, video_real_url in video_url_list:
1417                         # At this point we have a new video
1418                         self._downloader.increment_downloads()
1419
1420                         # Extension
1421                         video_extension = self._video_extensions.get(format_param, 'flv')
1422
1423                         try:
1424                                 # Process video information
1425                                 self._downloader.process_info({
1426                                         'id':           video_id.decode('utf-8'),
1427                                         'url':          video_real_url.decode('utf-8'),
1428                                         'uploader':     video_uploader.decode('utf-8'),
1429                                         'upload_date':  upload_date,
1430                                         'title':        video_title,
1431                                         'stitle':       simple_title,
1432                                         'ext':          video_extension.decode('utf-8'),
1433                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1434                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1435                                         'description':  video_description,
1436                                         'player_url':   player_url,
1437                                 })
1438                         except UnavailableVideoError, err:
1439                                 self._downloader.trouble(u'\nERROR: unable to download video')
1440
1441
1442 class MetacafeIE(InfoExtractor):
1443         """Information Extractor for metacafe.com."""
1444
1445         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1446         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1447         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1448         _youtube_ie = None
1449         IE_NAME = u'metacafe'
1450
1451         def __init__(self, youtube_ie, downloader=None):
1452                 InfoExtractor.__init__(self, downloader)
1453                 self._youtube_ie = youtube_ie
1454
1455         def report_disclaimer(self):
1456                 """Report disclaimer retrieval."""
1457                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1458
1459         def report_age_confirmation(self):
1460                 """Report attempt to confirm age."""
1461                 self._downloader.to_screen(u'[metacafe] Confirming age')
1462
1463         def report_download_webpage(self, video_id):
1464                 """Report webpage download."""
1465                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1466
1467         def report_extraction(self, video_id):
1468                 """Report information extraction."""
1469                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1470
1471         def _real_initialize(self):
1472                 # Retrieve disclaimer
1473                 request = urllib2.Request(self._DISCLAIMER)
1474                 try:
1475                         self.report_disclaimer()
1476                         disclaimer = urllib2.urlopen(request).read()
1477                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1478                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1479                         return
1480
1481                 # Confirm age
1482                 disclaimer_form = {
1483                         'filters': '0',
1484                         'submit': "Continue - I'm over 18",
1485                         }
1486                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1487                 try:
1488                         self.report_age_confirmation()
1489                         disclaimer = urllib2.urlopen(request).read()
1490                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1491                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1492                         return
1493
1494         def _real_extract(self, url):
1495                 # Extract id and simplified title from URL
1496                 mobj = re.match(self._VALID_URL, url)
1497                 if mobj is None:
1498                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1499                         return
1500
1501                 video_id = mobj.group(1)
1502
1503                 # Check if video comes from YouTube
1504                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1505                 if mobj2 is not None:
1506                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1507                         return
1508
1509                 # At this point we have a new video
1510                 self._downloader.increment_downloads()
1511
1512                 simple_title = mobj.group(2).decode('utf-8')
1513
1514                 # Retrieve video webpage to extract further information
1515                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1516                 try:
1517                         self.report_download_webpage(video_id)
1518                         webpage = urllib2.urlopen(request).read()
1519                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1520                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1521                         return
1522
1523                 # Extract URL, uploader and title from webpage
1524                 self.report_extraction(video_id)
1525                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1526                 if mobj is not None:
1527                         mediaURL = urllib.unquote(mobj.group(1))
1528                         video_extension = mediaURL[-3:]
1529
1530                         # Extract gdaKey if available
1531                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1532                         if mobj is None:
1533                                 video_url = mediaURL
1534                         else:
1535                                 gdaKey = mobj.group(1)
1536                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1537                 else:
1538                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1539                         if mobj is None:
1540                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1541                                 return
1542                         vardict = parse_qs(mobj.group(1))
1543                         if 'mediaData' not in vardict:
1544                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1545                                 return
1546                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1547                         if mobj is None:
1548                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1549                                 return
1550                         mediaURL = mobj.group(1).replace('\\/', '/')
1551                         video_extension = mediaURL[-3:]
1552                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1553
1554                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1555                 if mobj is None:
1556                         self._downloader.trouble(u'ERROR: unable to extract title')
1557                         return
1558                 video_title = mobj.group(1).decode('utf-8')
1559                 video_title = sanitize_title(video_title)
1560
1561                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1562                 if mobj is None:
1563                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1564                         return
1565                 video_uploader = mobj.group(1)
1566
1567                 try:
1568                         # Process video information
1569                         self._downloader.process_info({
1570                                 'id':           video_id.decode('utf-8'),
1571                                 'url':          video_url.decode('utf-8'),
1572                                 'uploader':     video_uploader.decode('utf-8'),
1573                                 'upload_date':  u'NA',
1574                                 'title':        video_title,
1575                                 'stitle':       simple_title,
1576                                 'ext':          video_extension.decode('utf-8'),
1577                                 'format':       u'NA',
1578                                 'player_url':   None,
1579                         })
1580                 except UnavailableVideoError:
1581                         self._downloader.trouble(u'\nERROR: unable to download video')
1582
1583
1584 class DailymotionIE(InfoExtractor):
1585         """Information Extractor for Dailymotion"""
1586
1587         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1588         IE_NAME = u'dailymotion'
1589
1590         def __init__(self, downloader=None):
1591                 InfoExtractor.__init__(self, downloader)
1592
1593         def report_download_webpage(self, video_id):
1594                 """Report webpage download."""
1595                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1596
1597         def report_extraction(self, video_id):
1598                 """Report information extraction."""
1599                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1600
1601         def _real_extract(self, url):
1602                 # Extract id and simplified title from URL
1603                 mobj = re.match(self._VALID_URL, url)
1604                 if mobj is None:
1605                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1606                         return
1607
1608                 # At this point we have a new video
1609                 self._downloader.increment_downloads()
1610                 video_id = mobj.group(1)
1611
1612                 video_extension = 'flv'
1613
1614                 # Retrieve video webpage to extract further information
1615                 request = urllib2.Request(url)
1616                 request.add_header('Cookie', 'family_filter=off')
1617                 try:
1618                         self.report_download_webpage(video_id)
1619                         webpage = urllib2.urlopen(request).read()
1620                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1621                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1622                         return
1623
1624                 # Extract URL, uploader and title from webpage
1625                 self.report_extraction(video_id)
1626                 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1627                 if mobj is None:
1628                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1629                         return
1630                 sequence = urllib.unquote(mobj.group(1))
1631                 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1632                 if mobj is None:
1633                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1634                         return
1635                 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1636
1637                 # if needed add http://www.dailymotion.com/ if relative URL
1638
1639                 video_url = mediaURL
1640
1641                 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
1642                 if mobj is None:
1643                         self._downloader.trouble(u'ERROR: unable to extract title')
1644                         return
1645                 video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
1646                 video_title = sanitize_title(video_title)
1647                 simple_title = _simplify_title(video_title)
1648
1649                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1650                 if mobj is None:
1651                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1652                         return
1653                 video_uploader = mobj.group(1)
1654
1655                 try:
1656                         # Process video information
1657                         self._downloader.process_info({
1658                                 'id':           video_id.decode('utf-8'),
1659                                 'url':          video_url.decode('utf-8'),
1660                                 'uploader':     video_uploader.decode('utf-8'),
1661                                 'upload_date':  u'NA',
1662                                 'title':        video_title,
1663                                 'stitle':       simple_title,
1664                                 'ext':          video_extension.decode('utf-8'),
1665                                 'format':       u'NA',
1666                                 'player_url':   None,
1667                         })
1668                 except UnavailableVideoError:
1669                         self._downloader.trouble(u'\nERROR: unable to download video')
1670
1671
1672 class GoogleIE(InfoExtractor):
1673         """Information extractor for video.google.com."""
1674
1675         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1676         IE_NAME = u'video.google'
1677
1678         def __init__(self, downloader=None):
1679                 InfoExtractor.__init__(self, downloader)
1680
1681         def report_download_webpage(self, video_id):
1682                 """Report webpage download."""
1683                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1684
1685         def report_extraction(self, video_id):
1686                 """Report information extraction."""
1687                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1688
1689         def _real_extract(self, url):
1690                 # Extract id from URL
1691                 mobj = re.match(self._VALID_URL, url)
1692                 if mobj is None:
1693                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1694                         return
1695
1696                 # At this point we have a new video
1697                 self._downloader.increment_downloads()
1698                 video_id = mobj.group(1)
1699
1700                 video_extension = 'mp4'
1701
1702                 # Retrieve video webpage to extract further information
1703                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1704                 try:
1705                         self.report_download_webpage(video_id)
1706                         webpage = urllib2.urlopen(request).read()
1707                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1708                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1709                         return
1710
1711                 # Extract URL, uploader, and title from webpage
1712                 self.report_extraction(video_id)
1713                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1714                 if mobj is None:
1715                         video_extension = 'flv'
1716                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1717                 if mobj is None:
1718                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1719                         return
1720                 mediaURL = urllib.unquote(mobj.group(1))
1721                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1722                 mediaURL = mediaURL.replace('\\x26', '\x26')
1723
1724                 video_url = mediaURL
1725
1726                 mobj = re.search(r'<title>(.*)</title>', webpage)
1727                 if mobj is None:
1728                         self._downloader.trouble(u'ERROR: unable to extract title')
1729                         return
1730                 video_title = mobj.group(1).decode('utf-8')
1731                 video_title = sanitize_title(video_title)
1732                 simple_title = _simplify_title(video_title)
1733
1734                 # Extract video description
1735                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1736                 if mobj is None:
1737                         self._downloader.trouble(u'ERROR: unable to extract video description')
1738                         return
1739                 video_description = mobj.group(1).decode('utf-8')
1740                 if not video_description:
1741                         video_description = 'No description available.'
1742
1743                 # Extract video thumbnail
1744                 if self._downloader.params.get('forcethumbnail', False):
1745                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1746                         try:
1747                                 webpage = urllib2.urlopen(request).read()
1748                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1749                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1750                                 return
1751                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1752                         if mobj is None:
1753                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1754                                 return
1755                         video_thumbnail = mobj.group(1)
1756                 else:   # we need something to pass to process_info
1757                         video_thumbnail = ''
1758
1759                 try:
1760                         # Process video information
1761                         self._downloader.process_info({
1762                                 'id':           video_id.decode('utf-8'),
1763                                 'url':          video_url.decode('utf-8'),
1764                                 'uploader':     u'NA',
1765                                 'upload_date':  u'NA',
1766                                 'title':        video_title,
1767                                 'stitle':       simple_title,
1768                                 'ext':          video_extension.decode('utf-8'),
1769                                 'format':       u'NA',
1770                                 'player_url':   None,
1771                         })
1772                 except UnavailableVideoError:
1773                         self._downloader.trouble(u'\nERROR: unable to download video')
1774
1775
1776 class PhotobucketIE(InfoExtractor):
1777         """Information extractor for photobucket.com."""
1778
1779         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1780         IE_NAME = u'photobucket'
1781
1782         def __init__(self, downloader=None):
1783                 InfoExtractor.__init__(self, downloader)
1784
1785         def report_download_webpage(self, video_id):
1786                 """Report webpage download."""
1787                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1788
1789         def report_extraction(self, video_id):
1790                 """Report information extraction."""
1791                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1792
1793         def _real_extract(self, url):
1794                 # Extract id from URL
1795                 mobj = re.match(self._VALID_URL, url)
1796                 if mobj is None:
1797                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1798                         return
1799
1800                 # At this point we have a new video
1801                 self._downloader.increment_downloads()
1802                 video_id = mobj.group(1)
1803
1804                 video_extension = 'flv'
1805
1806                 # Retrieve video webpage to extract further information
1807                 request = urllib2.Request(url)
1808                 try:
1809                         self.report_download_webpage(video_id)
1810                         webpage = urllib2.urlopen(request).read()
1811                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1812                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1813                         return
1814
1815                 # Extract URL, uploader, and title from webpage
1816                 self.report_extraction(video_id)
1817                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1818                 if mobj is None:
1819                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1820                         return
1821                 mediaURL = urllib.unquote(mobj.group(1))
1822
1823                 video_url = mediaURL
1824
1825                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1826                 if mobj is None:
1827                         self._downloader.trouble(u'ERROR: unable to extract title')
1828                         return
1829                 video_title = mobj.group(1).decode('utf-8')
1830                 video_title = sanitize_title(video_title)
1831                 simple_title = _simplify_title(vide_title)
1832
1833                 video_uploader = mobj.group(2).decode('utf-8')
1834
1835                 try:
1836                         # Process video information
1837                         self._downloader.process_info({
1838                                 'id':           video_id.decode('utf-8'),
1839                                 'url':          video_url.decode('utf-8'),
1840                                 'uploader':     video_uploader,
1841                                 'upload_date':  u'NA',
1842                                 'title':        video_title,
1843                                 'stitle':       simple_title,
1844                                 'ext':          video_extension.decode('utf-8'),
1845                                 'format':       u'NA',
1846                                 'player_url':   None,
1847                         })
1848                 except UnavailableVideoError:
1849                         self._downloader.trouble(u'\nERROR: unable to download video')
1850
1851
1852 class YahooIE(InfoExtractor):
1853         """Information extractor for video.yahoo.com."""
1854
1855         # _VALID_URL matches all Yahoo! Video URLs
1856         # _VPAGE_URL matches only the extractable '/watch/' URLs
1857         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1858         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1859         IE_NAME = u'video.yahoo'
1860
1861         def __init__(self, downloader=None):
1862                 InfoExtractor.__init__(self, downloader)
1863
1864         def report_download_webpage(self, video_id):
1865                 """Report webpage download."""
1866                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1867
1868         def report_extraction(self, video_id):
1869                 """Report information extraction."""
1870                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1871
1872         def _real_extract(self, url, new_video=True):
1873                 # Extract ID from URL
1874                 mobj = re.match(self._VALID_URL, url)
1875                 if mobj is None:
1876                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1877                         return
1878
1879                 # At this point we have a new video
1880                 self._downloader.increment_downloads()
1881                 video_id = mobj.group(2)
1882                 video_extension = 'flv'
1883
1884                 # Rewrite valid but non-extractable URLs as
1885                 # extractable English language /watch/ URLs
1886                 if re.match(self._VPAGE_URL, url) is None:
1887                         request = urllib2.Request(url)
1888                         try:
1889                                 webpage = urllib2.urlopen(request).read()
1890                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1891                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1892                                 return
1893
1894                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1895                         if mobj is None:
1896                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1897                                 return
1898                         yahoo_id = mobj.group(1)
1899
1900                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1901                         if mobj is None:
1902                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1903                                 return
1904                         yahoo_vid = mobj.group(1)
1905
1906                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1907                         return self._real_extract(url, new_video=False)
1908
1909                 # Retrieve video webpage to extract further information
1910                 request = urllib2.Request(url)
1911                 try:
1912                         self.report_download_webpage(video_id)
1913                         webpage = urllib2.urlopen(request).read()
1914                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1915                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1916                         return
1917
1918                 # Extract uploader and title from webpage
1919                 self.report_extraction(video_id)
1920                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1921                 if mobj is None:
1922                         self._downloader.trouble(u'ERROR: unable to extract video title')
1923                         return
1924                 video_title = mobj.group(1).decode('utf-8')
1925                 simple_title = _simplify_title(video_title)
1926
1927                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1928                 if mobj is None:
1929                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1930                         return
1931                 video_uploader = mobj.group(1).decode('utf-8')
1932
1933                 # Extract video thumbnail
1934                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1935                 if mobj is None:
1936                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1937                         return
1938                 video_thumbnail = mobj.group(1).decode('utf-8')
1939
1940                 # Extract video description
1941                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1942                 if mobj is None:
1943                         self._downloader.trouble(u'ERROR: unable to extract video description')
1944                         return
1945                 video_description = mobj.group(1).decode('utf-8')
1946                 if not video_description:
1947                         video_description = 'No description available.'
1948
1949                 # Extract video height and width
1950                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1951                 if mobj is None:
1952                         self._downloader.trouble(u'ERROR: unable to extract video height')
1953                         return
1954                 yv_video_height = mobj.group(1)
1955
1956                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1957                 if mobj is None:
1958                         self._downloader.trouble(u'ERROR: unable to extract video width')
1959                         return
1960                 yv_video_width = mobj.group(1)
1961
1962                 # Retrieve video playlist to extract media URL
1963                 # I'm not completely sure what all these options are, but we
1964                 # seem to need most of them, otherwise the server sends a 401.
1965                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1966                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1967                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1968                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1969                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1970                 try:
1971                         self.report_download_webpage(video_id)
1972                         webpage = urllib2.urlopen(request).read()
1973                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1974                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1975                         return
1976
1977                 # Extract media URL from playlist XML
1978                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1979                 if mobj is None:
1980                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1981                         return
1982                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1983                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1984
1985                 try:
1986                         # Process video information
1987                         self._downloader.process_info({
1988                                 'id':           video_id.decode('utf-8'),
1989                                 'url':          video_url,
1990                                 'uploader':     video_uploader,
1991                                 'upload_date':  u'NA',
1992                                 'title':        video_title,
1993                                 'stitle':       simple_title,
1994                                 'ext':          video_extension.decode('utf-8'),
1995                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1996                                 'description':  video_description,
1997                                 'thumbnail':    video_thumbnail,
1998                                 'player_url':   None,
1999                         })
2000                 except UnavailableVideoError:
2001                         self._downloader.trouble(u'\nERROR: unable to download video')
2002
2003
2004 class VimeoIE(InfoExtractor):
2005         """Information extractor for vimeo.com."""
2006
2007         # _VALID_URL matches Vimeo URLs
2008         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2009         IE_NAME = u'vimeo'
2010
2011         def __init__(self, downloader=None):
2012                 InfoExtractor.__init__(self, downloader)
2013
2014         def report_download_webpage(self, video_id):
2015                 """Report webpage download."""
2016                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2017
2018         def report_extraction(self, video_id):
2019                 """Report information extraction."""
2020                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
2021
2022         def _real_extract(self, url, new_video=True):
2023                 # Extract ID from URL
2024                 mobj = re.match(self._VALID_URL, url)
2025                 if mobj is None:
2026                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2027                         return
2028
2029                 # At this point we have a new video
2030                 self._downloader.increment_downloads()
2031                 video_id = mobj.group(1)
2032
2033                 # Retrieve video webpage to extract further information
2034                 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
2035                 try:
2036                         self.report_download_webpage(video_id)
2037                         webpage = urllib2.urlopen(request).read()
2038                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2039                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2040                         return
2041
2042                 # Now we begin extracting as much information as we can from what we
2043                 # retrieved. First we extract the information common to all extractors,
2044                 # and latter we extract those that are Vimeo specific.
2045                 self.report_extraction(video_id)
2046
2047                 # Extract title
2048                 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2049                 if mobj is None:
2050                         self._downloader.trouble(u'ERROR: unable to extract video title')
2051                         return
2052                 video_title = mobj.group(1).decode('utf-8')
2053                 simple_title = _simplify_title(video_title)
2054
2055                 # Extract uploader
2056                 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2057                 if mobj is None:
2058                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
2059                         return
2060                 video_uploader = mobj.group(1).decode('utf-8')
2061
2062                 # Extract video thumbnail
2063                 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2064                 if mobj is None:
2065                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2066                         return
2067                 video_thumbnail = mobj.group(1).decode('utf-8')
2068
2069                 # # Extract video description
2070                 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2071                 # if mobj is None:
2072                 #       self._downloader.trouble(u'ERROR: unable to extract video description')
2073                 #       return
2074                 # video_description = mobj.group(1).decode('utf-8')
2075                 # if not video_description: video_description = 'No description available.'
2076                 video_description = 'Foo.'
2077
2078                 # Vimeo specific: extract request signature
2079                 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2080                 if mobj is None:
2081                         self._downloader.trouble(u'ERROR: unable to extract request signature')
2082                         return
2083                 sig = mobj.group(1).decode('utf-8')
2084
2085                 # Vimeo specific: extract video quality information
2086                 mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)
2087                 if mobj is None:
2088                         self._downloader.trouble(u'ERROR: unable to extract video quality information')
2089                         return
2090                 quality = mobj.group(1).decode('utf-8')
2091
2092                 if int(quality) == 1:
2093                         quality = 'hd'
2094                 else:
2095                         quality = 'sd'
2096
2097                 # Vimeo specific: Extract request signature expiration
2098                 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2099                 if mobj is None:
2100                         self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2101                         return
2102                 sig_exp = mobj.group(1).decode('utf-8')
2103
2104                 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)
2105
2106                 try:
2107                         # Process video information
2108                         self._downloader.process_info({
2109                                 'id':           video_id.decode('utf-8'),
2110                                 'url':          video_url,
2111                                 'uploader':     video_uploader,
2112                                 'upload_date':  u'NA',
2113                                 'title':        video_title,
2114                                 'stitle':       simple_title,
2115                                 'ext':          u'mp4',
2116                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2117                                 'description':  video_description,
2118                                 'thumbnail':    video_thumbnail,
2119                                 'description':  video_description,
2120                                 'player_url':   None,
2121                         })
2122                 except UnavailableVideoError:
2123                         self._downloader.trouble(u'ERROR: unable to download video')
2124
2125
2126 class GenericIE(InfoExtractor):
2127         """Generic last-resort information extractor."""
2128
2129         _VALID_URL = r'.*'
2130         IE_NAME = u'generic'
2131
2132         def __init__(self, downloader=None):
2133                 InfoExtractor.__init__(self, downloader)
2134
2135         def report_download_webpage(self, video_id):
2136                 """Report webpage download."""
2137                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2138                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2139
2140         def report_extraction(self, video_id):
2141                 """Report information extraction."""
2142                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2143
2144         def _real_extract(self, url):
2145                 # At this point we have a new video
2146                 self._downloader.increment_downloads()
2147
2148                 video_id = url.split('/')[-1]
2149                 request = urllib2.Request(url)
2150                 try:
2151                         self.report_download_webpage(video_id)
2152                         webpage = urllib2.urlopen(request).read()
2153                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2154                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2155                         return
2156                 except ValueError, err:
2157                         # since this is the last-resort InfoExtractor, if
2158                         # this error is thrown, it'll be thrown here
2159                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2160                         return
2161
2162                 self.report_extraction(video_id)
2163                 # Start with something easy: JW Player in SWFObject
2164                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2165                 if mobj is None:
2166                         # Broaden the search a little bit
2167                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2168                 if mobj is None:
2169                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2170                         return
2171
2172                 # It's possible that one of the regexes
2173                 # matched, but returned an empty group:
2174                 if mobj.group(1) is None:
2175                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2176                         return
2177
2178                 video_url = urllib.unquote(mobj.group(1))
2179                 video_id = os.path.basename(video_url)
2180
2181                 # here's a fun little line of code for you:
2182                 video_extension = os.path.splitext(video_id)[1][1:]
2183                 video_id = os.path.splitext(video_id)[0]
2184
2185                 # it's tempting to parse this further, but you would
2186                 # have to take into account all the variations like
2187                 #   Video Title - Site Name
2188                 #   Site Name | Video Title
2189                 #   Video Title - Tagline | Site Name
2190                 # and so on and so forth; it's just not practical
2191                 mobj = re.search(r'<title>(.*)</title>', webpage)
2192                 if mobj is None:
2193                         self._downloader.trouble(u'ERROR: unable to extract title')
2194                         return
2195                 video_title = mobj.group(1).decode('utf-8')
2196                 video_title = sanitize_title(video_title)
2197                 simple_title = _simplify_title(video_title)
2198
2199                 # video uploader is domain name
2200                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2201                 if mobj is None:
2202                         self._downloader.trouble(u'ERROR: unable to extract title')
2203                         return
2204                 video_uploader = mobj.group(1).decode('utf-8')
2205
2206                 try:
2207                         # Process video information
2208                         self._downloader.process_info({
2209                                 'id':           video_id.decode('utf-8'),
2210                                 'url':          video_url.decode('utf-8'),
2211                                 'uploader':     video_uploader,
2212                                 'upload_date':  u'NA',
2213                                 'title':        video_title,
2214                                 'stitle':       simple_title,
2215                                 'ext':          video_extension.decode('utf-8'),
2216                                 'format':       u'NA',
2217                                 'player_url':   None,
2218                         })
2219                 except UnavailableVideoError, err:
2220                         self._downloader.trouble(u'\nERROR: unable to download video')
2221
2222
2223 class YoutubeSearchIE(InfoExtractor):
2224         """Information Extractor for YouTube search queries."""
2225         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2226         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2227         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2228         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2229         _youtube_ie = None
2230         _max_youtube_results = 1000
2231         IE_NAME = u'youtube:search'
2232
2233         def __init__(self, youtube_ie, downloader=None):
2234                 InfoExtractor.__init__(self, downloader)
2235                 self._youtube_ie = youtube_ie
2236
2237         def report_download_page(self, query, pagenum):
2238                 """Report attempt to download playlist page with given number."""
2239                 query = query.decode(preferredencoding())
2240                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2241
2242         def _real_initialize(self):
2243                 self._youtube_ie.initialize()
2244
2245         def _real_extract(self, query):
2246                 mobj = re.match(self._VALID_URL, query)
2247                 if mobj is None:
2248                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2249                         return
2250
2251                 prefix, query = query.split(':')
2252                 prefix = prefix[8:]
2253                 query = query.encode('utf-8')
2254                 if prefix == '':
2255                         self._download_n_results(query, 1)
2256                         return
2257                 elif prefix == 'all':
2258                         self._download_n_results(query, self._max_youtube_results)
2259                         return
2260                 else:
2261                         try:
2262                                 n = long(prefix)
2263                                 if n <= 0:
2264                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2265                                         return
2266                                 elif n > self._max_youtube_results:
2267                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2268                                         n = self._max_youtube_results
2269                                 self._download_n_results(query, n)
2270                                 return
2271                         except ValueError: # parsing prefix as integer fails
2272                                 self._download_n_results(query, 1)
2273                                 return
2274
2275         def _download_n_results(self, query, n):
2276                 """Downloads a specified number of results for a query"""
2277
2278                 video_ids = []
2279                 already_seen = set()
2280                 pagenum = 1
2281
2282                 while True:
2283                         self.report_download_page(query, pagenum)
2284                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2285                         request = urllib2.Request(result_url)
2286                         try:
2287                                 page = urllib2.urlopen(request).read()
2288                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2289                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2290                                 return
2291
2292                         # Extract video identifiers
2293                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2294                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2295                                 if video_id not in already_seen:
2296                                         video_ids.append(video_id)
2297                                         already_seen.add(video_id)
2298                                         if len(video_ids) == n:
2299                                                 # Specified n videos reached
2300                                                 for id in video_ids:
2301                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2302                                                 return
2303
2304                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2305                                 for id in video_ids:
2306                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2307                                 return
2308
2309                         pagenum = pagenum + 1
2310
2311
2312 class GoogleSearchIE(InfoExtractor):
2313         """Information Extractor for Google Video search queries."""
2314         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2315         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2316         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2317         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2318         _google_ie = None
2319         _max_google_results = 1000
2320         IE_NAME = u'video.google:search'
2321
2322         def __init__(self, google_ie, downloader=None):
2323                 InfoExtractor.__init__(self, downloader)
2324                 self._google_ie = google_ie
2325
2326         def report_download_page(self, query, pagenum):
2327                 """Report attempt to download playlist page with given number."""
2328                 query = query.decode(preferredencoding())
2329                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2330
2331         def _real_initialize(self):
2332                 self._google_ie.initialize()
2333
2334         def _real_extract(self, query):
2335                 mobj = re.match(self._VALID_URL, query)
2336                 if mobj is None:
2337                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2338                         return
2339
2340                 prefix, query = query.split(':')
2341                 prefix = prefix[8:]
2342                 query = query.encode('utf-8')
2343                 if prefix == '':
2344                         self._download_n_results(query, 1)
2345                         return
2346                 elif prefix == 'all':
2347                         self._download_n_results(query, self._max_google_results)
2348                         return
2349                 else:
2350                         try:
2351                                 n = long(prefix)
2352                                 if n <= 0:
2353                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2354                                         return
2355                                 elif n > self._max_google_results:
2356                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2357                                         n = self._max_google_results
2358                                 self._download_n_results(query, n)
2359                                 return
2360                         except ValueError: # parsing prefix as integer fails
2361                                 self._download_n_results(query, 1)
2362                                 return
2363
2364         def _download_n_results(self, query, n):
2365                 """Downloads a specified number of results for a query"""
2366
2367                 video_ids = []
2368                 already_seen = set()
2369                 pagenum = 1
2370
2371                 while True:
2372                         self.report_download_page(query, pagenum)
2373                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2374                         request = urllib2.Request(result_url)
2375                         try:
2376                                 page = urllib2.urlopen(request).read()
2377                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2378                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2379                                 return
2380
2381                         # Extract video identifiers
2382                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2383                                 video_id = mobj.group(1)
2384                                 if video_id not in already_seen:
2385                                         video_ids.append(video_id)
2386                                         already_seen.add(video_id)
2387                                         if len(video_ids) == n:
2388                                                 # Specified n videos reached
2389                                                 for id in video_ids:
2390                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2391                                                 return
2392
2393                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2394                                 for id in video_ids:
2395                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2396                                 return
2397
2398                         pagenum = pagenum + 1
2399
2400
2401 class YahooSearchIE(InfoExtractor):
2402         """Information Extractor for Yahoo! Video search queries."""
2403         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2404         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2405         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2406         _MORE_PAGES_INDICATOR = r'\s*Next'
2407         _yahoo_ie = None
2408         _max_yahoo_results = 1000
2409         IE_NAME = u'video.yahoo:search'
2410
2411         def __init__(self, yahoo_ie, downloader=None):
2412                 InfoExtractor.__init__(self, downloader)
2413                 self._yahoo_ie = yahoo_ie
2414
2415         def report_download_page(self, query, pagenum):
2416                 """Report attempt to download playlist page with given number."""
2417                 query = query.decode(preferredencoding())
2418                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2419
2420         def _real_initialize(self):
2421                 self._yahoo_ie.initialize()
2422
2423         def _real_extract(self, query):
2424                 mobj = re.match(self._VALID_URL, query)
2425                 if mobj is None:
2426                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2427                         return
2428
2429                 prefix, query = query.split(':')
2430                 prefix = prefix[8:]
2431                 query = query.encode('utf-8')
2432                 if prefix == '':
2433                         self._download_n_results(query, 1)
2434                         return
2435                 elif prefix == 'all':
2436                         self._download_n_results(query, self._max_yahoo_results)
2437                         return
2438                 else:
2439                         try:
2440                                 n = long(prefix)
2441                                 if n <= 0:
2442                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2443                                         return
2444                                 elif n > self._max_yahoo_results:
2445                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2446                                         n = self._max_yahoo_results
2447                                 self._download_n_results(query, n)
2448                                 return
2449                         except ValueError: # parsing prefix as integer fails
2450                                 self._download_n_results(query, 1)
2451                                 return
2452
2453         def _download_n_results(self, query, n):
2454                 """Downloads a specified number of results for a query"""
2455
2456                 video_ids = []
2457                 already_seen = set()
2458                 pagenum = 1
2459
2460                 while True:
2461                         self.report_download_page(query, pagenum)
2462                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2463                         request = urllib2.Request(result_url)
2464                         try:
2465                                 page = urllib2.urlopen(request).read()
2466                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2467                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2468                                 return
2469
2470                         # Extract video identifiers
2471                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2472                                 video_id = mobj.group(1)
2473                                 if video_id not in already_seen:
2474                                         video_ids.append(video_id)
2475                                         already_seen.add(video_id)
2476                                         if len(video_ids) == n:
2477                                                 # Specified n videos reached
2478                                                 for id in video_ids:
2479                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2480                                                 return
2481
2482                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2483                                 for id in video_ids:
2484                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2485                                 return
2486
2487                         pagenum = pagenum + 1
2488
2489
2490 class YoutubePlaylistIE(InfoExtractor):
2491         """Information Extractor for YouTube playlists."""
2492
2493         _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2494         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2495         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2496         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2497         _youtube_ie = None
2498         IE_NAME = u'youtube:playlist'
2499
2500         def __init__(self, youtube_ie, downloader=None):
2501                 InfoExtractor.__init__(self, downloader)
2502                 self._youtube_ie = youtube_ie
2503
2504         def report_download_page(self, playlist_id, pagenum):
2505                 """Report attempt to download playlist page with given number."""
2506                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2507
2508         def _real_initialize(self):
2509                 self._youtube_ie.initialize()
2510
2511         def _real_extract(self, url):
2512                 # Extract playlist id
2513                 mobj = re.match(self._VALID_URL, url)
2514                 if mobj is None:
2515                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2516                         return
2517
2518                 # Single video case
2519                 if mobj.group(3) is not None:
2520                         self._youtube_ie.extract(mobj.group(3))
2521                         return
2522
2523                 # Download playlist pages
2524                 # prefix is 'p' as default for playlists but there are other types that need extra care
2525                 playlist_prefix = mobj.group(1)
2526                 if playlist_prefix == 'a':
2527                         playlist_access = 'artist'
2528                 else:
2529                         playlist_prefix = 'p'
2530                         playlist_access = 'view_play_list'
2531                 playlist_id = mobj.group(2)
2532                 video_ids = []
2533                 pagenum = 1
2534
2535                 while True:
2536                         self.report_download_page(playlist_id, pagenum)
2537                         url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2538                         request = urllib2.Request(url)
2539                         try:
2540                                 page = urllib2.urlopen(request).read()
2541                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2542                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2543                                 return
2544
2545                         # Extract video identifiers
2546                         ids_in_page = []
2547                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2548                                 if mobj.group(1) not in ids_in_page:
2549                                         ids_in_page.append(mobj.group(1))
2550                         video_ids.extend(ids_in_page)
2551
2552                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2553                                 break
2554                         pagenum = pagenum + 1
2555
2556                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2557                 playlistend = self._downloader.params.get('playlistend', -1)
2558                 video_ids = video_ids[playliststart:playlistend]
2559
2560                 for id in video_ids:
2561                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2562                 return
2563
2564
2565 class YoutubeUserIE(InfoExtractor):
2566         """Information Extractor for YouTube users."""
2567
2568         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2569         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2570         _GDATA_PAGE_SIZE = 50
2571         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2572         _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2573         _youtube_ie = None
2574         IE_NAME = u'youtube:user'
2575
2576         def __init__(self, youtube_ie, downloader=None):
2577                 InfoExtractor.__init__(self, downloader)
2578                 self._youtube_ie = youtube_ie
2579
2580         def report_download_page(self, username, start_index):
2581                 """Report attempt to download user page."""
2582                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2583                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2584
2585         def _real_initialize(self):
2586                 self._youtube_ie.initialize()
2587
2588         def _real_extract(self, url):
2589                 # Extract username
2590                 mobj = re.match(self._VALID_URL, url)
2591                 if mobj is None:
2592                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2593                         return
2594
2595                 username = mobj.group(1)
2596
2597                 # Download video ids using YouTube Data API. Result size per
2598                 # query is limited (currently to 50 videos) so we need to query
2599                 # page by page until there are no video ids - it means we got
2600                 # all of them.
2601
2602                 video_ids = []
2603                 pagenum = 0
2604
2605                 while True:
2606                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2607                         self.report_download_page(username, start_index)
2608
2609                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2610
2611                         try:
2612                                 page = urllib2.urlopen(request).read()
2613                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2614                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2615                                 return
2616
2617                         # Extract video identifiers
2618                         ids_in_page = []
2619
2620                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2621                                 if mobj.group(1) not in ids_in_page:
2622                                         ids_in_page.append(mobj.group(1))
2623
2624                         video_ids.extend(ids_in_page)
2625
2626                         # A little optimization - if current page is not
2627                         # "full", ie. does not contain PAGE_SIZE video ids then
2628                         # we can assume that this page is the last one - there
2629                         # are no more ids on further pages - no need to query
2630                         # again.
2631
2632                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2633                                 break
2634
2635                         pagenum += 1
2636
2637                 all_ids_count = len(video_ids)
2638                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2639                 playlistend = self._downloader.params.get('playlistend', -1)
2640
2641                 if playlistend == -1:
2642                         video_ids = video_ids[playliststart:]
2643                 else:
2644                         video_ids = video_ids[playliststart:playlistend]
2645
2646                 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2647                                 (username, all_ids_count, len(video_ids)))
2648
2649                 for video_id in video_ids:
2650                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2651
2652
2653 class DepositFilesIE(InfoExtractor):
2654         """Information extractor for depositfiles.com"""
2655
2656         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2657         IE_NAME = u'DepositFiles'
2658
2659         def __init__(self, downloader=None):
2660                 InfoExtractor.__init__(self, downloader)
2661
2662         def report_download_webpage(self, file_id):
2663                 """Report webpage download."""
2664                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2665
2666         def report_extraction(self, file_id):
2667                 """Report information extraction."""
2668                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2669
2670         def _real_extract(self, url):
2671                 # At this point we have a new file
2672                 self._downloader.increment_downloads()
2673
2674                 file_id = url.split('/')[-1]
2675                 # Rebuild url in english locale
2676                 url = 'http://depositfiles.com/en/files/' + file_id
2677
2678                 # Retrieve file webpage with 'Free download' button pressed
2679                 free_download_indication = { 'gateway_result' : '1' }
2680                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2681                 try:
2682                         self.report_download_webpage(file_id)
2683                         webpage = urllib2.urlopen(request).read()
2684                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2685                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2686                         return
2687
2688                 # Search for the real file URL
2689                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2690                 if (mobj is None) or (mobj.group(1) is None):
2691                         # Try to figure out reason of the error.
2692                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2693                         if (mobj is not None) and (mobj.group(1) is not None):
2694                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2695                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2696                         else:
2697                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2698                         return
2699
2700                 file_url = mobj.group(1)
2701                 file_extension = os.path.splitext(file_url)[1][1:]
2702
2703                 # Search for file title
2704                 mobj = re.search(r'<b title="(.*?)">', webpage)
2705                 if mobj is None:
2706                         self._downloader.trouble(u'ERROR: unable to extract title')
2707                         return
2708                 file_title = mobj.group(1).decode('utf-8')
2709
2710                 try:
2711                         # Process file information
2712                         self._downloader.process_info({
2713                                 'id':           file_id.decode('utf-8'),
2714                                 'url':          file_url.decode('utf-8'),
2715                                 'uploader':     u'NA',
2716                                 'upload_date':  u'NA',
2717                                 'title':        file_title,
2718                                 'stitle':       file_title,
2719                                 'ext':          file_extension.decode('utf-8'),
2720                                 'format':       u'NA',
2721                                 'player_url':   None,
2722                         })
2723                 except UnavailableVideoError, err:
2724                         self._downloader.trouble(u'ERROR: unable to download file')
2725
2726
2727 class FacebookIE(InfoExtractor):
2728         """Information Extractor for Facebook"""
2729
2730         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2731         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2732         _NETRC_MACHINE = 'facebook'
2733         _available_formats = ['video', 'highqual', 'lowqual']
2734         _video_extensions = {
2735                 'video': 'mp4',
2736                 'highqual': 'mp4',
2737                 'lowqual': 'mp4',
2738         }
2739         IE_NAME = u'facebook'
2740
2741         def __init__(self, downloader=None):
2742                 InfoExtractor.__init__(self, downloader)
2743
2744         def _reporter(self, message):
2745                 """Add header and report message."""
2746                 self._downloader.to_screen(u'[facebook] %s' % message)
2747
2748         def report_login(self):
2749                 """Report attempt to log in."""
2750                 self._reporter(u'Logging in')
2751
2752         def report_video_webpage_download(self, video_id):
2753                 """Report attempt to download video webpage."""
2754                 self._reporter(u'%s: Downloading video webpage' % video_id)
2755
2756         def report_information_extraction(self, video_id):
2757                 """Report attempt to extract video information."""
2758                 self._reporter(u'%s: Extracting video information' % video_id)
2759
2760         def _parse_page(self, video_webpage):
2761                 """Extract video information from page"""
2762                 # General data
2763                 data = {'title': r'\("video_title", "(.*?)"\)',
2764                         'description': r'<div class="datawrap">(.*?)</div>',
2765                         'owner': r'\("video_owner_name", "(.*?)"\)',
2766                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2767                         }
2768                 video_info = {}
2769                 for piece in data.keys():
2770                         mobj = re.search(data[piece], video_webpage)
2771                         if mobj is not None:
2772                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2773
2774                 # Video urls
2775                 video_urls = {}
2776                 for fmt in self._available_formats:
2777                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2778                         if mobj is not None:
2779                                 # URL is in a Javascript segment inside an escaped Unicode format within
2780                                 # the generally utf-8 page
2781                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2782                 video_info['video_urls'] = video_urls
2783
2784                 return video_info
2785
2786         def _real_initialize(self):
2787                 if self._downloader is None:
2788                         return
2789
2790                 useremail = None
2791                 password = None
2792                 downloader_params = self._downloader.params
2793
2794                 # Attempt to use provided username and password or .netrc data
2795                 if downloader_params.get('username', None) is not None:
2796                         useremail = downloader_params['username']
2797                         password = downloader_params['password']
2798                 elif downloader_params.get('usenetrc', False):
2799                         try:
2800                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2801                                 if info is not None:
2802                                         useremail = info[0]
2803                                         password = info[2]
2804                                 else:
2805                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2806                         except (IOError, netrc.NetrcParseError), err:
2807                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2808                                 return
2809
2810                 if useremail is None:
2811                         return
2812
2813                 # Log in
2814                 login_form = {
2815                         'email': useremail,
2816                         'pass': password,
2817                         'login': 'Log+In'
2818                         }
2819                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2820                 try:
2821                         self.report_login()
2822                         login_results = urllib2.urlopen(request).read()
2823                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2824                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2825                                 return
2826                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2827                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2828                         return
2829
2830         def _real_extract(self, url):
2831                 mobj = re.match(self._VALID_URL, url)
2832                 if mobj is None:
2833                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2834                         return
2835                 video_id = mobj.group('ID')
2836
2837                 # Get video webpage
2838                 self.report_video_webpage_download(video_id)
2839                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2840                 try:
2841                         page = urllib2.urlopen(request)
2842                         video_webpage = page.read()
2843                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2844                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2845                         return
2846
2847                 # Start extracting information
2848                 self.report_information_extraction(video_id)
2849
2850                 # Extract information
2851                 video_info = self._parse_page(video_webpage)
2852
2853                 # uploader
2854                 if 'owner' not in video_info:
2855                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2856                         return
2857                 video_uploader = video_info['owner']
2858
2859                 # title
2860                 if 'title' not in video_info:
2861                         self._downloader.trouble(u'ERROR: unable to extract video title')
2862                         return
2863                 video_title = video_info['title']
2864                 video_title = video_title.decode('utf-8')
2865                 video_title = sanitize_title(video_title)
2866
2867                 simple_title = _simplify_title(video_title)
2868
2869                 # thumbnail image
2870                 if 'thumbnail' not in video_info:
2871                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2872                         video_thumbnail = ''
2873                 else:
2874                         video_thumbnail = video_info['thumbnail']
2875
2876                 # upload date
2877                 upload_date = u'NA'
2878                 if 'upload_date' in video_info:
2879                         upload_time = video_info['upload_date']
2880                         timetuple = email.utils.parsedate_tz(upload_time)
2881                         if timetuple is not None:
2882                                 try:
2883                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2884                                 except:
2885                                         pass
2886
2887                 # description
2888                 video_description = video_info.get('description', 'No description available.')
2889
2890                 url_map = video_info['video_urls']
2891                 if len(url_map.keys()) > 0:
2892                         # Decide which formats to download
2893                         req_format = self._downloader.params.get('format', None)
2894                         format_limit = self._downloader.params.get('format_limit', None)
2895
2896                         if format_limit is not None and format_limit in self._available_formats:
2897                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2898                         else:
2899                                 format_list = self._available_formats
2900                         existing_formats = [x for x in format_list if x in url_map]
2901                         if len(existing_formats) == 0:
2902                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2903                                 return
2904                         if req_format is None:
2905                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2906                         elif req_format == 'worst':
2907                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2908                         elif req_format == '-1':
2909                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2910                         else:
2911                                 # Specific format
2912                                 if req_format not in url_map:
2913                                         self._downloader.trouble(u'ERROR: requested format not available')
2914                                         return
2915                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2916
2917                 for format_param, video_real_url in video_url_list:
2918
2919                         # At this point we have a new video
2920                         self._downloader.increment_downloads()
2921
2922                         # Extension
2923                         video_extension = self._video_extensions.get(format_param, 'mp4')
2924
2925                         try:
2926                                 # Process video information
2927                                 self._downloader.process_info({
2928                                         'id':           video_id.decode('utf-8'),
2929                                         'url':          video_real_url.decode('utf-8'),
2930                                         'uploader':     video_uploader.decode('utf-8'),
2931                                         'upload_date':  upload_date,
2932                                         'title':        video_title,
2933                                         'stitle':       simple_title,
2934                                         'ext':          video_extension.decode('utf-8'),
2935                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2936                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2937                                         'description':  video_description.decode('utf-8'),
2938                                         'player_url':   None,
2939                                 })
2940                         except UnavailableVideoError, err:
2941                                 self._downloader.trouble(u'\nERROR: unable to download video')
2942
2943 class BlipTVIE(InfoExtractor):
2944         """Information extractor for blip.tv"""
2945
2946         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2947         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2948         IE_NAME = u'blip.tv'
2949
2950         def report_extraction(self, file_id):
2951                 """Report information extraction."""
2952                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2953
2954         def report_direct_download(self, title):
2955                 """Report information extraction."""
2956                 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2957
2958         def _real_extract(self, url):
2959                 mobj = re.match(self._VALID_URL, url)
2960                 if mobj is None:
2961                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2962                         return
2963
2964                 if '?' in url:
2965                         cchar = '&'
2966                 else:
2967                         cchar = '?'
2968                 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2969                 request = urllib2.Request(json_url)
2970                 self.report_extraction(mobj.group(1))
2971                 info = None
2972                 try:
2973                         urlh = urllib2.urlopen(request)
2974                         if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2975                                 basename = url.split('/')[-1]
2976                                 title,ext = os.path.splitext(basename)
2977                                 title = title.decode('UTF-8')
2978                                 ext = ext.replace('.', '')
2979                                 self.report_direct_download(title)
2980                                 info = {
2981                                         'id': title,
2982                                         'url': url,
2983                                         'title': title,
2984                                         'stitle': _simplify_title(title),
2985                                         'ext': ext,
2986                                         'urlhandle': urlh
2987                                 }
2988                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2989                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2990                         return
2991                 if info is None: # Regular URL
2992                         try:
2993                                 json_code = urlh.read()
2994                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2995                                 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
2996                                 return
2997
2998                         try:
2999                                 json_data = json.loads(json_code)
3000                                 if 'Post' in json_data:
3001                                         data = json_data['Post']
3002                                 else:
3003                                         data = json_data
3004         
3005                                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
3006                                 video_url = data['media']['url']
3007                                 umobj = re.match(self._URL_EXT, video_url)
3008                                 if umobj is None:
3009                                         raise ValueError('Can not determine filename extension')
3010                                 ext = umobj.group(1)
3011         
3012                                 info = {
3013                                         'id': data['item_id'],
3014                                         'url': video_url,
3015                                         'uploader': data['display_name'],
3016                                         'upload_date': upload_date,
3017                                         'title': data['title'],
3018                                         'stitle': _simplify_title(data['title']),
3019                                         'ext': ext,
3020                                         'format': data['media']['mimeType'],
3021                                         'thumbnail': data['thumbnailUrl'],
3022                                         'description': data['description'],
3023                                         'player_url': data['embedUrl']
3024                                 }
3025                         except (ValueError,KeyError), err:
3026                                 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
3027                                 return
3028
3029                 self._downloader.increment_downloads()
3030
3031                 try:
3032                         self._downloader.process_info(info)
3033                 except UnavailableVideoError, err:
3034                         self._downloader.trouble(u'\nERROR: unable to download video')
3035
3036
3037 class MyVideoIE(InfoExtractor):
3038         """Information Extractor for myvideo.de."""
3039
3040         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3041         IE_NAME = u'myvideo'
3042
3043         def __init__(self, downloader=None):
3044                 InfoExtractor.__init__(self, downloader)
3045         
3046         def report_download_webpage(self, video_id):
3047                 """Report webpage download."""
3048                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3049
3050         def report_extraction(self, video_id):
3051                 """Report information extraction."""
3052                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3053
3054         def _real_extract(self,url):
3055                 mobj = re.match(self._VALID_URL, url)
3056                 if mobj is None:
3057                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
3058                         return
3059
3060                 video_id = mobj.group(1)
3061
3062                 # Get video webpage
3063                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3064                 try:
3065                         self.report_download_webpage(video_id)
3066                         webpage = urllib2.urlopen(request).read()
3067                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3068                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3069                         return
3070
3071                 self.report_extraction(video_id)
3072                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3073                                  webpage)
3074                 if mobj is None:
3075                         self._downloader.trouble(u'ERROR: unable to extract media URL')
3076                         return
3077                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3078
3079                 mobj = re.search('<title>([^<]+)</title>', webpage)
3080                 if mobj is None:
3081                         self._downloader.trouble(u'ERROR: unable to extract title')
3082                         return
3083
3084                 video_title = mobj.group(1)
3085                 video_title = sanitize_title(video_title)
3086
3087                 simple_title = _simplify_title(video_title)
3088
3089                 try:
3090                         self._downloader.process_info({
3091                                 'id':           video_id,
3092                                 'url':          video_url,
3093                                 'uploader':     u'NA',
3094                                 'upload_date':  u'NA',
3095                                 'title':        video_title,
3096                                 'stitle':       simple_title,
3097                                 'ext':          u'flv',
3098                                 'format':       u'NA',
3099                                 'player_url':   None,
3100                         })
3101                 except UnavailableVideoError:
3102                         self._downloader.trouble(u'\nERROR: Unable to download video')
3103
3104 class ComedyCentralIE(InfoExtractor):
3105         """Information extractor for The Daily Show and Colbert Report """
3106
3107         _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3108         IE_NAME = u'comedycentral'
3109
3110         def report_extraction(self, episode_id):
3111                 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3112         
3113         def report_config_download(self, episode_id):
3114                 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3115
3116         def report_index_download(self, episode_id):
3117                 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3118
3119         def report_player_url(self, episode_id):
3120                 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3121
3122         def _real_extract(self, url):
3123                 mobj = re.match(self._VALID_URL, url)
3124                 if mobj is None:
3125                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3126                         return
3127
3128                 if mobj.group('shortname'):
3129                         if mobj.group('shortname') in ('tds', 'thedailyshow'):
3130                                 url = u'http://www.thedailyshow.com/full-episodes/'
3131                         else:
3132                                 url = u'http://www.colbertnation.com/full-episodes/'
3133                         mobj = re.match(self._VALID_URL, url)
3134                         assert mobj is not None
3135
3136                 dlNewest = not mobj.group('episode')
3137                 if dlNewest:
3138                         epTitle = mobj.group('showname')
3139                 else:
3140                         epTitle = mobj.group('episode')
3141
3142                 req = urllib2.Request(url)
3143                 self.report_extraction(epTitle)
3144                 try:
3145                         htmlHandle = urllib2.urlopen(req)
3146                         html = htmlHandle.read()
3147                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3148                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3149                         return
3150                 if dlNewest:
3151                         url = htmlHandle.geturl()
3152                         mobj = re.match(self._VALID_URL, url)
3153                         if mobj is None:
3154                                 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3155                                 return
3156                         if mobj.group('episode') == '':
3157                                 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3158                                 return
3159                         epTitle = mobj.group('episode')
3160
3161                 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
3162                 if len(mMovieParams) == 0:
3163                         self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3164                         return
3165
3166                 playerUrl_raw = mMovieParams[0][0]
3167                 self.report_player_url(epTitle)
3168                 try:
3169                         urlHandle = urllib2.urlopen(playerUrl_raw)
3170                         playerUrl = urlHandle.geturl()
3171                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3172                         self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
3173                         return
3174
3175                 uri = mMovieParams[0][1]
3176                 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3177                 self.report_index_download(epTitle)
3178                 try:
3179                         indexXml = urllib2.urlopen(indexUrl).read()
3180                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3181                         self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
3182                         return
3183
3184                 idoc = xml.etree.ElementTree.fromstring(indexXml)
3185                 itemEls = idoc.findall('.//item')
3186                 for itemEl in itemEls:
3187                         mediaId = itemEl.findall('./guid')[0].text
3188                         shortMediaId = mediaId.split(':')[-1]
3189                         showId = mediaId.split(':')[-2].replace('.com', '')
3190                         officialTitle = itemEl.findall('./title')[0].text
3191                         officialDate = itemEl.findall('./pubDate')[0].text
3192
3193                         configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3194                                                 urllib.urlencode({'uri': mediaId}))
3195                         configReq = urllib2.Request(configUrl)
3196                         self.report_config_download(epTitle)
3197                         try:
3198                                 configXml = urllib2.urlopen(configReq).read()
3199                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3200                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3201                                 return
3202
3203                         cdoc = xml.etree.ElementTree.fromstring(configXml)
3204                         turls = []
3205                         for rendition in cdoc.findall('.//rendition'):
3206                                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3207                                 turls.append(finfo)
3208
3209                         if len(turls) == 0:
3210                                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3211                                 continue
3212
3213                         # For now, just pick the highest bitrate
3214                         format,video_url = turls[-1]
3215
3216                         self._downloader.increment_downloads()
3217
3218                         effTitle = showId + u'-' + epTitle
3219                         info = {
3220                                 'id': shortMediaId,
3221                                 'url': video_url,
3222                                 'uploader': showId,
3223                                 'upload_date': officialDate,
3224                                 'title': effTitle,
3225                                 'stitle': _simplify_title(effTitle),
3226                                 'ext': 'mp4',
3227                                 'format': format,
3228                                 'thumbnail': None,
3229                                 'description': officialTitle,
3230                                 'player_url': playerUrl
3231                         }
3232
3233                         try:
3234                                 self._downloader.process_info(info)
3235                         except UnavailableVideoError, err:
3236                                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
3237                                 continue
3238
3239
3240 class EscapistIE(InfoExtractor):
3241         """Information extractor for The Escapist """
3242
3243         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3244         IE_NAME = u'escapist'
3245
3246         def report_extraction(self, showName):
3247                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3248
3249         def report_config_download(self, showName):
3250                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3251
3252         def _real_extract(self, url):
3253                 htmlParser = HTMLParser.HTMLParser()
3254
3255                 mobj = re.match(self._VALID_URL, url)
3256                 if mobj is None:
3257                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3258                         return
3259                 showName = mobj.group('showname')
3260                 videoId = mobj.group('episode')
3261
3262                 self.report_extraction(showName)
3263                 try:
3264                         webPage = urllib2.urlopen(url).read()
3265                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3266                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3267                         return
3268
3269                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3270                 description = htmlParser.unescape(descMatch.group(1))
3271                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3272                 imgUrl = htmlParser.unescape(imgMatch.group(1))
3273                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3274                 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3275                 configUrlMatch = re.search('config=(.*)$', playerUrl)
3276                 configUrl = urllib2.unquote(configUrlMatch.group(1))
3277
3278                 self.report_config_download(showName)
3279                 try:
3280                         configJSON = urllib2.urlopen(configUrl).read()
3281                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3282                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3283                         return
3284
3285                 # Technically, it's JavaScript, not JSON
3286                 configJSON = configJSON.replace("'", '"')
3287
3288                 try:
3289                         config = json.loads(configJSON)
3290                 except (ValueError,), err:
3291                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3292                         return
3293
3294                 playlist = config['playlist']
3295                 videoUrl = playlist[1]['url']
3296
3297                 self._downloader.increment_downloads()
3298                 info = {
3299                         'id': videoId,
3300                         'url': videoUrl,
3301                         'uploader': showName,
3302                         'upload_date': None,
3303                         'title': showName,
3304                         'stitle': _simplify_title(showName),
3305                         'ext': 'flv',
3306                         'format': 'flv',
3307                         'thumbnail': imgUrl,
3308                         'description': description,
3309                         'player_url': playerUrl,
3310                 }
3311
3312                 try:
3313                         self._downloader.process_info(info)
3314                 except UnavailableVideoError, err:
3315                         self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3316
3317
3318 class CollegeHumorIE(InfoExtractor):
3319         """Information extractor for collegehumor.com"""
3320
3321         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3322         IE_NAME = u'collegehumor'
3323
3324         def report_webpage(self, video_id):
3325                 """Report information extraction."""
3326                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3327
3328         def report_extraction(self, video_id):
3329                 """Report information extraction."""
3330                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3331
3332         def _real_extract(self, url):
3333                 htmlParser = HTMLParser.HTMLParser()
3334
3335                 mobj = re.match(self._VALID_URL, url)
3336                 if mobj is None:
3337                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3338                         return
3339                 video_id = mobj.group('videoid')
3340
3341                 self.report_webpage(video_id)
3342                 request = urllib2.Request(url)
3343                 try:
3344                         webpage = urllib2.urlopen(request).read()
3345                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3346                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3347                         return
3348
3349                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3350                 if m is None:
3351                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3352                         return
3353                 internal_video_id = m.group('internalvideoid')
3354
3355                 info = {
3356                         'id': video_id,
3357                         'internal_id': internal_video_id,
3358                 }
3359
3360                 self.report_extraction(video_id)
3361                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3362                 try:
3363                         metaXml = urllib2.urlopen(xmlUrl).read()
3364                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3365                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3366                         return
3367
3368                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3369                 try:
3370                         videoNode = mdoc.findall('./video')[0]
3371                         info['description'] = videoNode.findall('./description')[0].text
3372                         info['title'] = videoNode.findall('./caption')[0].text
3373                         info['stitle'] = _simplify_title(info['title'])
3374                         info['url'] = videoNode.findall('./file')[0].text
3375                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3376                         info['ext'] = info['url'].rpartition('.')[2]
3377                         info['format'] = info['ext']
3378                 except IndexError:
3379                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3380                         return
3381
3382                 self._downloader.increment_downloads()
3383
3384                 try:
3385                         self._downloader.process_info(info)
3386                 except UnavailableVideoError, err:
3387                         self._downloader.trouble(u'\nERROR: unable to download video')
3388
3389
3390 class XVideosIE(InfoExtractor):
3391         """Information extractor for xvideos.com"""
3392
3393         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3394         IE_NAME = u'xvideos'
3395
3396         def report_webpage(self, video_id):
3397                 """Report information extraction."""
3398                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3399
3400         def report_extraction(self, video_id):
3401                 """Report information extraction."""
3402                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3403
3404         def _real_extract(self, url):
3405                 htmlParser = HTMLParser.HTMLParser()
3406
3407                 mobj = re.match(self._VALID_URL, url)
3408                 if mobj is None:
3409                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3410                         return
3411                 video_id = mobj.group(1).decode('utf-8')
3412
3413                 self.report_webpage(video_id)
3414
3415                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3416                 try:
3417                         webpage = urllib2.urlopen(request).read()
3418                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3419                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3420                         return
3421
3422                 self.report_extraction(video_id)
3423
3424
3425                 # Extract video URL
3426                 mobj = re.search(r'flv_url=(.+?)&', webpage)
3427                 if mobj is None:
3428                         self._downloader.trouble(u'ERROR: unable to extract video url')
3429                         return
3430                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3431
3432
3433                 # Extract title
3434                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3435                 if mobj is None:
3436                         self._downloader.trouble(u'ERROR: unable to extract video title')
3437                         return
3438                 video_title = mobj.group(1).decode('utf-8')
3439
3440
3441                 # Extract video thumbnail
3442                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3443                 if mobj is None:
3444                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3445                         return
3446                 video_thumbnail = mobj.group(1).decode('utf-8')
3447
3448
3449
3450                 self._downloader.increment_downloads()
3451                 info = {
3452                         'id': video_id,
3453                         'url': video_url,
3454                         'uploader': None,
3455                         'upload_date': None,
3456                         'title': video_title,
3457                         'stitle': _simplify_title(video_title),
3458                         'ext': 'flv',
3459                         'format': 'flv',
3460                         'thumbnail': video_thumbnail,
3461                         'description': None,
3462                         'player_url': None,
3463                 }
3464
3465                 try:
3466                         self._downloader.process_info(info)
3467                 except UnavailableVideoError, err:
3468                         self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3469
3470
3471 class SoundcloudIE(InfoExtractor):
3472         """Information extractor for soundcloud.com
3473            To access the media, the uid of the song and a stream token
3474            must be extracted from the page source and the script must make
3475            a request to media.soundcloud.com/crossdomain.xml. Then
3476            the media can be grabbed by requesting from an url composed
3477            of the stream token and uid
3478          """
3479
3480         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3481         IE_NAME = u'soundcloud'
3482
3483         def __init__(self, downloader=None):
3484                 InfoExtractor.__init__(self, downloader)
3485
3486         def report_webpage(self, video_id):
3487                 """Report information extraction."""
3488                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3489
3490         def report_extraction(self, video_id):
3491                 """Report information extraction."""
3492                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3493
3494         def _real_extract(self, url):
3495                 htmlParser = HTMLParser.HTMLParser()
3496
3497                 mobj = re.match(self._VALID_URL, url)
3498                 if mobj is None:
3499                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3500                         return
3501
3502                 # extract uploader (which is in the url)
3503                 uploader = mobj.group(1).decode('utf-8')
3504                 # extract simple title (uploader + slug of song title)
3505                 slug_title =  mobj.group(2).decode('utf-8')
3506                 simple_title = uploader + '-' + slug_title
3507
3508                 self.report_webpage('%s/%s' % (uploader, slug_title))
3509
3510                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3511                 try:
3512                         webpage = urllib2.urlopen(request).read()
3513                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3514                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3515                         return
3516
3517                 self.report_extraction('%s/%s' % (uploader, slug_title))
3518
3519                 # extract uid and stream token that soundcloud hands out for access
3520                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3521                 if mobj:
3522                         video_id = mobj.group(1)
3523                         stream_token = mobj.group(2)
3524
3525                 # extract unsimplified title
3526                 mobj = re.search('"title":"(.*?)",', webpage)
3527                 if mobj:
3528                         title = mobj.group(1)
3529
3530                 # construct media url (with uid/token)
3531                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3532                 mediaURL = mediaURL % (video_id, stream_token)
3533
3534                 # description
3535                 description = u'No description available'
3536                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3537                 if mobj:
3538                         description = mobj.group(1)
3539                 
3540                 # upload date
3541                 upload_date = None
3542                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3543                 if mobj:
3544                         try:
3545                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3546                         except Exception, e:
3547                                 print str(e)
3548
3549                 # for soundcloud, a request to a cross domain is required for cookies
3550                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3551
3552                 try:
3553                         self._downloader.process_info({
3554                                 'id':           video_id.decode('utf-8'),
3555                                 'url':          mediaURL,
3556                                 'uploader':     uploader.decode('utf-8'),
3557                                 'upload_date':  upload_date,
3558                                 'title':        simple_title.decode('utf-8'),
3559                                 'stitle':       simple_title.decode('utf-8'),
3560                                 'ext':          u'mp3',
3561                                 'format':       u'NA',
3562                                 'player_url':   None,
3563                                 'description': description.decode('utf-8')
3564                         })
3565                 except UnavailableVideoError:
3566                         self._downloader.trouble(u'\nERROR: unable to download video')
3567
3568
3569 class InfoQIE(InfoExtractor):
3570         """Information extractor for infoq.com"""
3571
3572         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3573         IE_NAME = u'infoq'
3574
3575         def report_webpage(self, video_id):
3576                 """Report information extraction."""
3577                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3578
3579         def report_extraction(self, video_id):
3580                 """Report information extraction."""
3581                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3582
3583         def _real_extract(self, url):
3584                 htmlParser = HTMLParser.HTMLParser()
3585
3586                 mobj = re.match(self._VALID_URL, url)
3587                 if mobj is None:
3588                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3589                         return
3590
3591                 self.report_webpage(url)
3592
3593                 request = urllib2.Request(url)
3594                 try:
3595                         webpage = urllib2.urlopen(request).read()
3596                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3597                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3598                         return
3599
3600                 self.report_extraction(url)
3601
3602
3603                 # Extract video URL
3604                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3605                 if mobj is None:
3606                         self._downloader.trouble(u'ERROR: unable to extract video url')
3607                         return
3608                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3609
3610
3611                 # Extract title
3612                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3613                 if mobj is None:
3614                         self._downloader.trouble(u'ERROR: unable to extract video title')
3615                         return
3616                 video_title = mobj.group(1).decode('utf-8')
3617
3618                 # Extract description
3619                 video_description = u'No description available.'
3620                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3621                 if mobj is not None:
3622                         video_description = mobj.group(1).decode('utf-8')
3623
3624                 video_filename = video_url.split('/')[-1]
3625                 video_id, extension = video_filename.split('.')
3626
3627                 self._downloader.increment_downloads()
3628                 info = {
3629                         'id': video_id,
3630                         'url': video_url,
3631                         'uploader': None,
3632                         'upload_date': None,
3633                         'title': video_title,
3634                         'stitle': _simplify_title(video_title),
3635                         'ext': extension,
3636                         'format': extension, # Extension is always(?) mp4, but seems to be flv
3637                         'thumbnail': None,
3638                         'description': video_description,
3639                         'player_url': None,
3640                 }
3641
3642                 try:
3643                         self._downloader.process_info(info)
3644                 except UnavailableVideoError, err:
3645                         self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3646
3647 class MixcloudIE(InfoExtractor):
3648         """Information extractor for www.mixcloud.com"""
3649         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3650         IE_NAME = u'mixcloud'
3651
3652         def __init__(self, downloader=None):
3653                 InfoExtractor.__init__(self, downloader)
3654
3655         def report_download_json(self, file_id):
3656                 """Report JSON download."""
3657                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3658
3659         def report_extraction(self, file_id):
3660                 """Report information extraction."""
3661                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3662
3663         def get_urls(self, jsonData, fmt, bitrate='best'):
3664                 """Get urls from 'audio_formats' section in json"""
3665                 file_url = None
3666                 try:
3667                         bitrate_list = jsonData[fmt]
3668                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3669                                 bitrate = max(bitrate_list) # select highest
3670
3671                         url_list = jsonData[fmt][bitrate]
3672                 except TypeError: # we have no bitrate info.
3673                         url_list = jsonData[fmt]
3674                                 
3675                 return url_list
3676
3677         def check_urls(self, url_list):
3678                 """Returns 1st active url from list"""
3679                 for url in url_list:
3680                         try:
3681                                 urllib2.urlopen(url)
3682                                 return url
3683                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3684                                 url = None
3685
3686                 return None
3687
3688         def _print_formats(self, formats):
3689                 print 'Available formats:'
3690                 for fmt in formats.keys():
3691                         for b in formats[fmt]:
3692                                 try:
3693                                         ext = formats[fmt][b][0]
3694                                         print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3695                                 except TypeError: # we have no bitrate info
3696                                         ext = formats[fmt][0]
3697                                         print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3698                                         break
3699
3700         def _real_extract(self, url):
3701                 mobj = re.match(self._VALID_URL, url)
3702                 if mobj is None:
3703                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3704                         return
3705                 # extract uploader & filename from url
3706                 uploader = mobj.group(1).decode('utf-8')
3707                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3708
3709                 # construct API request
3710                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3711                 # retrieve .json file with links to files
3712                 request = urllib2.Request(file_url)
3713                 try:
3714                         self.report_download_json(file_url)
3715                         jsonData = urllib2.urlopen(request).read()
3716                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3717                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3718                         return
3719
3720                 # parse JSON
3721                 json_data = json.loads(jsonData)
3722                 player_url = json_data['player_swf_url']
3723                 formats = dict(json_data['audio_formats'])
3724
3725                 req_format = self._downloader.params.get('format', None)
3726                 bitrate = None
3727
3728                 if self._downloader.params.get('listformats', None):
3729                         self._print_formats(formats)
3730                         return
3731
3732                 if req_format is None or req_format == 'best':
3733                         for format_param in formats.keys():
3734                                 url_list = self.get_urls(formats, format_param)
3735                                 # check urls
3736                                 file_url = self.check_urls(url_list)
3737                                 if file_url is not None:
3738                                         break # got it!
3739                 else:
3740                         if req_format not in formats.keys():
3741                                 self._downloader.trouble(u'ERROR: format is not available')
3742                                 return
3743
3744                         url_list = self.get_urls(formats, req_format)
3745                         file_url = self.check_urls(url_list)
3746                         format_param = req_format
3747
3748                 # We have audio
3749                 self._downloader.increment_downloads()
3750                 try:
3751                         # Process file information
3752                         self._downloader.process_info({
3753                                 'id':           file_id.decode('utf-8'),
3754                                 'url':          file_url.decode('utf-8'),
3755                                 'uploader':     uploader.decode('utf-8'),
3756                                 'upload_date':  u'NA',
3757                                 'title':        json_data['name'],
3758                                 'stitle':       _simplify_title(json_data['name']),
3759                                 'ext':          file_url.split('.')[-1].decode('utf-8'),
3760                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
3761                                 'thumbnail':    json_data['thumbnail_url'],
3762                                 'description':  json_data['description'],
3763                                 'player_url':   player_url.decode('utf-8'),
3764                         })
3765                 except UnavailableVideoError, err:
3766                         self._downloader.trouble(u'ERROR: unable to download file')
3767
3768 class StanfordOpenClassroomIE(InfoExtractor):
3769         """Information extractor for Stanford's Open ClassRoom"""
3770
3771         _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3772         IE_NAME = u'stanfordoc'
3773
3774         def report_download_webpage(self, objid):
3775                 """Report information extraction."""
3776                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3777
3778         def report_extraction(self, video_id):
3779                 """Report information extraction."""
3780                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3781
3782         def _real_extract(self, url):
3783                 mobj = re.match(self._VALID_URL, url)
3784                 if mobj is None:
3785                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3786                         return
3787
3788                 if mobj.group('course') and mobj.group('video'): # A specific video
3789                         course = mobj.group('course')
3790                         video = mobj.group('video')
3791                         info = {
3792                                 'id': _simplify_title(course + '_' + video),
3793                         }
3794         
3795                         self.report_extraction(info['id'])
3796                         baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3797                         xmlUrl = baseUrl + video + '.xml'
3798                         try:
3799                                 metaXml = urllib2.urlopen(xmlUrl).read()
3800                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3801                                 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
3802                                 return
3803                         mdoc = xml.etree.ElementTree.fromstring(metaXml)
3804                         try:
3805                                 info['title'] = mdoc.findall('./title')[0].text
3806                                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3807                         except IndexError:
3808                                 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3809                                 return
3810                         info['stitle'] = _simplify_title(info['title'])
3811                         info['ext'] = info['url'].rpartition('.')[2]
3812                         info['format'] = info['ext']
3813                         self._downloader.increment_downloads()
3814                         try:
3815                                 self._downloader.process_info(info)
3816                         except UnavailableVideoError, err:
3817                                 self._downloader.trouble(u'\nERROR: unable to download video')
3818                 elif mobj.group('course'): # A course page
3819                         unescapeHTML = HTMLParser.HTMLParser().unescape
3820
3821                         course = mobj.group('course')
3822                         info = {
3823                                 'id': _simplify_title(course),
3824                                 'type': 'playlist',
3825                         }
3826
3827                         self.report_download_webpage(info['id'])
3828                         try:
3829                                 coursepage = urllib2.urlopen(url).read()
3830                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3831                                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3832                                 return
3833
3834                         m = re.search('<h1>([^<]+)</h1>', coursepage)
3835                         if m:
3836                                 info['title'] = unescapeHTML(m.group(1))
3837                         else:
3838                                 info['title'] = info['id']
3839                         info['stitle'] = _simplify_title(info['title'])
3840
3841                         m = re.search('<description>([^<]+)</description>', coursepage)
3842                         if m:
3843                                 info['description'] = unescapeHTML(m.group(1))
3844
3845                         links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3846                         info['list'] = [
3847                                 {
3848                                         'type': 'reference',
3849                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3850                                 }
3851                                         for vpage in links]
3852
3853                         for entry in info['list']:
3854                                 assert entry['type'] == 'reference'
3855                                 self.extract(entry['url'])
3856                 else: # Root page
3857                         unescapeHTML = HTMLParser.HTMLParser().unescape
3858
3859                         info = {
3860                                 'id': 'Stanford OpenClassroom',
3861                                 'type': 'playlist',
3862                         }
3863
3864                         self.report_download_webpage(info['id'])
3865                         rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3866                         try:
3867                                 rootpage = urllib2.urlopen(rootURL).read()
3868                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3869                                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3870                                 return
3871
3872                         info['title'] = info['id']
3873                         info['stitle'] = _simplify_title(info['title'])
3874
3875                         links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3876                         info['list'] = [
3877                                 {
3878                                         'type': 'reference',
3879                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3880                                 }
3881                                         for cpage in links]
3882
3883                         for entry in info['list']:
3884                                 assert entry['type'] == 'reference'
3885                                 self.extract(entry['url'])
3886
3887
class PostProcessor(object):
        """Base class for post processors.

        A downloader keeps a chain of PostProcessor objects, registered
        through its add_post_processor() method. After a successful
        download it calls run() on each of them in turn: the first one
        receives the initial argument, every later one receives whatever
        the previous one returned.

        The chain ends when its last element has run, or earlier if any
        run() call returns None.

        Like InfoExtractor objects, PostProcessor objects take part in a
        "mutual registration" process with their downloader.
        """

        # Downloader this post processor is attached to, if any.
        _downloader = None

        def __init__(self, downloader=None):
                self._downloader = downloader

        def set_downloader(self, downloader):
                """Attach this post processor to the given downloader."""
                self._downloader = downloader

        def run(self, information):
                """Process one downloaded file.

                "information" is a dictionary like the ones built by
                InfoExtractors, with one extra field, "filepath", that
                points to the downloaded file.

                Returning None stops the postprocessing chain. Returning an
                information dictionary (possibly the received one with some
                fields changed) passes it on to the next post processor.

                A PostProcessingError may be raised; the calling downloader
                takes it into account.

                This base implementation changes nothing and returns the
                dictionary it was given.
                """
                return information
3933
class AudioConversionError(Exception):
        """Error raised when the audio conversion with ffmpeg fails.

        Derives from Exception (not BaseException) so that it is treated as
        an ordinary error. The reason is stored in the "message" attribute
        and also handed to the base class, so str() and args carry it too.
        """

        def __init__(self, message):
                Exception.__init__(self, message)
                self.message = message
3937
class FFmpegExtractAudioPP(PostProcessor):
        """Post processor that extracts the audio of a downloaded file with ffmpeg.

        The audio codec of the file is detected with ffprobe. Depending on the
        preferred codec the stream is either copied or re-encoded into a new
        file next to the original, and the original is removed unless
        keepvideo is set.
        """

        def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
                PostProcessor.__init__(self, downloader)
                if preferredcodec is None:
                        preferredcodec = 'best'
                self._preferredcodec = preferredcodec
                self._preferredquality = preferredquality
                self._keepvideo = keepvideo

        @staticmethod
        def get_audio_codec(path):
                """Return the audio codec name reported by ffprobe for path.

                None is returned when ffprobe cannot be run, exits with a
                non-zero status, or reports no audio stream.
                """
                cmd = ['ffprobe', '-show_streams', '--', path]
                try:
                        # ffprobe's stderr is discarded; close the null device
                        # again once the process has finished.
                        devnull = open(os.path.devnull, 'w')
                        try:
                                handle = subprocess.Popen(cmd, stderr=devnull, stdout=subprocess.PIPE)
                                output = handle.communicate()[0]
                        finally:
                                devnull.close()
                        if handle.wait() != 0:
                                return None
                except (IOError, OSError):
                        return None
                # Remember the most recent codec_name and return it as soon as
                # a codec_type=audio line follows.
                audio_codec = None
                for line in output.split('\n'):
                        if line.startswith('codec_name='):
                                audio_codec = line.split('=')[1].strip()
                        elif line.strip() == 'codec_type=audio' and audio_codec is not None:
                                return audio_codec
                return None

        @staticmethod
        def run_ffmpeg(path, out_path, codec, more_opts):
                """Run ffmpeg to write the audio of path to out_path.

                codec is passed as -acodec unless it is None; more_opts is a
                list of further command line options.

                Raises AudioConversionError when ffmpeg is not found or exits
                with a non-zero status (the message is then the last line of
                its stderr output). Other IOError/OSError are re-raised.
                """
                if codec is None:
                        acodec_opts = []
                else:
                        acodec_opts = ['-acodec', codec]
                cmd = ['ffmpeg', '-y', '-i', path, '-vn'] + acodec_opts + more_opts + ['--', out_path]
                try:
                        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                        stdout,stderr = p.communicate()
                except (IOError, OSError):
                        e = sys.exc_info()[1]
                        # errno 2 is ENOENT: the ffmpeg executable does not exist
                        if isinstance(e, OSError) and e.errno == 2:
                                raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
                        # Re-raise unchanged, keeping the original traceback.
                        raise
                if p.returncode != 0:
                        msg = stderr.strip().split('\n')[-1]
                        raise AudioConversionError(msg)

        def _conversion_settings(self, filecodec):
                """Return (acodec, extension, more_opts) for a file with audio codec filecodec."""
                preferred = self._preferredcodec

                if preferred == 'm4a' and filecodec == 'aac':
                        # Lossless, but in another container
                        return 'copy', preferred, ['-absf', 'aac_adtstoasc']

                if preferred == 'best' or preferred == filecodec:
                        if filecodec in ['aac', 'mp3', 'vorbis']:
                                # Lossless if possible
                                extension = filecodec
                                more_opts = []
                                if filecodec == 'aac':
                                        more_opts = ['-f', 'adts']
                                if filecodec == 'vorbis':
                                        extension = 'ogg'
                                return 'copy', extension, more_opts
                        # MP3 otherwise.
                        more_opts = []
                        if self._preferredquality is not None:
                                more_opts += ['-ab', self._preferredquality]
                        return 'libmp3lame', 'mp3', more_opts

                # We convert the audio (lossy)
                acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[preferred]
                extension = preferred
                more_opts = []
                if self._preferredquality is not None:
                        more_opts += ['-ab', self._preferredquality]
                if preferred == 'aac':
                        more_opts += ['-f', 'adts']
                if preferred == 'm4a':
                        more_opts += ['-absf', 'aac_adtstoasc']
                if preferred == 'vorbis':
                        extension = 'ogg'
                if preferred == 'wav':
                        extension = 'wav'
                        more_opts += ['-f', 'wav']
                return acodec, extension, more_opts

        def run(self, information):
                """Extract the audio of information['filepath'].

                Returns the information dictionary with 'filepath' pointing to
                the new audio file, or None (which stops the postprocessing
                chain) when the codec cannot be determined, the conversion
                fails or the original file cannot be removed.
                """
                path = information['filepath']

                filecodec = self.get_audio_codec(path)
                if filecodec is None:
                        self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
                        return None

                acodec, extension, more_opts = self._conversion_settings(filecodec)

                (prefix, ext) = os.path.splitext(path)
                new_path = prefix + '.' + extension
                self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
                try:
                        self.run_ffmpeg(path, new_path, acodec, more_opts)
                except AudioConversionError:
                        e = sys.exc_info()[1]
                        self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
                        return None
                except Exception:
                        # Any other error; interrupts and exit requests are not
                        # swallowed here.
                        self._downloader.to_stderr(u'ERROR: error running ffmpeg')
                        return None

                # Try to update the date time for extracted audio file.
                if information.get('filetime') is not None:
                        try:
                                os.utime(new_path, (time.time(), information['filetime']))
                        except Exception:
                                self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

                if not self._keepvideo:
                        try:
                                os.remove(path)
                        except (IOError, OSError):
                                self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
                                return None

                information['filepath'] = new_path
                return information
4062
4063
def updateSelf(downloader, filename):
        ''' Update the program file with the latest version from the repository

        Downloads UPDATE_URL and overwrites filename with its content, unless
        the downloaded text carries the same __version__ as the running
        program. downloader is only used to print progress messages.

        Exits through sys.exit() with an error message when filename is not
        writable, the download fails or the file cannot be overwritten.
        '''
        if not os.access(filename, os.W_OK):
                sys.exit('ERROR: no write permissions on %s' % filename)

        downloader.to_screen('Updating to latest version...')

        try:
                # Open the connection before entering try/finally: if opening
                # fails there is no handle to close, and the IOError must
                # reach the handler below instead of being replaced by an
                # error about the unbound name.
                urlh = urllib.urlopen(UPDATE_URL)
                try:
                        newcontent = urlh.read()
                finally:
                        urlh.close()
        except (IOError, OSError):
                sys.exit('ERROR: unable to download latest version')

        vmatch = re.search("__version__ = '([^']+)'", newcontent)
        if vmatch is not None and vmatch.group(1) == __version__:
                downloader.to_screen('youtube-dl is up-to-date (' + __version__ + ')')
                return

        try:
                outf = open(filename, 'wb')
                try:
                        outf.write(newcontent)
                finally:
                        outf.close()
        except (IOError, OSError):
                sys.exit('ERROR: unable to overwrite current version')

        downloader.to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.')
4096
4097 def parseOpts():
4098         # Deferred imports
4099         import getpass
4100         import optparse
4101         import shlex
4102
4103         def _readOptions(filename):
4104                 try:
4105                         optionf = open(filename)
4106                 except IOError:
4107                         return [] # silently skip if file is not present
4108                 try:
4109                         res = []
4110                         for l in optionf:
4111                                 res += shlex.split(l, comments=True)
4112                 finally:
4113                         optionf.close()
4114                 return res
4115
4116         def _format_option_string(option):
4117                 ''' ('-o', '--option') -> -o, --format METAVAR'''
4118
4119                 opts = []
4120
4121                 if option._short_opts: opts.append(option._short_opts[0])
4122                 if option._long_opts: opts.append(option._long_opts[0])
4123                 if len(opts) > 1: opts.insert(1, ', ')
4124
4125                 if option.takes_value(): opts.append(' %s' % option.metavar)
4126
4127                 return "".join(opts)
4128
4129         def _find_term_columns():
4130                 columns = os.environ.get('COLUMNS', None)
4131                 if columns:
4132                         return int(columns)
4133
4134                 try:
4135                         sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4136                         out,err = sp.communicate()
4137                         return int(out.split()[1])
4138                 except:
4139                         pass
4140                 return None
4141
4142         max_width = 80
4143         max_help_position = 80
4144
4145         # No need to wrap help messages if we're on a wide console
4146         columns = _find_term_columns()
4147         if columns: max_width = columns
4148
4149         fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
4150         fmt.format_option_strings = _format_option_string
4151
4152         kw = {
4153                 'version'   : __version__,
4154                 'formatter' : fmt,
4155                 'usage' : '%prog [options] url [url...]',
4156                 'conflict_handler' : 'resolve',
4157         }
4158
4159         parser = optparse.OptionParser(**kw)
4160
4161         # option groups
4162         general        = optparse.OptionGroup(parser, 'General Options')
4163         selection      = optparse.OptionGroup(parser, 'Video Selection')
4164         authentication = optparse.OptionGroup(parser, 'Authentication Options')
4165         video_format   = optparse.OptionGroup(parser, 'Video Format Options')
4166         postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
4167         filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
4168         verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
4169
4170         general.add_option('-h', '--help',
4171                         action='help', help='print this help text and exit')
4172         general.add_option('-v', '--version',
4173                         action='version', help='print program version and exit')
4174         general.add_option('-U', '--update',
4175                         action='store_true', dest='update_self', help='update this program to latest version')
4176         general.add_option('-i', '--ignore-errors',
4177                         action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
4178         general.add_option('-r', '--rate-limit',
4179                         dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
4180         general.add_option('-R', '--retries',
4181                         dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
4182         general.add_option('--dump-user-agent',
4183                         action='store_true', dest='dump_user_agent',
4184                         help='display the current browser identification', default=False)
4185         general.add_option('--list-extractors',
4186                         action='store_true', dest='list_extractors',
4187                         help='List all supported extractors and the URLs they would handle', default=False)
4188
4189         selection.add_option('--playlist-start',
4190                         dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
4191         selection.add_option('--playlist-end',
4192                         dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
4193         selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
4194         selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
4195         selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)
4196
4197         authentication.add_option('-u', '--username',
4198                         dest='username', metavar='USERNAME', help='account username')
4199         authentication.add_option('-p', '--password',
4200                         dest='password', metavar='PASSWORD', help='account password')
4201         authentication.add_option('-n', '--netrc',
4202                         action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
4203
4204
4205         video_format.add_option('-f', '--format',
4206                         action='store', dest='format', metavar='FORMAT', help='video format code')
4207         video_format.add_option('--all-formats',
4208                         action='store_const', dest='format', help='download all available video formats', const='all')
4209         video_format.add_option('--prefer-free-formats',
4210                         action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
4211         video_format.add_option('--max-quality',
4212                         action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
4213         video_format.add_option('-F', '--list-formats',
4214                         action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
4215
4216
4217         verbosity.add_option('-q', '--quiet',
4218                         action='store_true', dest='quiet', help='activates quiet mode', default=False)
4219         verbosity.add_option('-s', '--simulate',
4220                         action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
4221         verbosity.add_option('--skip-download',
4222                         action='store_true', dest='skip_download', help='do not download the video', default=False)
4223         verbosity.add_option('-g', '--get-url',
4224                         action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
4225         verbosity.add_option('-e', '--get-title',
4226                         action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
4227         verbosity.add_option('--get-thumbnail',
4228                         action='store_true', dest='getthumbnail',
4229                         help='simulate, quiet but print thumbnail URL', default=False)
4230         verbosity.add_option('--get-description',
4231                         action='store_true', dest='getdescription',
4232                         help='simulate, quiet but print video description', default=False)
4233         verbosity.add_option('--get-filename',
4234                         action='store_true', dest='getfilename',
4235                         help='simulate, quiet but print output filename', default=False)
4236         verbosity.add_option('--get-format',
4237                         action='store_true', dest='getformat',
4238                         help='simulate, quiet but print output format', default=False)
4239         verbosity.add_option('--no-progress',
4240                         action='store_true', dest='noprogress', help='do not print progress bar', default=False)
4241         verbosity.add_option('--console-title',
4242                         action='store_true', dest='consoletitle',
4243                         help='display progress in console titlebar', default=False)
4244
4245
4246         filesystem.add_option('-t', '--title',
4247                         action='store_true', dest='usetitle', help='use title in file name', default=False)
4248         filesystem.add_option('-l', '--literal',
4249                         action='store_true', dest='useliteral', help='use literal title in file name', default=False)
4250         filesystem.add_option('-A', '--auto-number',
4251                         action='store_true', dest='autonumber',
4252                         help='number downloaded files starting from 00000', default=False)
4253         filesystem.add_option('-o', '--output',
4254                         dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
4255         filesystem.add_option('-a', '--batch-file',
4256                         dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
4257         filesystem.add_option('-w', '--no-overwrites',
4258                         action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
4259         filesystem.add_option('-c', '--continue',
4260                         action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
4261         filesystem.add_option('--no-continue',
4262                         action='store_false', dest='continue_dl',
4263                         help='do not resume partially downloaded files (restart from beginning)')
4264         filesystem.add_option('--cookies',
4265                         dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
4266         filesystem.add_option('--no-part',
4267                         action='store_true', dest='nopart', help='do not use .part files', default=False)
4268         filesystem.add_option('--no-mtime',
4269                         action='store_false', dest='updatetime',
4270                         help='do not use the Last-modified header to set the file modification time', default=True)
4271         filesystem.add_option('--write-description',
4272                         action='store_true', dest='writedescription',
4273                         help='write video description to a .description file', default=False)
4274         filesystem.add_option('--write-info-json',
4275                         action='store_true', dest='writeinfojson',
4276                         help='write video metadata to a .info.json file', default=False)
4277
4278
4279         postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
4280                         help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
4281         postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
4282                         help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
4283         postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
4284                         help='ffmpeg audio bitrate specification, 128k by default')
4285         postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
4286                         help='keeps the video file on disk after the post-processing; the video is erased by default')
4287
4288
4289         parser.add_option_group(general)
4290         parser.add_option_group(selection)
4291         parser.add_option_group(filesystem)
4292         parser.add_option_group(verbosity)
4293         parser.add_option_group(video_format)
4294         parser.add_option_group(authentication)
4295         parser.add_option_group(postproc)
4296
4297         xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
4298         if xdg_config_home:
4299                 userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
4300         else:
4301                 userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
4302         argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
4303         opts, args = parser.parse_args(argv)
4304
4305         return parser, opts, args
4306
def gen_extractors():
        """Instantiate every supported extractor and return them as a list.

        The position in the list is significant: a URL is handled by the
        first extractor in the list that matches it.
        """
        # Base extractors that are also handed to other extractors' constructors.
        youtube = YoutubeIE()
        google = GoogleIE()
        yahoo = YahooIE()

        extractors = []
        add = extractors.append

        # Extractors built on top of the YouTube extractor, and the YouTube
        # extractor itself.
        add(YoutubePlaylistIE(youtube))
        add(YoutubeUserIE(youtube))
        add(YoutubeSearchIE(youtube))
        add(youtube)
        add(MetacafeIE(youtube))

        add(DailymotionIE())

        add(google)
        add(GoogleSearchIE(google))

        add(PhotobucketIE())

        add(yahoo)
        add(YahooSearchIE(yahoo))

        # Extractors that are constructed without arguments.
        add(DepositFilesIE())
        add(FacebookIE())
        add(BlipTVIE())
        add(VimeoIE())
        add(MyVideoIE())
        add(ComedyCentralIE())
        add(EscapistIE())
        add(CollegeHumorIE())
        add(XVideosIE())
        add(SoundcloudIE())
        add(InfoQIE())
        add(MixcloudIE())
        add(StanfordOpenClassroomIE())

        # GenericIE goes at the very end of the list.
        add(GenericIE())

        return extractors
4342
def _real_main():
        """Parse the command line, configure a FileDownloader and run the downloads.

        Steps, in order: load the cookie jar, handle --dump-user-agent, read
        the batch file, install the urllib2 opener, handle --list-extractors,
        validate the options, build the FileDownloader, optionally self-update,
        download every URL and finally save the cookie jar.

        The function does not return normally: it ends with sys.exit(), using
        the value returned by FileDownloader.download() as exit status, or 101
        when MaxDownloadsReached is raised. Invalid option combinations are
        reported through parser.error().
        """
        parser, opts, args = parseOpts()

        # Open appropriate CookieJar
        if opts.cookiefile is None:
                jar = cookielib.CookieJar()
        else:
                try:
                        jar = cookielib.MozillaCookieJar(opts.cookiefile)
                        # Only load when the file exists and is readable; otherwise
                        # the jar starts out empty.
                        if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
                                jar.load()
                except (IOError, OSError):
                        sys.exit(u'ERROR: unable to open cookie file')

        # Dump user agent
        if opts.dump_user_agent:
                print(std_headers['User-Agent'])
                sys.exit(0)

        # Batch file verification
        batchurls = []
        if opts.batchfile is not None:
                try:
                        if opts.batchfile == '-':
                                # Standard input is read but deliberately not closed.
                                batchurls = sys.stdin.readlines()
                        else:
                                batchfd = open(opts.batchfile, 'r')
                                try:
                                        batchurls = batchfd.readlines()
                                finally:
                                        # Release the file handle even if reading fails.
                                        batchfd.close()
                        batchurls = [x.strip() for x in batchurls]
                        # Drop empty lines and lines starting with '#', '/' or ';'.
                        batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
                except IOError:
                        sys.exit(u'ERROR: batch file could not be read')
        all_urls = batchurls + args

        # General configuration
        cookie_processor = urllib2.HTTPCookieProcessor(jar)
        opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
        urllib2.install_opener(opener)
        socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

        extractors = gen_extractors()

        if opts.list_extractors:
                for ie in extractors:
                        print(ie.IE_NAME)
                        # Each URL is listed under the first extractor that accepts it
                        # and then removed from the pool for the following extractors.
                        matchedUrls = [url for url in all_urls if ie.suitable(url)]
                        all_urls = [url for url in all_urls if url not in matchedUrls]
                        for mu in matchedUrls:
                                print(u'  ' + mu)
                sys.exit(0)

        # Conflicting, missing and erroneous options
        if opts.usenetrc and (opts.username is not None or opts.password is not None):
                parser.error(u'using .netrc conflicts with giving username/password')
        if opts.password is not None and opts.username is None:
                parser.error(u'account username missing')
        if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
                parser.error(u'using output template conflicts with using title, literal title or auto number')
        if opts.usetitle and opts.useliteral:
                parser.error(u'using title conflicts with using literal title')
        if opts.username is not None and opts.password is None:
                # getpass is not among the module-level imports at the top of the
                # file, so it has to be brought into scope here.
                import getpass
                opts.password = getpass.getpass(u'Type account password and press return:')
        if opts.ratelimit is not None:
                numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
                if numeric_limit is None:
                        parser.error(u'invalid rate limit specified')
                opts.ratelimit = numeric_limit
        if opts.retries is not None:
                try:
                        opts.retries = long(opts.retries)
                except (TypeError, ValueError):
                        parser.error(u'invalid retry count specified')
        try:
                opts.playliststart = int(opts.playliststart)
                if opts.playliststart <= 0:
                        raise ValueError(u'Playlist start must be positive')
        except (TypeError, ValueError):
                parser.error(u'invalid playlist start number specified')
        try:
                opts.playlistend = int(opts.playlistend)
                # -1 is the option's default and stands for "last".
                if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
                        raise ValueError(u'Playlist end must be greater than playlist start')
        except (TypeError, ValueError):
                parser.error(u'invalid playlist end number specified')
        if opts.extractaudio:
                if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
                        parser.error(u'invalid audio format specified')

        # Any of the --get-* options implies both quiet mode and skipping the download.
        print_only = (opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat)

        # File downloader
        fd = FileDownloader({
                'usenetrc': opts.usenetrc,
                'username': opts.username,
                'password': opts.password,
                'quiet': (opts.quiet or print_only),
                'forceurl': opts.geturl,
                'forcetitle': opts.gettitle,
                'forcethumbnail': opts.getthumbnail,
                'forcedescription': opts.getdescription,
                'forcefilename': opts.getfilename,
                'forceformat': opts.getformat,
                'simulate': opts.simulate,
                'skip_download': (opts.skip_download or opts.simulate or print_only),
                'format': opts.format,
                'format_limit': opts.format_limit,
                'listformats': opts.listformats,
                # First matching alternative wins: an explicit template, then the
                # '-1' format variants, then the title/literal/autonumber
                # combinations, and finally the plain id-based name. The and/or
                # chain relies on every template being a non-empty string.
                'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
                        or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
                        or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
                        or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
                        or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
                        or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
                        or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
                        or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
                        or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
                        or u'%(id)s.%(ext)s'),
                'ignoreerrors': opts.ignoreerrors,
                'ratelimit': opts.ratelimit,
                'nooverwrites': opts.nooverwrites,
                'retries': opts.retries,
                'continuedl': opts.continue_dl,
                'noprogress': opts.noprogress,
                'playliststart': opts.playliststart,
                'playlistend': opts.playlistend,
                'logtostderr': opts.outtmpl == '-',
                'consoletitle': opts.consoletitle,
                'nopart': opts.nopart,
                'updatetime': opts.updatetime,
                'writedescription': opts.writedescription,
                'writeinfojson': opts.writeinfojson,
                'matchtitle': opts.matchtitle,
                'rejecttitle': opts.rejecttitle,
                'max_downloads': opts.max_downloads,
                'prefer_free_formats': opts.prefer_free_formats,
                })
        for extractor in extractors:
                fd.add_info_extractor(extractor)

        # PostProcessors
        if opts.extractaudio:
                fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))

        # Update version
        if opts.update_self:
                updateSelf(fd, sys.argv[0])

        # Maybe do nothing
        if len(all_urls) < 1:
                if not opts.update_self:
                        parser.error(u'you must provide at least one URL')
                else:
                        # A bare --update run has nothing left to do.
                        sys.exit()

        try:
                retcode = fd.download(all_urls)
        except MaxDownloadsReached:
                fd.to_screen(u'--max-downloads limit reached, aborting.')
                retcode = 101

        # Dump cookie jar if requested
        if opts.cookiefile is not None:
                try:
                        jar.save()
                except (IOError, OSError):
                        sys.exit(u'ERROR: unable to save cookie jar')

        sys.exit(retcode)
4509
def main():
        """Run _real_main() and map the known failure exceptions to exit values.

        DownloadError exits with status 1; SameFileError and KeyboardInterrupt
        exit with an error message handed to sys.exit(). Any other exception,
        including SystemExit, propagates unchanged.
        """
        try:
                _real_main()
        except DownloadError:
                failure = 1
        except SameFileError:
                failure = u'ERROR: fixed output name but more than one file to download'
        except KeyboardInterrupt:
                failure = u'\nERROR: Interrupted by user'
        else:
                # Nothing was raised, so there is no exit value to report.
                return
        sys.exit(failure)
4519
# Script entry point: call main() only when this file is run directly.
4520 if __name__ == '__main__':
4521         main()
4522
4523 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: