# Snapshot of youtube_dl/__init__.py from youtube-dl.git
# (commit: "OpenClassRoom IE (Closes: #234)")
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 __author__  = (
5         'Ricardo Garcia Gonzalez',
6         'Danny Colligan',
7         'Benjamin Johnson',
8         'Vasyl\' Vavrychuk',
9         'Witold Baryluk',
10         'Paweł Paprota',
11         'Gergely Imreh',
12         'Rogério Brito',
13         'Philipp Hagemeister',
14         'Sören Schulze',
15         'Kevin Ngo',
16         'Ori Avtalion',
17         'shizeeg',
18         )
19
20 __license__ = 'Public Domain'
21 __version__ = '2011.11.23'
22
23 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
24
25 import cookielib
26 import datetime
27 import gzip
28 import htmlentitydefs
29 import HTMLParser
30 import httplib
31 import locale
32 import math
33 import netrc
34 import os
35 import os.path
36 import re
37 import socket
38 import string
39 import subprocess
40 import sys
41 import time
42 import urllib
43 import urllib2
44 import warnings
45 import zlib
46
47 if os.name == 'nt':
48         import ctypes
49
50 try:
51         import email.utils
52 except ImportError: # Python 2.4
53         import email.Utils
54 try:
55         import cStringIO as StringIO
56 except ImportError:
57         import StringIO
58
59 # parse_qs was moved from the cgi module to the urlparse module recently.
60 try:
61         from urlparse import parse_qs
62 except ImportError:
63         from cgi import parse_qs
64
65 try:
66         import lxml.etree
67 except ImportError:
68         pass # Handled below
69
70 try:
71         import xml.etree.ElementTree
72 except ImportError: # Python<2.5: Not officially supported, but let it slip
73         warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
74
75 std_headers = {
76         'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
77         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
78         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
79         'Accept-Encoding': 'gzip, deflate',
80         'Accept-Language': 'en-us,en;q=0.5',
81 }
82
try:
	import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
	import re
	# Minimal stand-in exposing only json.loads(); a hand-written
	# recursive-descent parser over the decoded unicode string `s`.
	class json(object):
		@staticmethod
		def loads(s):
			"""Parse a UTF-8 encoded JSON document and return the Python value."""
			s = s.decode('UTF-8')
			def raiseError(msg, i):
				# Include position and remaining input to ease debugging.
				raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
			def skipSpace(i, expectMore=True):
				# Advance past JSON whitespace; optionally require that
				# more input follows (inside a value, EOF is an error).
				while i < len(s) and s[i] in ' \t\r\n':
					i += 1
				if expectMore:
					if i >= len(s):
						raiseError('Premature end', i)
				return i
			def decodeEscape(match):
				# Translate one backslash escape (match.group(1) is the
				# text after the backslash) into its character.
				esc = match.group(1)
				_STATIC = {
					'"': '"',
					'\\': '\\',
					'/': '/',
					'b': unichr(0x8),
					'f': unichr(0xc),
					'n': '\n',
					'r': '\r',
					't': '\t',
				}
				if esc in _STATIC:
					return _STATIC[esc]
				if esc[0] == 'u':
					if len(esc) == 1+4:
						# Single \uXXXX escape.
						return unichr(int(esc[1:5], 16))
					if len(esc) == 5+6 and esc[5:7] == '\\u':
						# UTF-16 surrogate pair \uD8xx\uDCxx -> one code point.
						hi = int(esc[1:5], 16)
						low = int(esc[7:11], 16)
						return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
				raise ValueError('Unknown escape ' + str(esc))
			def parseString(i):
				# i points at the opening quote; find the closing quote,
				# skipping quotes preceded by an odd number of backslashes
				# (those are escaped).
				i += 1
				e = i
				while True:
					e = s.index('"', e)
					bslashes = 0
					while s[e-bslashes-1] == '\\':
						bslashes += 1
					if bslashes % 2 == 1:
						e += 1
						continue
					break
				# Surrogate pairs must be matched first so the two halves
				# are decoded together.
				rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
				stri = rexp.sub(decodeEscape, s[i:e])
				return (e+1,stri)
			def parseObj(i):
				# i points at '{'; returns (next_index, dict).
				i += 1
				res = {}
				i = skipSpace(i)
				if s[i] == '}': # Empty dictionary
					return (i+1,res)
				while True:
					if s[i] != '"':
						raiseError('Expected a string object key', i)
					i,key = parseString(i)
					i = skipSpace(i)
					if i >= len(s) or s[i] != ':':
						raiseError('Expected a colon', i)
					i,val = parse(i+1)
					res[key] = val
					i = skipSpace(i)
					if s[i] == '}':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected comma or closing curly brace', i)
					i = skipSpace(i+1)
			def parseArray(i):
				# i points at '['; returns (next_index, list).
				res = []
				i = skipSpace(i+1)
				if s[i] == ']': # Empty array
					return (i+1,res)
				while True:
					i,val = parse(i)
					res.append(val)
					i = skipSpace(i) # Raise exception if premature end
					if s[i] == ']':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected a comma or closing bracket', i)
					i = skipSpace(i+1)
			def parseDiscrete(i):
				# Literals: true / false / null.
				for k,v in {'true': True, 'false': False, 'null': None}.items():
					if s.startswith(k, i):
						return (i+len(k), v)
				raiseError('Not a boolean (or null)', i)
			def parseNumber(i):
				mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
				if mobj is None:
					raiseError('Not a number', i)
				nums = mobj.group(1)
				# Fraction or exponent present -> float, otherwise int.
				if '.' in nums or 'e' in nums or 'E' in nums:
					return (i+len(nums), float(nums))
				return (i+len(nums), int(nums))
			# Dispatch on the first character of a value; anything not in
			# the map is assumed to start a number.
			CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
			def parse(i):
				i = skipSpace(i)
				i,res = CHARMAP.get(s[i], parseNumber)(i)
				i = skipSpace(i, False)
				return (i,res)
			i,res = parse(0)
			if i < len(s):
				raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
			return res
195
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	try:
		pref = locale.getpreferredencoding()
		# Make sure the reported codec actually works before trusting it.
		u'TEST'.encode(pref)
	except:
		pref = 'UTF-8'
	return pref
211
212
213 def htmlentity_transform(matchobj):
214         """Transforms an HTML entity to a Unicode character.
215
216         This function receives a match object and is intended to be used with
217         the re.sub() function.
218         """
219         entity = matchobj.group(1)
220
221         # Known non-numeric HTML entity
222         if entity in htmlentitydefs.name2codepoint:
223                 return unichr(htmlentitydefs.name2codepoint[entity])
224
225         # Unicode character
226         mobj = re.match(ur'(?u)#(x?\d+)', entity)
227         if mobj is not None:
228                 numstr = mobj.group(1)
229                 if numstr.startswith(u'x'):
230                         base = 16
231                         numstr = u'0%s' % numstr
232                 else:
233                         base = 10
234                 return unichr(long(numstr, base))
235
236         # Unknown entity in name, return its literal representation
237         return (u'&%s;' % entity)
238
239
240 def sanitize_title(utitle):
241         """Sanitizes a video title so it could be used as part of a filename."""
242         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
243         return utitle.replace(unicode(os.sep), u'%')
244
245
def sanitize_open(filename, open_mode):
	"""Try to open the given filename, and slightly tweak it if this fails.

	Attempts to open the given filename. If this fails, it tries to change
	the filename slightly, step by step, until it's either able to open it
	or it fails and raises a final exception, like the standard open()
	function.

	It returns the tuple (stream, definitive_file_name).
	"""
	try:
		if filename == u'-':
			# u'-' means standard output. On Windows, switch stdout to
			# binary mode so video data is not mangled by CRLF translation.
			if sys.platform == 'win32':
				import msvcrt
				msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
			return (sys.stdout, filename)
		stream = open(filename, open_mode)
		return (stream, filename)
	except (IOError, OSError), err:
		# In case of error, try to remove win32 forbidden chars
		filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

		# An exception here should be caught in the caller
		stream = open(filename, open_mode)
		return (stream, filename)
271
272
def timeconvert(timestr):
	"""Convert RFC 2822 defined time string into system timestamp"""
	parsed = email.utils.parsedate_tz(timestr)
	if parsed is None:
		# Unparseable date string.
		return None
	return email.utils.mktime_tz(parsed)
280
281 def _simplify_title(title):
282         expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
283         return expr.sub(u'_', title).strip(u'_')
284
285 def _orderedSet(iterable):
286         """ Remove all duplicates from the input iterable """
287         res = []
288         for el in iterable:
289                 if el not in res:
290                         res.append(el)
291         return res
292
class DownloadError(Exception):
	"""Download Error exception.

	This exception may be thrown by FileDownloader objects if they are not
	configured to continue on errors (see FileDownloader.trouble and the
	'ignoreerrors' option). They will contain the appropriate error message.
	"""
	pass
301
302
class SameFileError(Exception):
	"""Same File exception.

	This exception will be thrown by FileDownloader objects if they detect
	multiple files would have to be downloaded to the same file on disk.
	"""
	pass
310
311
class PostProcessingError(Exception):
	"""Post Processing exception.

	This exception may be raised by PostProcessor's .run() method to
	indicate an error in the postprocessing task.
	"""
	pass
319
320
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	This exception will be thrown when a video is requested
	in a format that is not available for that video.
	"""
	pass
328
329
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	This exception may be raised by FileDownloader objects when a file they
	download is too small for what the server announced first, indicating
	the connection was probably interrupted.

	Attributes (both byte counts):
	  downloaded -- number of bytes actually received
	  expected   -- number of bytes the server announced
	"""
	# Both in bytes
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		self.downloaded = downloaded
		self.expected = expected
344
345
class YoutubeDLHandler(urllib2.HTTPHandler):
	"""Handler for HTTP requests and responses.

	This class, when installed with an OpenerDirector, automatically adds
	the standard headers to every HTTP request and handles gzipped and
	deflated responses from web servers. If compression is to be avoided in
	a particular request, the original request in the program code only has
	to include the HTTP header "Youtubedl-No-Compression", which will be
	removed before making the real request.

	Part of this code was copied from:

	http://techknack.net/python-urllib2-handlers/

	Andrew Rowls, the author of that code, agreed to release it to the
	public domain.
	"""

	@staticmethod
	def deflate(data):
		# Servers disagree on whether "deflate" means raw deflate or
		# zlib-wrapped deflate; try raw first, then fall back.
		try:
			return zlib.decompress(data, -zlib.MAX_WBITS)
		except zlib.error:
			return zlib.decompress(data)

	@staticmethod
	def addinfourl_wrapper(stream, headers, url, code):
		# NOTE(review): presumably a compatibility shim for old Python
		# versions where addinfourl() did not accept a code argument.
		if hasattr(urllib2.addinfourl, 'getcode'):
			return urllib2.addinfourl(stream, headers, url, code)
		ret = urllib2.addinfourl(stream, headers, url)
		ret.code = code
		return ret

	def http_request(self, req):
		# Force our standard headers, replacing any same-named header
		# already present on the request.
		for h in std_headers:
			if h in req.headers:
				del req.headers[h]
			req.add_header(h, std_headers[h])
		# NOTE(review): the odd capitalization ('Youtubedl-no-compression',
		# 'Accept-encoding') appears to match how urllib2 stores header
		# names internally — confirm before changing.
		if 'Youtubedl-no-compression' in req.headers:
			if 'Accept-encoding' in req.headers:
				del req.headers['Accept-encoding']
			del req.headers['Youtubedl-no-compression']
		return req

	def http_response(self, req, resp):
		old_resp = resp
		# gzip: transparently decompress, preserving url/code/msg.
		if resp.headers.get('Content-encoding', '') == 'gzip':
			gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		# deflate
		if resp.headers.get('Content-encoding', '') == 'deflate':
			gz = StringIO.StringIO(self.deflate(resp.read()))
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		return resp
403
404
405 class FileDownloader(object):
406         """File Downloader class.
407
408         File downloader objects are the ones responsible of downloading the
409         actual video file and writing it to disk if the user has requested
410         it, among some other tasks. In most cases there should be one per
411         program. As, given a video URL, the downloader doesn't know how to
412         extract all the needed information, task that InfoExtractors do, it
413         has to pass the URL to one of them.
414
415         For this, file downloader objects have a method that allows
416         InfoExtractors to be registered in a given order. When it is passed
417         a URL, the file downloader handles it to the first InfoExtractor it
418         finds that reports being able to handle it. The InfoExtractor extracts
419         all the information about the video or videos the URL refers to, and
420         asks the FileDownloader to process the video information, possibly
421         downloading the video.
422
423         File downloaders accept a lot of parameters. In order not to saturate
424         the object constructor with arguments, it receives a dictionary of
425         options instead. These options are available through the params
426         attribute for the InfoExtractors to use. The FileDownloader also
427         registers itself as the downloader in charge for the InfoExtractors
428         that are added to it, so this is a "mutual registration".
429
430         Available options:
431
432         username:         Username for authentication purposes.
433         password:         Password for authentication purposes.
434         usenetrc:         Use netrc for authentication instead.
435         quiet:            Do not print messages to stdout.
436         forceurl:         Force printing final URL.
437         forcetitle:       Force printing title.
438         forcethumbnail:   Force printing thumbnail URL.
439         forcedescription: Force printing description.
440         forcefilename:    Force printing final filename.
441         simulate:         Do not download the video files.
442         format:           Video format code.
443         format_limit:     Highest quality format to try.
444         outtmpl:          Template for output names.
445         ignoreerrors:     Do not stop on download errors.
446         ratelimit:        Download speed limit, in bytes/sec.
447         nooverwrites:     Prevent overwriting files.
448         retries:          Number of times to retry for HTTP error 5xx
449         continuedl:       Try to continue downloads if possible.
450         noprogress:       Do not print the progress bar.
451         playliststart:    Playlist item to start at.
452         playlistend:      Playlist item to end at.
453         matchtitle:       Download only matching titles.
454         rejecttitle:      Reject downloads for matching titles.
455         logtostderr:      Log messages to stderr instead of stdout.
456         consoletitle:     Display progress in console window's titlebar.
457         nopart:           Do not use temporary .part files.
458         updatetime:       Use the Last-modified header to set output file timestamps.
459         writedescription: Write the video description to a .description file
460         writeinfojson:    Write the video description to a .info.json file
461         """
462
463         params = None
464         _ies = []
465         _pps = []
466         _download_retcode = None
467         _num_downloads = None
468         _screen_file = None
469
470         def __init__(self, params):
471                 """Create a FileDownloader object with the given options."""
472                 self._ies = []
473                 self._pps = []
474                 self._download_retcode = 0
475                 self._num_downloads = 0
476                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
477                 self.params = params
478
479         @staticmethod
480         def format_bytes(bytes):
481                 if bytes is None:
482                         return 'N/A'
483                 if type(bytes) is str:
484                         bytes = float(bytes)
485                 if bytes == 0.0:
486                         exponent = 0
487                 else:
488                         exponent = long(math.log(bytes, 1024.0))
489                 suffix = 'bkMGTPEZY'[exponent]
490                 converted = float(bytes) / float(1024 ** exponent)
491                 return '%.2f%s' % (converted, suffix)
492
493         @staticmethod
494         def calc_percent(byte_counter, data_len):
495                 if data_len is None:
496                         return '---.-%'
497                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
498
499         @staticmethod
500         def calc_eta(start, now, total, current):
501                 if total is None:
502                         return '--:--'
503                 dif = now - start
504                 if current == 0 or dif < 0.001: # One millisecond
505                         return '--:--'
506                 rate = float(current) / dif
507                 eta = long((float(total) - float(current)) / rate)
508                 (eta_mins, eta_secs) = divmod(eta, 60)
509                 if eta_mins > 99:
510                         return '--:--'
511                 return '%02d:%02d' % (eta_mins, eta_secs)
512
513         @staticmethod
514         def calc_speed(start, now, bytes):
515                 dif = now - start
516                 if bytes == 0 or dif < 0.001: # One millisecond
517                         return '%10s' % '---b/s'
518                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
519
520         @staticmethod
521         def best_block_size(elapsed_time, bytes):
522                 new_min = max(bytes / 2.0, 1.0)
523                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
524                 if elapsed_time < 0.001:
525                         return long(new_max)
526                 rate = bytes / elapsed_time
527                 if rate > new_max:
528                         return long(new_max)
529                 if rate < new_min:
530                         return long(new_min)
531                 return long(rate)
532
533         @staticmethod
534         def parse_bytes(bytestr):
535                 """Parse a string indicating a byte quantity into a long integer."""
536                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
537                 if matchobj is None:
538                         return None
539                 number = float(matchobj.group(1))
540                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
541                 return long(round(number * multiplier))
542
	def add_info_extractor(self, ie):
		"""Add an InfoExtractor object to the end of the list."""
		self._ies.append(ie)
		# Mutual registration: the IE also learns who its downloader is.
		ie.set_downloader(self)
547
	def add_post_processor(self, pp):
		"""Add a PostProcessor object to the end of the chain."""
		self._pps.append(pp)
		# Mutual registration, mirroring add_info_extractor().
		pp.set_downloader(self)
552
	def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout (or stderr, per 'logtostderr') if not in quiet mode."""
		try:
			if not self.params.get('quiet', False):
				terminator = [u'\n', u''][skip_eol]
				# Trailing comma suppresses print's own newline; the
				# terminator above controls the EOL instead. Encoding to
				# the terminal charset may raise UnicodeEncodeError.
				print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
			self._screen_file.flush()
		except (UnicodeEncodeError), err:
			if not ignore_encoding_errors:
				raise
563
	def to_stderr(self, message):
		"""Print message to stderr (always, regardless of 'quiet')."""
		print >>sys.stderr, message.encode(preferredencoding())
567
	def to_cons_title(self, message):
		"""Set console/terminal window title to message."""
		if not self.params.get('consoletitle', False):
			return
		if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
			# c_wchar_p() might not be necessary if `message` is
			# already of type unicode()
			ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
		elif 'TERM' in os.environ:
			# xterm-style escape sequence: ESC ] 0 ; <title> BEL
			sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
578
579         def fixed_template(self):
580                 """Checks if the output template is fixed."""
581                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
582
583         def trouble(self, message=None):
584                 """Determine action to take when a download problem appears.
585
586                 Depending on if the downloader has been configured to ignore
587                 download errors or not, this method may throw an exception or
588                 not when errors are found, after printing the message.
589                 """
590                 if message is not None:
591                         self.to_stderr(message)
592                 if not self.params.get('ignoreerrors', False):
593                         raise DownloadError(message)
594                 self._download_retcode = 1
595
596         def slow_down(self, start_time, byte_counter):
597                 """Sleep if the download speed is over the rate limit."""
598                 rate_limit = self.params.get('ratelimit', None)
599                 if rate_limit is None or byte_counter == 0:
600                         return
601                 now = time.time()
602                 elapsed = now - start_time
603                 if elapsed <= 0.0:
604                         return
605                 speed = float(byte_counter) / elapsed
606                 if speed > rate_limit:
607                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
608
609         def temp_name(self, filename):
610                 """Returns a temporary filename for the given filename."""
611                 if self.params.get('nopart', False) or filename == u'-' or \
612                                 (os.path.exists(filename) and not os.path.isfile(filename)):
613                         return filename
614                 return filename + u'.part'
615
	def undo_temp_name(self, filename):
		"""Strip the u'.part' suffix added by temp_name(), if present."""
		if filename.endswith(u'.part'):
			return filename[:-len(u'.part')]
		return filename
620
	def try_rename(self, old_filename, new_filename):
		"""Rename the temporary file to its final name, reporting failure via trouble()."""
		try:
			if old_filename == new_filename:
				return
			os.rename(old_filename, new_filename)
		except (IOError, OSError), err:
			self.trouble(u'ERROR: unable to rename file')
628
629         def try_utime(self, filename, last_modified_hdr):
630                 """Try to set the last-modified time of the given file."""
631                 if last_modified_hdr is None:
632                         return
633                 if not os.path.isfile(filename):
634                         return
635                 timestr = last_modified_hdr
636                 if timestr is None:
637                         return
638                 filetime = timeconvert(timestr)
639                 if filetime is None:
640                         return filetime
641                 try:
642                         os.utime(filename, (time.time(), filetime))
643                 except:
644                         pass
645                 return filetime
646
	def report_writedescription(self, descfn):
		""" Report that the description file is being written """
		self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
650
	def report_writeinfojson(self, infofn):
		""" Report that the .info.json metadata file is being written """
		self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
654
	def report_destination(self, filename):
		"""Report destination filename."""
		self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
658
	def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
		"""Report download progress."""
		if self.params.get('noprogress', False):
			return
		# Leading \r plus skip_eol redraws the progress line in place.
		self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
				(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
		self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
				(percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
667
	def report_resuming_byte(self, resume_len):
		"""Report attempt to resume at given byte."""
		self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
671
	def report_retry(self, count, retries):
		"""Report retry in case of HTTP error 5xx"""
		self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
675
	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		try:
			self.to_screen(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			# Fall back to a message without the filename when the
			# terminal charset cannot represent it.
			self.to_screen(u'[download] The file has already been downloaded')
682
683         def report_unable_to_resume(self):
684                 """Report it was impossible to resume download."""
685                 self.to_screen(u'[download] Unable to resume')
686
687         def report_finish(self):
688                 """Report download finished."""
689                 if self.params.get('noprogress', False):
690                         self.to_screen(u'[download] Download completed')
691                 else:
692                         self.to_screen(u'')
693
694         def increment_downloads(self):
695                 """Increment the ordinal that assigns a number to each file."""
696                 self._num_downloads += 1
697
698         def prepare_filename(self, info_dict):
699                 """Generate the output filename."""
700                 try:
701                         template_dict = dict(info_dict)
702                         template_dict['epoch'] = unicode(long(time.time()))
703                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
704                         filename = self.params['outtmpl'] % template_dict
705                         return filename
706                 except (ValueError, KeyError), err:
707                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
708                         return None
709
710         def _match_entry(self, info_dict):
711                 """ Returns None iff the file should be downloaded """
712
713                 title = info_dict['title']
714                 matchtitle = self.params.get('matchtitle', False)
715                 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
716                         return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
717                 rejecttitle = self.params.get('rejecttitle', False)
718                 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
719                         return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
720                 return None
721
	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor.

		Applies the title match/reject filters and the --max-downloads
		cap, handles the --force-* printing options and --simulate,
		writes the optional .description and .info.json side files, and
		finally downloads and post-processes the video unless
		skip_download is set.
		"""

		# Title filters (--match-title / --reject-title).
		reason = self._match_entry(info_dict)
		if reason is not None:
			self.to_screen(u'[download] ' + reason)
			return

		max_downloads = self.params.get('max_downloads')
		if max_downloads is not None:
			if self._num_downloads > int(max_downloads):
				self.to_screen(u'[download] Maximum number of downloads reached. Skipping ' + info_dict['title'])
				return

		# May be None when the output template is erroneous (an error
		# has already been reported in that case).
		filename = self.prepare_filename(info_dict)

		# Forced printings
		if self.params.get('forcetitle', False):
			print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forceurl', False):
			print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
			print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcedescription', False) and 'description' in info_dict:
			print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcefilename', False) and filename is not None:
			print filename.encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forceformat', False):
			print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')

		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			return

		if filename is None:
			return

		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists and will be skipped')
			return

		# Create the containing directory if needed.
		try:
			dn = os.path.dirname(filename)
			if dn != '' and not os.path.exists(dn):
				os.makedirs(dn)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to create directory ' + unicode(err))
			return

		if self.params.get('writedescription', False):
			try:
				descfn = filename + '.description'
				self.report_writedescription(descfn)
				descfile = open(descfn, 'wb')
				try:
					descfile.write(info_dict['description'].encode('utf-8'))
				finally:
					descfile.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write description file ' + descfn)
				return

		if self.params.get('writeinfojson', False):
			infofn = filename + '.info.json'
			self.report_writeinfojson(infofn)
			# Probe that a usable json module with dump() is in scope
			# (older Pythons may lack it).
			try:
				json.dump
			except (NameError,AttributeError):
				self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
				return
			try:
				infof = open(infofn, 'wb')
				try:
					# 'urlhandle' holds a live HTTP response object and is
					# not JSON-serializable, so it is excluded.
					json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
					json.dump(json_info_dict, infof)
				finally:
					infof.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
				return

		if not self.params.get('skip_download', False):
			try:
				success = self._do_download(filename, info_dict)
			except (OSError, IOError), err:
				raise UnavailableVideoError
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self.trouble(u'ERROR: unable to download video data: %s' % str(err))
				return
			except (ContentTooShortError, ), err:
				self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
				return

			if success:
				try:
					self.post_process(filename, info_dict)
				except (PostProcessingError), err:
					self.trouble(u'ERROR: postprocessing: %s' % str(err))
					return
822         def download(self, url_list):
823                 """Download a given list of URLs."""
824                 if len(url_list) > 1 and self.fixed_template():
825                         raise SameFileError(self.params['outtmpl'])
826
827                 for url in url_list:
828                         suitable_found = False
829                         for ie in self._ies:
830                                 # Go to next InfoExtractor if not suitable
831                                 if not ie.suitable(url):
832                                         continue
833
834                                 # Suitable InfoExtractor found
835                                 suitable_found = True
836
837                                 # Extract information from URL and process it
838                                 ie.extract(url)
839
840                                 # Suitable InfoExtractor had been found; go to next URL
841                                 break
842
843                         if not suitable_found:
844                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
845
846                 return self._download_retcode
847
848         def post_process(self, filename, ie_info):
849                 """Run the postprocessing chain on the given file."""
850                 info = dict(ie_info)
851                 info['filepath'] = filename
852                 for pp in self._pps:
853                         info = pp.run(info)
854                         if info is None:
855                                 break
856
	def _download_with_rtmpdump(self, filename, url, player_url):
		"""Download an rtmp:// URL by shelling out to rtmpdump.

		Downloads into a temporary file, retrying/resuming while
		rtmpdump keeps making progress, and renames it into place on
		success. Returns True on success, False otherwise.
		"""
		self.report_destination(filename)
		tmpfilename = self.temp_name(filename)

		# Check for rtmpdump first
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrumpted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		# The [[], [...]][condition] idiom appends the extra arguments
		# only when the condition is true.
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(tmpfilename)
			self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(tmpfilename)
			# No new bytes and a hard failure (exit code 1): give up.
			if prevsize == cursize and retval == 1:
				break
			# Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
			if prevsize == cursize and retval == 2 and cursize > 1024:
				self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
				retval = 0
				break
		if retval == 0:
			self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
			self.try_rename(tmpfilename, filename)
			return True
		else:
			self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
			return False
893
	def _do_download(self, filename, info_dict):
		"""Download info_dict['url'] into `filename`.

		Handles resuming of a partial (temporary) file, retries on HTTP
		5xx errors, adaptive block size, progress reporting and rate
		limiting. RTMP URLs are delegated to rtmpdump. Returns True on
		success, False on a reported error; may raise
		ContentTooShortError when the stream ends early.
		"""
		url = info_dict['url']
		player_url = info_dict.get('player_url', None)

		# Check file already present
		if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
			self.report_file_already_downloaded(filename)
			return True

		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url, player_url)

		tmpfilename = self.temp_name(filename)
		stream = None

		# Do not include the Accept-Encoding header
		headers = {'Youtubedl-no-compression': 'True'}
		# basic_request is kept without the Range header as a fallback
		# for servers that reject resume requests (HTTP 416 below).
		basic_request = urllib2.Request(url, None, headers)
		request = urllib2.Request(url, None, headers)

		# Establish possible resume length
		if os.path.isfile(tmpfilename):
			resume_len = os.path.getsize(tmpfilename)
		else:
			resume_len = 0

		open_mode = 'wb'
		if resume_len != 0:
			if self.params.get('continuedl', False):
				self.report_resuming_byte(resume_len)
				request.add_header('Range','bytes=%d-' % resume_len)
				open_mode = 'ab'
			else:
				resume_len = 0

		count = 0
		retries = self.params.get('retries', 0)
		while count <= retries:
			# Establish connection
			try:
				if count == 0 and 'urlhandle' in info_dict:
					data = info_dict['urlhandle']
				# NOTE(review): the assignment above is immediately
				# overwritten by the urlopen() below, so the cached
				# 'urlhandle' is never actually used — looks unintentional.
				data = urllib2.urlopen(request)
				break
			except (urllib2.HTTPError, ), err:
				if (err.code < 500 or err.code >= 600) and err.code != 416:
					# Unexpected HTTP error
					raise
				elif err.code == 416:
					# Unable to resume (requested range not satisfiable)
					try:
						# Open the connection again without the range header
						data = urllib2.urlopen(basic_request)
						content_length = data.info()['Content-Length']
					except (urllib2.HTTPError, ), err:
						if err.code < 500 or err.code >= 600:
							raise
					else:
						# Examine the reported length
						if (content_length is not None and
								(resume_len - 100 < long(content_length) < resume_len + 100)):
							# The file had already been fully downloaded.
							# Explanation to the above condition: in issue #175 it was revealed that
							# YouTube sometimes adds or removes a few bytes from the end of the file,
							# changing the file size slightly and causing problems for some users. So
							# I decided to implement a suggested change and consider the file
							# completely downloaded if the file size differs less than 100 bytes from
							# the one in the hard drive.
							self.report_file_already_downloaded(filename)
							self.try_rename(tmpfilename, filename)
							return True
						else:
							# The length does not match, we start the download over
							self.report_unable_to_resume()
							open_mode = 'wb'
							break
			# Retry
			count += 1
			if count <= retries:
				self.report_retry(count, retries)

		if count > retries:
			self.trouble(u'ERROR: giving up after %s retries' % retries)
			return False

		data_len = data.info().get('Content-length', None)
		if data_len is not None:
			data_len = long(data_len) + resume_len
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0 + resume_len
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			if len(data_block) == 0:
				break
			byte_counter += len(data_block)

			# Open file just in time
			if stream is None:
				try:
					(stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
					assert stream is not None
					filename = self.undo_temp_name(tmpfilename)
					self.report_destination(filename)
				except (OSError, IOError), err:
					self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
					return False
			try:
				stream.write(data_block)
			except (IOError, OSError), err:
				self.trouble(u'\nERROR: unable to write data: %s' % str(err))
				return False
			# Adapt the block size to the observed transfer speed.
			block_size = self.best_block_size(after - before, len(data_block))

			# Progress message
			speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
			if data_len is None:
				self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
			else:
				percent_str = self.calc_percent(byte_counter, data_len)
				eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
				self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter - resume_len)

		if stream is None:
			self.trouble(u'\nERROR: Did not get any data blocks')
			return False
		stream.close()
		self.report_finish()
		if data_len is not None and byte_counter != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		self.try_rename(tmpfilename, filename)

		# Update file modification time
		if self.params.get('updatetime', True):
			info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))

		return True
1039
1040
class InfoExtractor(object):
	"""Base class for site-specific information extractors.

	An information extractor takes a URL and produces dictionaries
	describing the video(s) behind it — real video URL, title,
	simplified title, author and so on — which the FileDownloader then
	processes (possibly downloading the video to the file system).
	Each dictionary must provide the following fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	Optional fields, used only when the corresponding forced-printing
	options are requested (e.g. when youtube-dl serves as the backend
	for a video search function such as the one in youtube2mp3):

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Subclasses should override _real_initialize() and _real_extract()
	and define a _VALID_URL regexp, and are usually added to the list
	of extractors.
	"""

	# Whether _real_initialize() has already run for this instance.
	_ready = False
	# The FileDownloader this extractor reports to (may be None).
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Return True if this extractor can handle the given URL."""
		return re.match(self._VALID_URL, url) is not None

	def initialize(self):
		"""Run one-time initialization (authentication, etc.)."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extract URL information and return it as a list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Attach the downloader this extractor reports to."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
1109
1110
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Group 1 captures the URL prefix (if any); group 2 is the video id.
	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
	# Requests English-language pages (hl=en, gl=US).
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	# Machine name looked up in the user's .netrc for credentials.
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
	# Maps format codes to container extensions ('flv' is assumed elsewhere
	# for codes not listed here).
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'44': 'webm',
		'45': 'webm',
	}
	# Maps format codes to 'heightxwidth' strings (informational only;
	# '???' where unknown).
	_video_dimensions = {
		'5': '240x400',
		'6': '???',
		'13': '???',
		'17': '144x176',
		'18': '360x640',
		'22': '720x1280',
		'34': '360x640',
		'35': '480x854',
		'37': '1080x1920',
		'38': '3072x4096',
		'43': '360x640',
		'44': '480x854',
		'45': '720x1280',
	}
	IE_NAME = u'youtube'
1148
1149         def report_lang(self):
1150                 """Report attempt to set language."""
1151                 self._downloader.to_screen(u'[youtube] Setting language')
1152
1153         def report_login(self):
1154                 """Report attempt to log in."""
1155                 self._downloader.to_screen(u'[youtube] Logging in')
1156
1157         def report_age_confirmation(self):
1158                 """Report attempt to confirm age."""
1159                 self._downloader.to_screen(u'[youtube] Confirming age')
1160
1161         def report_video_webpage_download(self, video_id):
1162                 """Report attempt to download video webpage."""
1163                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1164
1165         def report_video_info_webpage_download(self, video_id):
1166                 """Report attempt to download video info webpage."""
1167                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1168
1169         def report_information_extraction(self, video_id):
1170                 """Report attempt to extract video information."""
1171                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1172
1173         def report_unavailable_format(self, video_id, format):
1174                 """Report extracted video URL."""
1175                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1176
1177         def report_rtmp_download(self):
1178                 """Indicate the download will use the RTMP protocol."""
1179                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1180
1181         def _print_formats(self, formats):
1182                 print 'Available formats:'
1183                 for x in formats:
1184                         print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1185
	def _real_initialize(self):
		"""Set the interface language, then optionally log in and confirm age.

		Credentials come from the downloader's 'username'/'password'
		params or, with 'usenetrc', from the user's .netrc entry for
		'youtube'. Language and login failures are reported as warnings;
		a failed age confirmation is reported as an error.
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':		'/',
				'action_login':	'Log In',
				'username':	username,
				'password':	password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the login form is still present in the response,
			# authentication did not succeed.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':		'/',
				'action_confirm':	'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return
1254
1255         def _real_extract(self, url):
1256                 # Extract video id from URL
1257                 mobj = re.match(self._VALID_URL, url)
1258                 if mobj is None:
1259                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1260                         return
1261                 video_id = mobj.group(2)
1262
1263                 # Get video webpage
1264                 self.report_video_webpage_download(video_id)
1265                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1266                 try:
1267                         video_webpage = urllib2.urlopen(request).read()
1268                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1269                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1270                         return
1271
1272                 # Attempt to extract SWF player URL
1273                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1274                 if mobj is not None:
1275                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1276                 else:
1277                         player_url = None
1278
1279                 # Get video info
1280                 self.report_video_info_webpage_download(video_id)
1281                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1282                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1283                                         % (video_id, el_type))
1284                         request = urllib2.Request(video_info_url)
1285                         try:
1286                                 video_info_webpage = urllib2.urlopen(request).read()
1287                                 video_info = parse_qs(video_info_webpage)
1288                                 if 'token' in video_info:
1289                                         break
1290                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1291                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1292                                 return
1293                 if 'token' not in video_info:
1294                         if 'reason' in video_info:
1295                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1296                         else:
1297                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1298                         return
1299
1300                 # Start extracting information
1301                 self.report_information_extraction(video_id)
1302
1303                 # uploader
1304                 if 'author' not in video_info:
1305                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1306                         return
1307                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1308
1309                 # title
1310                 if 'title' not in video_info:
1311                         self._downloader.trouble(u'ERROR: unable to extract video title')
1312                         return
1313                 video_title = urllib.unquote_plus(video_info['title'][0])
1314                 video_title = video_title.decode('utf-8')
1315                 video_title = sanitize_title(video_title)
1316
1317                 # simplified title
1318                 simple_title = _simplify_title(video_title)
1319
1320                 # thumbnail image
1321                 if 'thumbnail_url' not in video_info:
1322                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1323                         video_thumbnail = ''
1324                 else:   # don't panic if we can't find it
1325                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1326
1327                 # upload date
1328                 upload_date = u'NA'
1329                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1330                 if mobj is not None:
1331                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1332                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1333                         for expression in format_expressions:
1334                                 try:
1335                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1336                                 except:
1337                                         pass
1338
1339                 # description
1340                 try:
1341                         lxml.etree
1342                 except NameError:
1343                         video_description = u'No description available.'
1344                         if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1345                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1346                                 if mobj is not None:
1347                                         video_description = mobj.group(1).decode('utf-8')
1348                 else:
1349                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1350                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1351                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1352                         # TODO use another parser
1353
1354                 # token
1355                 video_token = urllib.unquote_plus(video_info['token'][0])
1356
1357                 # Decide which formats to download
1358                 req_format = self._downloader.params.get('format', None)
1359
1360                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1361                         self.report_rtmp_download()
1362                         video_url_list = [(None, video_info['conn'][0])]
1363                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1364                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1365                         url_data = [parse_qs(uds) for uds in url_data_strs]
1366                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1367                         url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1368
1369                         format_limit = self._downloader.params.get('format_limit', None)
1370                         if format_limit is not None and format_limit in self._available_formats:
1371                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1372                         else:
1373                                 format_list = self._available_formats
1374                         existing_formats = [x for x in format_list if x in url_map]
1375                         if len(existing_formats) == 0:
1376                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1377                                 return
1378                         if self._downloader.params.get('listformats', None):
1379                                 self._print_formats(existing_formats)
1380                                 return
1381                         if req_format is None or req_format == 'best':
1382                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1383                         elif req_format == 'worst':
1384                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1385                         elif req_format in ('-1', 'all'):
1386                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1387                         else:
1388                                 # Specific formats. We pick the first in a slash-delimeted sequence.
1389                                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1390                                 req_formats = req_format.split('/')
1391                                 video_url_list = None
1392                                 for rf in req_formats:
1393                                         if rf in url_map:
1394                                                 video_url_list = [(rf, url_map[rf])]
1395                                                 break
1396                                 if video_url_list is None:
1397                                         self._downloader.trouble(u'ERROR: requested format not available')
1398                                         return
1399                 else:
1400                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1401                         return
1402
1403                 for format_param, video_real_url in video_url_list:
1404                         # At this point we have a new video
1405                         self._downloader.increment_downloads()
1406
1407                         # Extension
1408                         video_extension = self._video_extensions.get(format_param, 'flv')
1409
1410                         try:
1411                                 # Process video information
1412                                 self._downloader.process_info({
1413                                         'id':           video_id.decode('utf-8'),
1414                                         'url':          video_real_url.decode('utf-8'),
1415                                         'uploader':     video_uploader.decode('utf-8'),
1416                                         'upload_date':  upload_date,
1417                                         'title':        video_title,
1418                                         'stitle':       simple_title,
1419                                         'ext':          video_extension.decode('utf-8'),
1420                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1421                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1422                                         'description':  video_description,
1423                                         'player_url':   player_url,
1424                                 })
1425                         except UnavailableVideoError, err:
1426                                 self._downloader.trouble(u'\nERROR: unable to download video')
1427
1428
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com."""

	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	# Disclaimer page fetched once, then POSTed to _FILTER_POST to turn the
	# family filter off before any video page is requested.
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
	# YoutubeIE instance used for 'yt-' prefixed ids (YouTube-hosted videos).
	_youtube_ie = None
	IE_NAME = u'metacafe'

	def __init__(self, youtube_ie, downloader=None):
		"""Store the YoutubeIE used to delegate 'yt-' prefixed video ids."""
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	def report_disclaimer(self):
		"""Report disclaimer retrieval."""
		self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[metacafe] Confirming age')

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

	def _real_initialize(self):
		"""Disable the site-wide family filter by accepting the disclaimer."""
		# Retrieve disclaimer
		request = urllib2.Request(self._DISCLAIMER)
		try:
			self.report_disclaimer()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
			return

		# Confirm age
		disclaimer_form = {
			'filters': '0',
			'submit': "Continue - I'm over 18",
			}
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
		try:
			self.report_age_confirmation()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract the media URL, title and uploader from a Metacafe page."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		# Check if video comes from YouTube; 'yt-<id>' ids are delegated to
		# the YouTube extractor passed in at construction time.
		mobj2 = re.match(r'^yt-(.*)$', video_id)
		if mobj2 is not None:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
			return

		# At this point we have a new video
		self._downloader.increment_downloads()

		simple_title = mobj.group(2).decode('utf-8')

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage. Two layouts exist:
		# a direct mediaURL parameter, or a flashvars blob with mediaData.
		self.report_extraction(video_id)
		mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
		if mobj is not None:
			mediaURL = urllib.unquote(mobj.group(1))
			video_extension = mediaURL[-3:]

			# Extract gdaKey if available; when present it must be appended
			# as a query parameter for the media URL to be served.
			mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
			if mobj is None:
				video_url = mediaURL
			else:
				gdaKey = mobj.group(1)
				video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
		else:
			# Fallback: parse the flashvars value as a query string and pull
			# mediaURL and key out of its JSON-ish 'mediaData' field.
			mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			vardict = parse_qs(mobj.group(1))
			if 'mediaData' not in vardict:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			mediaURL = mobj.group(1).replace('\\/', '/')
			video_extension = mediaURL[-3:]
			video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

		mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1569
1570
1571 class DailymotionIE(InfoExtractor):
1572         """Information Extractor for Dailymotion"""
1573
1574         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1575         IE_NAME = u'dailymotion'
1576
1577         def __init__(self, downloader=None):
1578                 InfoExtractor.__init__(self, downloader)
1579
1580         def report_download_webpage(self, video_id):
1581                 """Report webpage download."""
1582                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1583
1584         def report_extraction(self, video_id):
1585                 """Report information extraction."""
1586                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1587
1588         def _real_extract(self, url):
1589                 # Extract id and simplified title from URL
1590                 mobj = re.match(self._VALID_URL, url)
1591                 if mobj is None:
1592                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1593                         return
1594
1595                 # At this point we have a new video
1596                 self._downloader.increment_downloads()
1597                 video_id = mobj.group(1)
1598
1599                 simple_title = mobj.group(2).decode('utf-8')
1600                 video_extension = 'flv'
1601
1602                 # Retrieve video webpage to extract further information
1603                 request = urllib2.Request(url)
1604                 request.add_header('Cookie', 'family_filter=off')
1605                 try:
1606                         self.report_download_webpage(video_id)
1607                         webpage = urllib2.urlopen(request).read()
1608                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1609                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1610                         return
1611
1612                 # Extract URL, uploader and title from webpage
1613                 self.report_extraction(video_id)
1614                 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1615                 if mobj is None:
1616                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1617                         return
1618                 sequence = urllib.unquote(mobj.group(1))
1619                 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1620                 if mobj is None:
1621                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1622                         return
1623                 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1624
1625                 # if needed add http://www.dailymotion.com/ if relative URL
1626
1627                 video_url = mediaURL
1628
1629                 mobj = re.search(r'(?im)<title>\s*(.+)\s*-\s*Video\s+Dailymotion</title>', webpage)
1630                 if mobj is None:
1631                         self._downloader.trouble(u'ERROR: unable to extract title')
1632                         return
1633                 video_title = mobj.group(1).decode('utf-8')
1634                 video_title = sanitize_title(video_title)
1635
1636                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1637                 if mobj is None:
1638                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1639                         return
1640                 video_uploader = mobj.group(1)
1641
1642                 try:
1643                         # Process video information
1644                         self._downloader.process_info({
1645                                 'id':           video_id.decode('utf-8'),
1646                                 'url':          video_url.decode('utf-8'),
1647                                 'uploader':     video_uploader.decode('utf-8'),
1648                                 'upload_date':  u'NA',
1649                                 'title':        video_title,
1650                                 'stitle':       simple_title,
1651                                 'ext':          video_extension.decode('utf-8'),
1652                                 'format':       u'NA',
1653                                 'player_url':   None,
1654                         })
1655                 except UnavailableVideoError:
1656                         self._downloader.trouble(u'\nERROR: unable to download video')
1657
1658
class GoogleIE(InfoExtractor):
	"""Information extractor for video.google.com."""

	_VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
	IE_NAME = u'video.google'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

	def _real_extract(self, url):
		"""Extract the media URL, title and description from a Google Video page."""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		video_extension = 'mp4'

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader, and title from webpage. Prefer the mp4
		# download_url; fall back to the hex-escaped flv videoUrl.
		self.report_extraction(video_id)
		mobj = re.search(r"download_url:'([^']+)'", webpage)
		if mobj is None:
			video_extension = 'flv'
			mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1))
		# Undo the JS hex escapes for '=' and '&' left in the URL.
		mediaURL = mediaURL.replace('\\x3d', '\x3d')
		mediaURL = mediaURL.replace('\\x26', '\x26')

		video_url = mediaURL

		mobj = re.search(r'<title>(.*)</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		simple_title = _simplify_title(video_title)

		# Extract video description
		mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video description')
			return
		video_description = mobj.group(1).decode('utf-8')
		if not video_description:
			video_description = 'No description available.'

		# Extract video thumbnail. NOTE: this deliberately overwrites
		# 'webpage' with a search-results page to scrape the thumbnail from.
		if self._downloader.params.get('forcethumbnail', False):
			request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
			try:
				webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
				return
			mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
				return
			video_thumbnail = mobj.group(1)
		else:	# we need something to pass to process_info
			video_thumbnail = ''

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	u'NA',
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1761
1762
1763 class PhotobucketIE(InfoExtractor):
1764         """Information extractor for photobucket.com."""
1765
1766         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1767         IE_NAME = u'photobucket'
1768
1769         def __init__(self, downloader=None):
1770                 InfoExtractor.__init__(self, downloader)
1771
1772         def report_download_webpage(self, video_id):
1773                 """Report webpage download."""
1774                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1775
1776         def report_extraction(self, video_id):
1777                 """Report information extraction."""
1778                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1779
1780         def _real_extract(self, url):
1781                 # Extract id from URL
1782                 mobj = re.match(self._VALID_URL, url)
1783                 if mobj is None:
1784                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1785                         return
1786
1787                 # At this point we have a new video
1788                 self._downloader.increment_downloads()
1789                 video_id = mobj.group(1)
1790
1791                 video_extension = 'flv'
1792
1793                 # Retrieve video webpage to extract further information
1794                 request = urllib2.Request(url)
1795                 try:
1796                         self.report_download_webpage(video_id)
1797                         webpage = urllib2.urlopen(request).read()
1798                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1799                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1800                         return
1801
1802                 # Extract URL, uploader, and title from webpage
1803                 self.report_extraction(video_id)
1804                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1805                 if mobj is None:
1806                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1807                         return
1808                 mediaURL = urllib.unquote(mobj.group(1))
1809
1810                 video_url = mediaURL
1811
1812                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1813                 if mobj is None:
1814                         self._downloader.trouble(u'ERROR: unable to extract title')
1815                         return
1816                 video_title = mobj.group(1).decode('utf-8')
1817                 video_title = sanitize_title(video_title)
1818                 simple_title = _simplify_title(vide_title)
1819
1820                 video_uploader = mobj.group(2).decode('utf-8')
1821
1822                 try:
1823                         # Process video information
1824                         self._downloader.process_info({
1825                                 'id':           video_id.decode('utf-8'),
1826                                 'url':          video_url.decode('utf-8'),
1827                                 'uploader':     video_uploader,
1828                                 'upload_date':  u'NA',
1829                                 'title':        video_title,
1830                                 'stitle':       simple_title,
1831                                 'ext':          video_extension.decode('utf-8'),
1832                                 'format':       u'NA',
1833                                 'player_url':   None,
1834                         })
1835                 except UnavailableVideoError:
1836                         self._downloader.trouble(u'\nERROR: unable to download video')
1837
1838
1839 class YahooIE(InfoExtractor):
1840         """Information extractor for video.yahoo.com."""
1841
1842         # _VALID_URL matches all Yahoo! Video URLs
1843         # _VPAGE_URL matches only the extractable '/watch/' URLs
1844         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1845         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1846         IE_NAME = u'video.yahoo'
1847
1848         def __init__(self, downloader=None):
1849                 InfoExtractor.__init__(self, downloader)
1850
1851         def report_download_webpage(self, video_id):
1852                 """Report webpage download."""
1853                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1854
1855         def report_extraction(self, video_id):
1856                 """Report information extraction."""
1857                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1858
1859         def _real_extract(self, url, new_video=True):
1860                 # Extract ID from URL
1861                 mobj = re.match(self._VALID_URL, url)
1862                 if mobj is None:
1863                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1864                         return
1865
1866                 # At this point we have a new video
1867                 self._downloader.increment_downloads()
1868                 video_id = mobj.group(2)
1869                 video_extension = 'flv'
1870
1871                 # Rewrite valid but non-extractable URLs as
1872                 # extractable English language /watch/ URLs
1873                 if re.match(self._VPAGE_URL, url) is None:
1874                         request = urllib2.Request(url)
1875                         try:
1876                                 webpage = urllib2.urlopen(request).read()
1877                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1878                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1879                                 return
1880
1881                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1882                         if mobj is None:
1883                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1884                                 return
1885                         yahoo_id = mobj.group(1)
1886
1887                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1888                         if mobj is None:
1889                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1890                                 return
1891                         yahoo_vid = mobj.group(1)
1892
1893                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1894                         return self._real_extract(url, new_video=False)
1895
1896                 # Retrieve video webpage to extract further information
1897                 request = urllib2.Request(url)
1898                 try:
1899                         self.report_download_webpage(video_id)
1900                         webpage = urllib2.urlopen(request).read()
1901                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1902                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1903                         return
1904
1905                 # Extract uploader and title from webpage
1906                 self.report_extraction(video_id)
1907                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1908                 if mobj is None:
1909                         self._downloader.trouble(u'ERROR: unable to extract video title')
1910                         return
1911                 video_title = mobj.group(1).decode('utf-8')
1912                 simple_title = _simplify_title(video_title)
1913
1914                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1915                 if mobj is None:
1916                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1917                         return
1918                 video_uploader = mobj.group(1).decode('utf-8')
1919
1920                 # Extract video thumbnail
1921                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1922                 if mobj is None:
1923                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1924                         return
1925                 video_thumbnail = mobj.group(1).decode('utf-8')
1926
1927                 # Extract video description
1928                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1929                 if mobj is None:
1930                         self._downloader.trouble(u'ERROR: unable to extract video description')
1931                         return
1932                 video_description = mobj.group(1).decode('utf-8')
1933                 if not video_description:
1934                         video_description = 'No description available.'
1935
1936                 # Extract video height and width
1937                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1938                 if mobj is None:
1939                         self._downloader.trouble(u'ERROR: unable to extract video height')
1940                         return
1941                 yv_video_height = mobj.group(1)
1942
1943                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1944                 if mobj is None:
1945                         self._downloader.trouble(u'ERROR: unable to extract video width')
1946                         return
1947                 yv_video_width = mobj.group(1)
1948
1949                 # Retrieve video playlist to extract media URL
1950                 # I'm not completely sure what all these options are, but we
1951                 # seem to need most of them, otherwise the server sends a 401.
1952                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1953                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1954                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1955                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1956                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1957                 try:
1958                         self.report_download_webpage(video_id)
1959                         webpage = urllib2.urlopen(request).read()
1960                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1961                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1962                         return
1963
1964                 # Extract media URL from playlist XML
1965                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1966                 if mobj is None:
1967                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1968                         return
1969                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1970                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1971
1972                 try:
1973                         # Process video information
1974                         self._downloader.process_info({
1975                                 'id':           video_id.decode('utf-8'),
1976                                 'url':          video_url,
1977                                 'uploader':     video_uploader,
1978                                 'upload_date':  u'NA',
1979                                 'title':        video_title,
1980                                 'stitle':       simple_title,
1981                                 'ext':          video_extension.decode('utf-8'),
1982                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1983                                 'description':  video_description,
1984                                 'thumbnail':    video_thumbnail,
1985                                 'player_url':   None,
1986                         })
1987                 except UnavailableVideoError:
1988                         self._downloader.trouble(u'\nERROR: unable to download video')
1989
1990
1991 class VimeoIE(InfoExtractor):
1992         """Information extractor for vimeo.com."""
1993
1994         # _VALID_URL matches Vimeo URLs
1995         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1996         IE_NAME = u'vimeo'
1997
1998         def __init__(self, downloader=None):
1999                 InfoExtractor.__init__(self, downloader)
2000
2001         def report_download_webpage(self, video_id):
2002                 """Report webpage download."""
2003                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2004
2005         def report_extraction(self, video_id):
2006                 """Report information extraction."""
2007                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
2008
2009         def _real_extract(self, url, new_video=True):
2010                 # Extract ID from URL
2011                 mobj = re.match(self._VALID_URL, url)
2012                 if mobj is None:
2013                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2014                         return
2015
2016                 # At this point we have a new video
2017                 self._downloader.increment_downloads()
2018                 video_id = mobj.group(1)
2019
2020                 # Retrieve video webpage to extract further information
2021                 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
2022                 try:
2023                         self.report_download_webpage(video_id)
2024                         webpage = urllib2.urlopen(request).read()
2025                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2026                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2027                         return
2028
2029                 # Now we begin extracting as much information as we can from what we
2030                 # retrieved. First we extract the information common to all extractors,
2031                 # and latter we extract those that are Vimeo specific.
2032                 self.report_extraction(video_id)
2033
2034                 # Extract title
2035                 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2036                 if mobj is None:
2037                         self._downloader.trouble(u'ERROR: unable to extract video title')
2038                         return
2039                 video_title = mobj.group(1).decode('utf-8')
2040                 simple_title = _simplify_title(video_title)
2041
2042                 # Extract uploader
2043                 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2044                 if mobj is None:
2045                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
2046                         return
2047                 video_uploader = mobj.group(1).decode('utf-8')
2048
2049                 # Extract video thumbnail
2050                 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2051                 if mobj is None:
2052                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2053                         return
2054                 video_thumbnail = mobj.group(1).decode('utf-8')
2055
2056                 # # Extract video description
2057                 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2058                 # if mobj is None:
2059                 #       self._downloader.trouble(u'ERROR: unable to extract video description')
2060                 #       return
2061                 # video_description = mobj.group(1).decode('utf-8')
2062                 # if not video_description: video_description = 'No description available.'
2063                 video_description = 'Foo.'
2064
2065                 # Vimeo specific: extract request signature
2066                 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2067                 if mobj is None:
2068                         self._downloader.trouble(u'ERROR: unable to extract request signature')
2069                         return
2070                 sig = mobj.group(1).decode('utf-8')
2071
2072                 # Vimeo specific: extract video quality information
2073                 mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)
2074                 if mobj is None:
2075                         self._downloader.trouble(u'ERROR: unable to extract video quality information')
2076                         return
2077                 quality = mobj.group(1).decode('utf-8')
2078
2079                 if int(quality) == 1:
2080                         quality = 'hd'
2081                 else:
2082                         quality = 'sd'
2083
2084                 # Vimeo specific: Extract request signature expiration
2085                 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2086                 if mobj is None:
2087                         self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2088                         return
2089                 sig_exp = mobj.group(1).decode('utf-8')
2090
2091                 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)
2092
2093                 try:
2094                         # Process video information
2095                         self._downloader.process_info({
2096                                 'id':           video_id.decode('utf-8'),
2097                                 'url':          video_url,
2098                                 'uploader':     video_uploader,
2099                                 'upload_date':  u'NA',
2100                                 'title':        video_title,
2101                                 'stitle':       simple_title,
2102                                 'ext':          u'mp4',
2103                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2104                                 'description':  video_description,
2105                                 'thumbnail':    video_thumbnail,
2106                                 'description':  video_description,
2107                                 'player_url':   None,
2108                         })
2109                 except UnavailableVideoError:
2110                         self._downloader.trouble(u'ERROR: unable to download video')
2111
2112
2113 class GenericIE(InfoExtractor):
2114         """Generic last-resort information extractor."""
2115
2116         _VALID_URL = r'.*'
2117         IE_NAME = u'generic'
2118
2119         def __init__(self, downloader=None):
2120                 InfoExtractor.__init__(self, downloader)
2121
2122         def report_download_webpage(self, video_id):
2123                 """Report webpage download."""
2124                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2125                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2126
2127         def report_extraction(self, video_id):
2128                 """Report information extraction."""
2129                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2130
2131         def _real_extract(self, url):
2132                 # At this point we have a new video
2133                 self._downloader.increment_downloads()
2134
2135                 video_id = url.split('/')[-1]
2136                 request = urllib2.Request(url)
2137                 try:
2138                         self.report_download_webpage(video_id)
2139                         webpage = urllib2.urlopen(request).read()
2140                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2141                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2142                         return
2143                 except ValueError, err:
2144                         # since this is the last-resort InfoExtractor, if
2145                         # this error is thrown, it'll be thrown here
2146                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2147                         return
2148
2149                 self.report_extraction(video_id)
2150                 # Start with something easy: JW Player in SWFObject
2151                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2152                 if mobj is None:
2153                         # Broaden the search a little bit
2154                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2155                 if mobj is None:
2156                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2157                         return
2158
2159                 # It's possible that one of the regexes
2160                 # matched, but returned an empty group:
2161                 if mobj.group(1) is None:
2162                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2163                         return
2164
2165                 video_url = urllib.unquote(mobj.group(1))
2166                 video_id = os.path.basename(video_url)
2167
2168                 # here's a fun little line of code for you:
2169                 video_extension = os.path.splitext(video_id)[1][1:]
2170                 video_id = os.path.splitext(video_id)[0]
2171
2172                 # it's tempting to parse this further, but you would
2173                 # have to take into account all the variations like
2174                 #   Video Title - Site Name
2175                 #   Site Name | Video Title
2176                 #   Video Title - Tagline | Site Name
2177                 # and so on and so forth; it's just not practical
2178                 mobj = re.search(r'<title>(.*)</title>', webpage)
2179                 if mobj is None:
2180                         self._downloader.trouble(u'ERROR: unable to extract title')
2181                         return
2182                 video_title = mobj.group(1).decode('utf-8')
2183                 video_title = sanitize_title(video_title)
2184                 simple_title = _simplify_title(video_title)
2185
2186                 # video uploader is domain name
2187                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2188                 if mobj is None:
2189                         self._downloader.trouble(u'ERROR: unable to extract title')
2190                         return
2191                 video_uploader = mobj.group(1).decode('utf-8')
2192
2193                 try:
2194                         # Process video information
2195                         self._downloader.process_info({
2196                                 'id':           video_id.decode('utf-8'),
2197                                 'url':          video_url.decode('utf-8'),
2198                                 'uploader':     video_uploader,
2199                                 'upload_date':  u'NA',
2200                                 'title':        video_title,
2201                                 'stitle':       simple_title,
2202                                 'ext':          video_extension.decode('utf-8'),
2203                                 'format':       u'NA',
2204                                 'player_url':   None,
2205                         })
2206                 except UnavailableVideoError, err:
2207                         self._downloader.trouble(u'\nERROR: unable to download video')
2208
2209
2210 class YoutubeSearchIE(InfoExtractor):
2211         """Information Extractor for YouTube search queries."""
2212         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2213         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2214         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2215         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2216         _youtube_ie = None
2217         _max_youtube_results = 1000
2218         IE_NAME = u'youtube:search'
2219
2220         def __init__(self, youtube_ie, downloader=None):
2221                 InfoExtractor.__init__(self, downloader)
2222                 self._youtube_ie = youtube_ie
2223
2224         def report_download_page(self, query, pagenum):
2225                 """Report attempt to download playlist page with given number."""
2226                 query = query.decode(preferredencoding())
2227                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2228
2229         def _real_initialize(self):
2230                 self._youtube_ie.initialize()
2231
2232         def _real_extract(self, query):
2233                 mobj = re.match(self._VALID_URL, query)
2234                 if mobj is None:
2235                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2236                         return
2237
2238                 prefix, query = query.split(':')
2239                 prefix = prefix[8:]
2240                 query = query.encode('utf-8')
2241                 if prefix == '':
2242                         self._download_n_results(query, 1)
2243                         return
2244                 elif prefix == 'all':
2245                         self._download_n_results(query, self._max_youtube_results)
2246                         return
2247                 else:
2248                         try:
2249                                 n = long(prefix)
2250                                 if n <= 0:
2251                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2252                                         return
2253                                 elif n > self._max_youtube_results:
2254                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2255                                         n = self._max_youtube_results
2256                                 self._download_n_results(query, n)
2257                                 return
2258                         except ValueError: # parsing prefix as integer fails
2259                                 self._download_n_results(query, 1)
2260                                 return
2261
2262         def _download_n_results(self, query, n):
2263                 """Downloads a specified number of results for a query"""
2264
2265                 video_ids = []
2266                 already_seen = set()
2267                 pagenum = 1
2268
2269                 while True:
2270                         self.report_download_page(query, pagenum)
2271                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2272                         request = urllib2.Request(result_url)
2273                         try:
2274                                 page = urllib2.urlopen(request).read()
2275                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2276                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2277                                 return
2278
2279                         # Extract video identifiers
2280                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2281                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2282                                 if video_id not in already_seen:
2283                                         video_ids.append(video_id)
2284                                         already_seen.add(video_id)
2285                                         if len(video_ids) == n:
2286                                                 # Specified n videos reached
2287                                                 for id in video_ids:
2288                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2289                                                 return
2290
2291                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2292                                 for id in video_ids:
2293                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2294                                 return
2295
2296                         pagenum = pagenum + 1
2297
2298
2299 class GoogleSearchIE(InfoExtractor):
2300         """Information Extractor for Google Video search queries."""
2301         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2302         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2303         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2304         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2305         _google_ie = None
2306         _max_google_results = 1000
2307         IE_NAME = u'video.google:search'
2308
2309         def __init__(self, google_ie, downloader=None):
2310                 InfoExtractor.__init__(self, downloader)
2311                 self._google_ie = google_ie
2312
2313         def report_download_page(self, query, pagenum):
2314                 """Report attempt to download playlist page with given number."""
2315                 query = query.decode(preferredencoding())
2316                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2317
2318         def _real_initialize(self):
2319                 self._google_ie.initialize()
2320
2321         def _real_extract(self, query):
2322                 mobj = re.match(self._VALID_URL, query)
2323                 if mobj is None:
2324                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2325                         return
2326
2327                 prefix, query = query.split(':')
2328                 prefix = prefix[8:]
2329                 query = query.encode('utf-8')
2330                 if prefix == '':
2331                         self._download_n_results(query, 1)
2332                         return
2333                 elif prefix == 'all':
2334                         self._download_n_results(query, self._max_google_results)
2335                         return
2336                 else:
2337                         try:
2338                                 n = long(prefix)
2339                                 if n <= 0:
2340                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2341                                         return
2342                                 elif n > self._max_google_results:
2343                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2344                                         n = self._max_google_results
2345                                 self._download_n_results(query, n)
2346                                 return
2347                         except ValueError: # parsing prefix as integer fails
2348                                 self._download_n_results(query, 1)
2349                                 return
2350
2351         def _download_n_results(self, query, n):
2352                 """Downloads a specified number of results for a query"""
2353
2354                 video_ids = []
2355                 already_seen = set()
2356                 pagenum = 1
2357
2358                 while True:
2359                         self.report_download_page(query, pagenum)
2360                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2361                         request = urllib2.Request(result_url)
2362                         try:
2363                                 page = urllib2.urlopen(request).read()
2364                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2365                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2366                                 return
2367
2368                         # Extract video identifiers
2369                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2370                                 video_id = mobj.group(1)
2371                                 if video_id not in already_seen:
2372                                         video_ids.append(video_id)
2373                                         already_seen.add(video_id)
2374                                         if len(video_ids) == n:
2375                                                 # Specified n videos reached
2376                                                 for id in video_ids:
2377                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2378                                                 return
2379
2380                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2381                                 for id in video_ids:
2382                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2383                                 return
2384
2385                         pagenum = pagenum + 1
2386
2387
2388 class YahooSearchIE(InfoExtractor):
2389         """Information Extractor for Yahoo! Video search queries."""
2390         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2391         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2392         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2393         _MORE_PAGES_INDICATOR = r'\s*Next'
2394         _yahoo_ie = None
2395         _max_yahoo_results = 1000
2396         IE_NAME = u'video.yahoo:search'
2397
2398         def __init__(self, yahoo_ie, downloader=None):
2399                 InfoExtractor.__init__(self, downloader)
2400                 self._yahoo_ie = yahoo_ie
2401
2402         def report_download_page(self, query, pagenum):
2403                 """Report attempt to download playlist page with given number."""
2404                 query = query.decode(preferredencoding())
2405                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2406
2407         def _real_initialize(self):
2408                 self._yahoo_ie.initialize()
2409
2410         def _real_extract(self, query):
2411                 mobj = re.match(self._VALID_URL, query)
2412                 if mobj is None:
2413                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2414                         return
2415
2416                 prefix, query = query.split(':')
2417                 prefix = prefix[8:]
2418                 query = query.encode('utf-8')
2419                 if prefix == '':
2420                         self._download_n_results(query, 1)
2421                         return
2422                 elif prefix == 'all':
2423                         self._download_n_results(query, self._max_yahoo_results)
2424                         return
2425                 else:
2426                         try:
2427                                 n = long(prefix)
2428                                 if n <= 0:
2429                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2430                                         return
2431                                 elif n > self._max_yahoo_results:
2432                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2433                                         n = self._max_yahoo_results
2434                                 self._download_n_results(query, n)
2435                                 return
2436                         except ValueError: # parsing prefix as integer fails
2437                                 self._download_n_results(query, 1)
2438                                 return
2439
2440         def _download_n_results(self, query, n):
2441                 """Downloads a specified number of results for a query"""
2442
2443                 video_ids = []
2444                 already_seen = set()
2445                 pagenum = 1
2446
2447                 while True:
2448                         self.report_download_page(query, pagenum)
2449                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2450                         request = urllib2.Request(result_url)
2451                         try:
2452                                 page = urllib2.urlopen(request).read()
2453                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2454                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2455                                 return
2456
2457                         # Extract video identifiers
2458                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2459                                 video_id = mobj.group(1)
2460                                 if video_id not in already_seen:
2461                                         video_ids.append(video_id)
2462                                         already_seen.add(video_id)
2463                                         if len(video_ids) == n:
2464                                                 # Specified n videos reached
2465                                                 for id in video_ids:
2466                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2467                                                 return
2468
2469                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2470                                 for id in video_ids:
2471                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2472                                 return
2473
2474                         pagenum = pagenum + 1
2475
2476
2477 class YoutubePlaylistIE(InfoExtractor):
2478         """Information Extractor for YouTube playlists."""
2479
2480         _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2481         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2482         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2483         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2484         _youtube_ie = None
2485         IE_NAME = u'youtube:playlist'
2486
2487         def __init__(self, youtube_ie, downloader=None):
2488                 InfoExtractor.__init__(self, downloader)
2489                 self._youtube_ie = youtube_ie
2490
2491         def report_download_page(self, playlist_id, pagenum):
2492                 """Report attempt to download playlist page with given number."""
2493                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2494
2495         def _real_initialize(self):
2496                 self._youtube_ie.initialize()
2497
2498         def _real_extract(self, url):
2499                 # Extract playlist id
2500                 mobj = re.match(self._VALID_URL, url)
2501                 if mobj is None:
2502                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2503                         return
2504
2505                 # Single video case
2506                 if mobj.group(3) is not None:
2507                         self._youtube_ie.extract(mobj.group(3))
2508                         return
2509
2510                 # Download playlist pages
2511                 # prefix is 'p' as default for playlists but there are other types that need extra care
2512                 playlist_prefix = mobj.group(1)
2513                 if playlist_prefix == 'a':
2514                         playlist_access = 'artist'
2515                 else:
2516                         playlist_prefix = 'p'
2517                         playlist_access = 'view_play_list'
2518                 playlist_id = mobj.group(2)
2519                 video_ids = []
2520                 pagenum = 1
2521
2522                 while True:
2523                         self.report_download_page(playlist_id, pagenum)
2524                         url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2525                         request = urllib2.Request(url)
2526                         try:
2527                                 page = urllib2.urlopen(request).read()
2528                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2529                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2530                                 return
2531
2532                         # Extract video identifiers
2533                         ids_in_page = []
2534                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2535                                 if mobj.group(1) not in ids_in_page:
2536                                         ids_in_page.append(mobj.group(1))
2537                         video_ids.extend(ids_in_page)
2538
2539                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2540                                 break
2541                         pagenum = pagenum + 1
2542
2543                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2544                 playlistend = self._downloader.params.get('playlistend', -1)
2545                 video_ids = video_ids[playliststart:playlistend]
2546
2547                 for id in video_ids:
2548                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2549                 return
2550
2551
2552 class YoutubeUserIE(InfoExtractor):
2553         """Information Extractor for YouTube users."""
2554
2555         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2556         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2557         _GDATA_PAGE_SIZE = 50
2558         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2559         _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2560         _youtube_ie = None
2561         IE_NAME = u'youtube:user'
2562
2563         def __init__(self, youtube_ie, downloader=None):
2564                 InfoExtractor.__init__(self, downloader)
2565                 self._youtube_ie = youtube_ie
2566
2567         def report_download_page(self, username, start_index):
2568                 """Report attempt to download user page."""
2569                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2570                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2571
2572         def _real_initialize(self):
2573                 self._youtube_ie.initialize()
2574
2575         def _real_extract(self, url):
2576                 # Extract username
2577                 mobj = re.match(self._VALID_URL, url)
2578                 if mobj is None:
2579                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2580                         return
2581
2582                 username = mobj.group(1)
2583
2584                 # Download video ids using YouTube Data API. Result size per
2585                 # query is limited (currently to 50 videos) so we need to query
2586                 # page by page until there are no video ids - it means we got
2587                 # all of them.
2588
2589                 video_ids = []
2590                 pagenum = 0
2591
2592                 while True:
2593                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2594                         self.report_download_page(username, start_index)
2595
2596                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2597
2598                         try:
2599                                 page = urllib2.urlopen(request).read()
2600                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2601                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2602                                 return
2603
2604                         # Extract video identifiers
2605                         ids_in_page = []
2606
2607                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2608                                 if mobj.group(1) not in ids_in_page:
2609                                         ids_in_page.append(mobj.group(1))
2610
2611                         video_ids.extend(ids_in_page)
2612
2613                         # A little optimization - if current page is not
2614                         # "full", ie. does not contain PAGE_SIZE video ids then
2615                         # we can assume that this page is the last one - there
2616                         # are no more ids on further pages - no need to query
2617                         # again.
2618
2619                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2620                                 break
2621
2622                         pagenum += 1
2623
2624                 all_ids_count = len(video_ids)
2625                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2626                 playlistend = self._downloader.params.get('playlistend', -1)
2627
2628                 if playlistend == -1:
2629                         video_ids = video_ids[playliststart:]
2630                 else:
2631                         video_ids = video_ids[playliststart:playlistend]
2632
2633                 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2634                                 (username, all_ids_count, len(video_ids)))
2635
2636                 for video_id in video_ids:
2637                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2638
2639
2640 class DepositFilesIE(InfoExtractor):
2641         """Information extractor for depositfiles.com"""
2642
2643         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2644         IE_NAME = u'DepositFiles'
2645
2646         def __init__(self, downloader=None):
2647                 InfoExtractor.__init__(self, downloader)
2648
2649         def report_download_webpage(self, file_id):
2650                 """Report webpage download."""
2651                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2652
2653         def report_extraction(self, file_id):
2654                 """Report information extraction."""
2655                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2656
2657         def _real_extract(self, url):
2658                 # At this point we have a new file
2659                 self._downloader.increment_downloads()
2660
2661                 file_id = url.split('/')[-1]
2662                 # Rebuild url in english locale
2663                 url = 'http://depositfiles.com/en/files/' + file_id
2664
2665                 # Retrieve file webpage with 'Free download' button pressed
2666                 free_download_indication = { 'gateway_result' : '1' }
2667                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2668                 try:
2669                         self.report_download_webpage(file_id)
2670                         webpage = urllib2.urlopen(request).read()
2671                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2672                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2673                         return
2674
2675                 # Search for the real file URL
2676                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2677                 if (mobj is None) or (mobj.group(1) is None):
2678                         # Try to figure out reason of the error.
2679                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2680                         if (mobj is not None) and (mobj.group(1) is not None):
2681                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2682                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2683                         else:
2684                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2685                         return
2686
2687                 file_url = mobj.group(1)
2688                 file_extension = os.path.splitext(file_url)[1][1:]
2689
2690                 # Search for file title
2691                 mobj = re.search(r'<b title="(.*?)">', webpage)
2692                 if mobj is None:
2693                         self._downloader.trouble(u'ERROR: unable to extract title')
2694                         return
2695                 file_title = mobj.group(1).decode('utf-8')
2696
2697                 try:
2698                         # Process file information
2699                         self._downloader.process_info({
2700                                 'id':           file_id.decode('utf-8'),
2701                                 'url':          file_url.decode('utf-8'),
2702                                 'uploader':     u'NA',
2703                                 'upload_date':  u'NA',
2704                                 'title':        file_title,
2705                                 'stitle':       file_title,
2706                                 'ext':          file_extension.decode('utf-8'),
2707                                 'format':       u'NA',
2708                                 'player_url':   None,
2709                         })
2710                 except UnavailableVideoError, err:
2711                         self._downloader.trouble(u'ERROR: unable to download file')
2712
2713
2714 class FacebookIE(InfoExtractor):
2715         """Information Extractor for Facebook"""
2716
2717         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2718         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2719         _NETRC_MACHINE = 'facebook'
2720         _available_formats = ['video', 'highqual', 'lowqual']
2721         _video_extensions = {
2722                 'video': 'mp4',
2723                 'highqual': 'mp4',
2724                 'lowqual': 'mp4',
2725         }
2726         IE_NAME = u'facebook'
2727
2728         def __init__(self, downloader=None):
2729                 InfoExtractor.__init__(self, downloader)
2730
2731         def _reporter(self, message):
2732                 """Add header and report message."""
2733                 self._downloader.to_screen(u'[facebook] %s' % message)
2734
2735         def report_login(self):
2736                 """Report attempt to log in."""
2737                 self._reporter(u'Logging in')
2738
2739         def report_video_webpage_download(self, video_id):
2740                 """Report attempt to download video webpage."""
2741                 self._reporter(u'%s: Downloading video webpage' % video_id)
2742
2743         def report_information_extraction(self, video_id):
2744                 """Report attempt to extract video information."""
2745                 self._reporter(u'%s: Extracting video information' % video_id)
2746
2747         def _parse_page(self, video_webpage):
2748                 """Extract video information from page"""
2749                 # General data
2750                 data = {'title': r'\("video_title", "(.*?)"\)',
2751                         'description': r'<div class="datawrap">(.*?)</div>',
2752                         'owner': r'\("video_owner_name", "(.*?)"\)',
2753                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2754                         }
2755                 video_info = {}
2756                 for piece in data.keys():
2757                         mobj = re.search(data[piece], video_webpage)
2758                         if mobj is not None:
2759                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2760
2761                 # Video urls
2762                 video_urls = {}
2763                 for fmt in self._available_formats:
2764                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2765                         if mobj is not None:
2766                                 # URL is in a Javascript segment inside an escaped Unicode format within
2767                                 # the generally utf-8 page
2768                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2769                 video_info['video_urls'] = video_urls
2770
2771                 return video_info
2772
2773         def _real_initialize(self):
2774                 if self._downloader is None:
2775                         return
2776
2777                 useremail = None
2778                 password = None
2779                 downloader_params = self._downloader.params
2780
2781                 # Attempt to use provided username and password or .netrc data
2782                 if downloader_params.get('username', None) is not None:
2783                         useremail = downloader_params['username']
2784                         password = downloader_params['password']
2785                 elif downloader_params.get('usenetrc', False):
2786                         try:
2787                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2788                                 if info is not None:
2789                                         useremail = info[0]
2790                                         password = info[2]
2791                                 else:
2792                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2793                         except (IOError, netrc.NetrcParseError), err:
2794                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2795                                 return
2796
2797                 if useremail is None:
2798                         return
2799
2800                 # Log in
2801                 login_form = {
2802                         'email': useremail,
2803                         'pass': password,
2804                         'login': 'Log+In'
2805                         }
2806                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2807                 try:
2808                         self.report_login()
2809                         login_results = urllib2.urlopen(request).read()
2810                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2811                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2812                                 return
2813                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2814                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2815                         return
2816
2817         def _real_extract(self, url):
2818                 mobj = re.match(self._VALID_URL, url)
2819                 if mobj is None:
2820                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2821                         return
2822                 video_id = mobj.group('ID')
2823
2824                 # Get video webpage
2825                 self.report_video_webpage_download(video_id)
2826                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2827                 try:
2828                         page = urllib2.urlopen(request)
2829                         video_webpage = page.read()
2830                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2831                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2832                         return
2833
2834                 # Start extracting information
2835                 self.report_information_extraction(video_id)
2836
2837                 # Extract information
2838                 video_info = self._parse_page(video_webpage)
2839
2840                 # uploader
2841                 if 'owner' not in video_info:
2842                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2843                         return
2844                 video_uploader = video_info['owner']
2845
2846                 # title
2847                 if 'title' not in video_info:
2848                         self._downloader.trouble(u'ERROR: unable to extract video title')
2849                         return
2850                 video_title = video_info['title']
2851                 video_title = video_title.decode('utf-8')
2852                 video_title = sanitize_title(video_title)
2853
2854                 simple_title = _simplify_title(video_title)
2855
2856                 # thumbnail image
2857                 if 'thumbnail' not in video_info:
2858                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2859                         video_thumbnail = ''
2860                 else:
2861                         video_thumbnail = video_info['thumbnail']
2862
2863                 # upload date
2864                 upload_date = u'NA'
2865                 if 'upload_date' in video_info:
2866                         upload_time = video_info['upload_date']
2867                         timetuple = email.utils.parsedate_tz(upload_time)
2868                         if timetuple is not None:
2869                                 try:
2870                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2871                                 except:
2872                                         pass
2873
2874                 # description
2875                 video_description = video_info.get('description', 'No description available.')
2876
2877                 url_map = video_info['video_urls']
2878                 if len(url_map.keys()) > 0:
2879                         # Decide which formats to download
2880                         req_format = self._downloader.params.get('format', None)
2881                         format_limit = self._downloader.params.get('format_limit', None)
2882
2883                         if format_limit is not None and format_limit in self._available_formats:
2884                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2885                         else:
2886                                 format_list = self._available_formats
2887                         existing_formats = [x for x in format_list if x in url_map]
2888                         if len(existing_formats) == 0:
2889                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2890                                 return
2891                         if req_format is None:
2892                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2893                         elif req_format == 'worst':
2894                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2895                         elif req_format == '-1':
2896                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2897                         else:
2898                                 # Specific format
2899                                 if req_format not in url_map:
2900                                         self._downloader.trouble(u'ERROR: requested format not available')
2901                                         return
2902                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2903
2904                 for format_param, video_real_url in video_url_list:
2905
2906                         # At this point we have a new video
2907                         self._downloader.increment_downloads()
2908
2909                         # Extension
2910                         video_extension = self._video_extensions.get(format_param, 'mp4')
2911
2912                         try:
2913                                 # Process video information
2914                                 self._downloader.process_info({
2915                                         'id':           video_id.decode('utf-8'),
2916                                         'url':          video_real_url.decode('utf-8'),
2917                                         'uploader':     video_uploader.decode('utf-8'),
2918                                         'upload_date':  upload_date,
2919                                         'title':        video_title,
2920                                         'stitle':       simple_title,
2921                                         'ext':          video_extension.decode('utf-8'),
2922                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2923                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2924                                         'description':  video_description.decode('utf-8'),
2925                                         'player_url':   None,
2926                                 })
2927                         except UnavailableVideoError, err:
2928                                 self._downloader.trouble(u'\nERROR: unable to download video')
2929
2930 class BlipTVIE(InfoExtractor):
2931         """Information extractor for blip.tv"""
2932
2933         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2934         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2935         IE_NAME = u'blip.tv'
2936
2937         def report_extraction(self, file_id):
2938                 """Report information extraction."""
2939                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2940
2941         def report_direct_download(self, title):
2942                 """Report information extraction."""
2943                 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2944
	def _real_extract(self, url):
		"""Extract video information for *url*.

		Two cases are handled:
		  * the server answers with a video/* Content-Type: the URL is a
		    direct media link and metadata is derived from the filename;
		  * otherwise the site's JSON API (skin=json) response is parsed.
		All failures are reported through self._downloader.trouble().
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Append the JSON API parameters with '&' or '?' as appropriate.
		if '?' in url:
			cchar = '&'
		else:
			cchar = '?'
		json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
		request = urllib2.Request(json_url)
		self.report_extraction(mobj.group(1))
		info = None
		try:
			urlh = urllib2.urlopen(request)
			if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
				# Metadata comes from the URL's basename alone.
				basename = url.split('/')[-1]
				title,ext = os.path.splitext(basename)
				title = title.decode('UTF-8')
				ext = ext.replace('.', '')
				self.report_direct_download(title)
				info = {
					'id': title,
					'url': url,
					'title': title,
					'stitle': _simplify_title(title),
					'ext': ext,
					# Reuse the already-open handle for the download.
					'urlhandle': urlh
				}
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
			return
		if info is None: # Regular URL
			try:
				json_code = urlh.read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
				return

			try:
				json_data = json.loads(json_code)
				# Some API responses wrap the payload in a 'Post' object.
				if 'Post' in json_data:
					data = json_data['Post']
				else:
					data = json_data

				# NOTE(review): '%H' (24-hour) combined with '%p' (AM/PM)
				# is unusual, but only the date part survives strftime,
				# so the hour field does not affect the result.
				upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
				video_url = data['media']['url']
				umobj = re.match(self._URL_EXT, video_url)
				if umobj is None:
					raise ValueError('Can not determine filename extension')
				ext = umobj.group(1)

				info = {
					'id': data['item_id'],
					'url': video_url,
					'uploader': data['display_name'],
					'upload_date': upload_date,
					'title': data['title'],
					'stitle': _simplify_title(data['title']),
					'ext': ext,
					'format': data['media']['mimeType'],
					'thumbnail': data['thumbnailUrl'],
					'description': data['description'],
					'player_url': data['embedUrl']
				}
			except (ValueError,KeyError), err:
				# Missing keys and bad dates are both reported here.
				self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
				return

		self._downloader.increment_downloads()

		try:
			self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download video')
3022
3023
3024 class MyVideoIE(InfoExtractor):
3025         """Information Extractor for myvideo.de."""
3026
3027         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3028         IE_NAME = u'myvideo'
3029
3030         def __init__(self, downloader=None):
3031                 InfoExtractor.__init__(self, downloader)
3032         
3033         def report_download_webpage(self, video_id):
3034                 """Report webpage download."""
3035                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3036
3037         def report_extraction(self, video_id):
3038                 """Report information extraction."""
3039                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3040
3041         def _real_extract(self,url):
3042                 mobj = re.match(self._VALID_URL, url)
3043                 if mobj is None:
3044                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
3045                         return
3046
3047                 video_id = mobj.group(1)
3048
3049                 # Get video webpage
3050                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3051                 try:
3052                         self.report_download_webpage(video_id)
3053                         webpage = urllib2.urlopen(request).read()
3054                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3055                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3056                         return
3057
3058                 self.report_extraction(video_id)
3059                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3060                                  webpage)
3061                 if mobj is None:
3062                         self._downloader.trouble(u'ERROR: unable to extract media URL')
3063                         return
3064                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3065
3066                 mobj = re.search('<title>([^<]+)</title>', webpage)
3067                 if mobj is None:
3068                         self._downloader.trouble(u'ERROR: unable to extract title')
3069                         return
3070
3071                 video_title = mobj.group(1)
3072                 video_title = sanitize_title(video_title)
3073
3074                 simple_title = _simplify_title(video_title)
3075
3076                 try:
3077                         self._downloader.process_info({
3078                                 'id':           video_id,
3079                                 'url':          video_url,
3080                                 'uploader':     u'NA',
3081                                 'upload_date':  u'NA',
3082                                 'title':        video_title,
3083                                 'stitle':       simple_title,
3084                                 'ext':          u'flv',
3085                                 'format':       u'NA',
3086                                 'player_url':   None,
3087                         })
3088                 except UnavailableVideoError:
3089                         self._downloader.trouble(u'\nERROR: Unable to download video')
3090
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Accepts either a ":tds"/":colbert"-style shortcut or a full
	# full-episodes URL on thedailyshow.com / colbertnation.com.
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	def report_extraction(self, episode_id):
		"""Report the start of information extraction."""
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

	def report_config_download(self, episode_id):
		"""Report the download of a media configuration document."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		"""Report the download of the show's episode index feed."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		"""Report the resolution of the Flash player URL."""
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _real_extract(self, url):
		"""Resolve the episode page, then download every media segment it
		references via the MRSS index feed.

		Shortcut forms (":tds", ":colbert", ...) are expanded to the
		show's full-episodes page, which the site redirects to the
		newest episode.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Expand shortcut names to the corresponding full-episodes URL.
		if mobj.group('shortname'):
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = u'http://www.thedailyshow.com/full-episodes/'
			else:
				url = u'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# An empty 'episode' group means "download the newest episode".
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			return
		if dlNewest:
			# Re-parse the redirected URL to learn the actual episode.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		# The embedded Flash <param> carries both the player URL (group 0)
		# and the mtvnservices media URI (group 1).
		mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
			return

		# Follow the player URL's redirect to obtain the canonical player.
		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
			return

		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
			return

		# One <item> per media segment; each needs its own config lookup.
		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
						urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
				return

			# Collect every (bitrate, url) rendition offered for this item.
			cdoc = xml.etree.ElementTree.fromstring(configXml)
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			# For now, just pick the highest bitrate
			format,video_url = turls[-1]

			self._downloader.increment_downloads()

			effTitle = showId + u'-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'stitle': _simplify_title(effTitle),
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': playerUrl
			}

			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				# Keep going: remaining segments may still be available.
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
				continue
3225
3226
3227 class EscapistIE(InfoExtractor):
3228         """Information extractor for The Escapist """
3229
3230         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3231         IE_NAME = u'escapist'
3232
3233         def report_extraction(self, showName):
3234                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3235
3236         def report_config_download(self, showName):
3237                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3238
3239         def _real_extract(self, url):
3240                 htmlParser = HTMLParser.HTMLParser()
3241
3242                 mobj = re.match(self._VALID_URL, url)
3243                 if mobj is None:
3244                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3245                         return
3246                 showName = mobj.group('showname')
3247                 videoId = mobj.group('episode')
3248
3249                 self.report_extraction(showName)
3250                 try:
3251                         webPage = urllib2.urlopen(url).read()
3252                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3253                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3254                         return
3255
3256                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3257                 description = htmlParser.unescape(descMatch.group(1))
3258                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3259                 imgUrl = htmlParser.unescape(imgMatch.group(1))
3260                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3261                 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3262                 configUrlMatch = re.search('config=(.*)$', playerUrl)
3263                 configUrl = urllib2.unquote(configUrlMatch.group(1))
3264
3265                 self.report_config_download(showName)
3266                 try:
3267                         configJSON = urllib2.urlopen(configUrl).read()
3268                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3269                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3270                         return
3271
3272                 # Technically, it's JavaScript, not JSON
3273                 configJSON = configJSON.replace("'", '"')
3274
3275                 try:
3276                         config = json.loads(configJSON)
3277                 except (ValueError,), err:
3278                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3279                         return
3280
3281                 playlist = config['playlist']
3282                 videoUrl = playlist[1]['url']
3283
3284                 self._downloader.increment_downloads()
3285                 info = {
3286                         'id': videoId,
3287                         'url': videoUrl,
3288                         'uploader': showName,
3289                         'upload_date': None,
3290                         'title': showName,
3291                         'stitle': _simplify_title(showName),
3292                         'ext': 'flv',
3293                         'format': 'flv',
3294                         'thumbnail': imgUrl,
3295                         'description': description,
3296                         'player_url': playerUrl,
3297                 }
3298
3299                 try:
3300                         self._downloader.process_info(info)
3301                 except UnavailableVideoError, err:
3302                         self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3303
3304
class CollegeHumorIE(InfoExtractor):
	"""Information extractor for collegehumor.com"""

	_VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
	IE_NAME = u'collegehumor'

	def report_webpage(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		"""Find the site-internal video id on the page, then read the
		video's metadata (title, file URL, thumbnail, ...) from the
		moogaloop XML service and hand it to the downloader.
		"""
		# NOTE(review): htmlParser is created but never used in this method.
		htmlParser = HTMLParser.HTMLParser()

		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group('videoid')

		self.report_webpage(video_id)
		request = urllib2.Request(url)
		try:
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# The page embeds the internal id as id="video:<number>".
		m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
		if m is None:
			self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
			return
		internal_video_id = m.group('internalvideoid')

		info = {
			'id': video_id,
			'internal_id': internal_video_id,
		}

		self.report_extraction(video_id)
		xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
		try:
			metaXml = urllib2.urlopen(xmlUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
			return

		# Any missing element raises IndexError via the findall()[0] idiom,
		# which is treated as an invalid metadata document.
		mdoc = xml.etree.ElementTree.fromstring(metaXml)
		try:
			videoNode = mdoc.findall('./video')[0]
			info['description'] = videoNode.findall('./description')[0].text
			info['title'] = videoNode.findall('./caption')[0].text
			info['stitle'] = _simplify_title(info['title'])
			info['url'] = videoNode.findall('./file')[0].text
			info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
			info['ext'] = info['url'].rpartition('.')[2]
			info['format'] = info['ext']
		except IndexError:
			self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
			return

		self._downloader.increment_downloads()

		try:
			self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download video')
3375
3376
3377 class XVideosIE(InfoExtractor):
3378         """Information extractor for xvideos.com"""
3379
3380         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3381         IE_NAME = u'xvideos'
3382
3383         def report_webpage(self, video_id):
3384                 """Report information extraction."""
3385                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3386
3387         def report_extraction(self, video_id):
3388                 """Report information extraction."""
3389                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3390
3391         def _real_extract(self, url):
3392                 htmlParser = HTMLParser.HTMLParser()
3393
3394                 mobj = re.match(self._VALID_URL, url)
3395                 if mobj is None:
3396                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3397                         return
3398                 video_id = mobj.group(1).decode('utf-8')
3399
3400                 self.report_webpage(video_id)
3401
3402                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3403                 try:
3404                         webpage = urllib2.urlopen(request).read()
3405                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3406                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3407                         return
3408
3409                 self.report_extraction(video_id)
3410
3411
3412                 # Extract video URL
3413                 mobj = re.search(r'flv_url=(.+?)&', webpage)
3414                 if mobj is None:
3415                         self._downloader.trouble(u'ERROR: unable to extract video url')
3416                         return
3417                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3418
3419
3420                 # Extract title
3421                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3422                 if mobj is None:
3423                         self._downloader.trouble(u'ERROR: unable to extract video title')
3424                         return
3425                 video_title = mobj.group(1).decode('utf-8')
3426
3427
3428                 # Extract video thumbnail
3429                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3430                 if mobj is None:
3431                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3432                         return
3433                 video_thumbnail = mobj.group(1).decode('utf-8')
3434
3435
3436
3437                 self._downloader.increment_downloads()
3438                 info = {
3439                         'id': video_id,
3440                         'url': video_url,
3441                         'uploader': None,
3442                         'upload_date': None,
3443                         'title': video_title,
3444                         'stitle': _simplify_title(video_title),
3445                         'ext': 'flv',
3446                         'format': 'flv',
3447                         'thumbnail': video_thumbnail,
3448                         'description': None,
3449                         'player_url': None,
3450                 }
3451
3452                 try:
3453                         self._downloader.process_info(info)
3454                 except UnavailableVideoError, err:
3455                         self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3456
3457
3458 class SoundcloudIE(InfoExtractor):
3459         """Information extractor for soundcloud.com
3460            To access the media, the uid of the song and a stream token
3461            must be extracted from the page source and the script must make
3462            a request to media.soundcloud.com/crossdomain.xml. Then
3463            the media can be grabbed by requesting from an url composed
3464            of the stream token and uid
3465          """
3466
3467         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3468         IE_NAME = u'soundcloud'
3469
3470         def __init__(self, downloader=None):
3471                 InfoExtractor.__init__(self, downloader)
3472
3473         def report_webpage(self, video_id):
3474                 """Report information extraction."""
3475                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3476
3477         def report_extraction(self, video_id):
3478                 """Report information extraction."""
3479                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3480
3481         def _real_extract(self, url):
3482                 htmlParser = HTMLParser.HTMLParser()
3483
3484                 mobj = re.match(self._VALID_URL, url)
3485                 if mobj is None:
3486                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3487                         return
3488
3489                 # extract uploader (which is in the url)
3490                 uploader = mobj.group(1).decode('utf-8')
3491                 # extract simple title (uploader + slug of song title)
3492                 slug_title =  mobj.group(2).decode('utf-8')
3493                 simple_title = uploader + '-' + slug_title
3494
3495                 self.report_webpage('%s/%s' % (uploader, slug_title))
3496
3497                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3498                 try:
3499                         webpage = urllib2.urlopen(request).read()
3500                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3501                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3502                         return
3503
3504                 self.report_extraction('%s/%s' % (uploader, slug_title))
3505
3506                 # extract uid and stream token that soundcloud hands out for access
3507                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3508                 if mobj:
3509                         video_id = mobj.group(1)
3510                         stream_token = mobj.group(2)
3511
3512                 # extract unsimplified title
3513                 mobj = re.search('"title":"(.*?)",', webpage)
3514                 if mobj:
3515                         title = mobj.group(1)
3516
3517                 # construct media url (with uid/token)
3518                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3519                 mediaURL = mediaURL % (video_id, stream_token)
3520
3521                 # description
3522                 description = u'No description available'
3523                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3524                 if mobj:
3525                         description = mobj.group(1)
3526                 
3527                 # upload date
3528                 upload_date = None
3529                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3530                 if mobj:
3531                         try:
3532                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3533                         except Exception, e:
3534                                 print str(e)
3535
3536                 # for soundcloud, a request to a cross domain is required for cookies
3537                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3538
3539                 try:
3540                         self._downloader.process_info({
3541                                 'id':           video_id.decode('utf-8'),
3542                                 'url':          mediaURL,
3543                                 'uploader':     uploader.decode('utf-8'),
3544                                 'upload_date':  upload_date,
3545                                 'title':        simple_title.decode('utf-8'),
3546                                 'stitle':       simple_title.decode('utf-8'),
3547                                 'ext':          u'mp3',
3548                                 'format':       u'NA',
3549                                 'player_url':   None,
3550                                 'description': description.decode('utf-8')
3551                         })
3552                 except UnavailableVideoError:
3553                         self._downloader.trouble(u'\nERROR: unable to download video')
3554
3555
3556 class InfoQIE(InfoExtractor):
3557         """Information extractor for infoq.com"""
3558
3559         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3560         IE_NAME = u'infoq'
3561
3562         def report_webpage(self, video_id):
3563                 """Report information extraction."""
3564                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3565
3566         def report_extraction(self, video_id):
3567                 """Report information extraction."""
3568                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3569
3570         def _real_extract(self, url):
3571                 htmlParser = HTMLParser.HTMLParser()
3572
3573                 mobj = re.match(self._VALID_URL, url)
3574                 if mobj is None:
3575                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3576                         return
3577
3578                 self.report_webpage(url)
3579
3580                 request = urllib2.Request(url)
3581                 try:
3582                         webpage = urllib2.urlopen(request).read()
3583                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3584                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3585                         return
3586
3587                 self.report_extraction(url)
3588
3589
3590                 # Extract video URL
3591                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3592                 if mobj is None:
3593                         self._downloader.trouble(u'ERROR: unable to extract video url')
3594                         return
3595                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3596
3597
3598                 # Extract title
3599                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3600                 if mobj is None:
3601                         self._downloader.trouble(u'ERROR: unable to extract video title')
3602                         return
3603                 video_title = mobj.group(1).decode('utf-8')
3604
3605                 # Extract description
3606                 video_description = u'No description available.'
3607                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3608                 if mobj is not None:
3609                         video_description = mobj.group(1).decode('utf-8')
3610
3611                 video_filename = video_url.split('/')[-1]
3612                 video_id, extension = video_filename.split('.')
3613
3614                 self._downloader.increment_downloads()
3615                 info = {
3616                         'id': video_id,
3617                         'url': video_url,
3618                         'uploader': None,
3619                         'upload_date': None,
3620                         'title': video_title,
3621                         'stitle': _simplify_title(video_title),
3622                         'ext': extension,
3623                         'format': extension, # Extension is always(?) mp4, but seems to be flv
3624                         'thumbnail': None,
3625                         'description': video_description,
3626                         'player_url': None,
3627                 }
3628
3629                 try:
3630                         self._downloader.process_info(info)
3631                 except UnavailableVideoError, err:
3632                         self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3633
3634 class MixcloudIE(InfoExtractor):
3635         """Information extractor for www.mixcloud.com"""
3636         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3637         IE_NAME = u'mixcloud'
3638
3639         def __init__(self, downloader=None):
3640                 InfoExtractor.__init__(self, downloader)
3641
3642         def report_download_json(self, file_id):
3643                 """Report JSON download."""
3644                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3645
3646         def report_extraction(self, file_id):
3647                 """Report information extraction."""
3648                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3649
3650         def get_urls(self, jsonData, fmt, bitrate='best'):
3651                 """Get urls from 'audio_formats' section in json"""
3652                 file_url = None
3653                 try:
3654                         bitrate_list = jsonData[fmt]
3655                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3656                                 bitrate = max(bitrate_list) # select highest
3657
3658                         url_list = jsonData[fmt][bitrate]
3659                 except TypeError: # we have no bitrate info.
3660                         url_list = jsonData[fmt]
3661                                 
3662                 return url_list
3663
3664         def check_urls(self, url_list):
3665                 """Returns 1st active url from list"""
3666                 for url in url_list:
3667                         try:
3668                                 urllib2.urlopen(url)
3669                                 return url
3670                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3671                                 url = None
3672
3673                 return None
3674
3675         def _print_formats(self, formats):
3676                 print 'Available formats:'
3677                 for fmt in formats.keys():
3678                         for b in formats[fmt]:
3679                                 try:
3680                                         ext = formats[fmt][b][0]
3681                                         print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3682                                 except TypeError: # we have no bitrate info
3683                                         ext = formats[fmt][0]
3684                                         print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3685                                         break
3686
3687         def _real_extract(self, url):
3688                 mobj = re.match(self._VALID_URL, url)
3689                 if mobj is None:
3690                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3691                         return
3692                 # extract uploader & filename from url
3693                 uploader = mobj.group(1).decode('utf-8')
3694                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3695
3696                 # construct API request
3697                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3698                 # retrieve .json file with links to files
3699                 request = urllib2.Request(file_url)
3700                 try:
3701                         self.report_download_json(file_url)
3702                         jsonData = urllib2.urlopen(request).read()
3703                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3704                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3705                         return
3706
3707                 # parse JSON
3708                 json_data = json.loads(jsonData)
3709                 player_url = json_data['player_swf_url']
3710                 formats = dict(json_data['audio_formats'])
3711
3712                 req_format = self._downloader.params.get('format', None)
3713                 bitrate = None
3714
3715                 if self._downloader.params.get('listformats', None):
3716                         self._print_formats(formats)
3717                         return
3718
3719                 if req_format is None or req_format == 'best':
3720                         for format_param in formats.keys():
3721                                 url_list = self.get_urls(formats, format_param)
3722                                 # check urls
3723                                 file_url = self.check_urls(url_list)
3724                                 if file_url is not None:
3725                                         break # got it!
3726                 else:
3727                         if req_format not in formats.keys():
3728                                 self._downloader.trouble(u'ERROR: format is not available')
3729                                 return
3730
3731                         url_list = self.get_urls(formats, req_format)
3732                         file_url = self.check_urls(url_list)
3733                         format_param = req_format
3734
3735                 # We have audio
3736                 self._downloader.increment_downloads()
3737                 try:
3738                         # Process file information
3739                         self._downloader.process_info({
3740                                 'id':           file_id.decode('utf-8'),
3741                                 'url':          file_url.decode('utf-8'),
3742                                 'uploader':     uploader.decode('utf-8'),
3743                                 'upload_date':  u'NA',
3744                                 'title':        json_data['name'],
3745                                 'stitle':       _simplify_title(json_data['name']),
3746                                 'ext':          file_url.split('.')[-1].decode('utf-8'),
3747                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
3748                                 'thumbnail':    json_data['thumbnail_url'],
3749                                 'description':  json_data['description'],
3750                                 'player_url':   player_url.decode('utf-8'),
3751                         })
3752                 except UnavailableVideoError, err:
3753                         self._downloader.trouble(u'ERROR: unable to download file')
3754
3755 class StanfordOpenClassroomIE(InfoExtractor):
3756         """Information extractor for Stanford's Open ClassRoom"""
3757
3758         _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3759         IE_NAME = u'stanfordoc'
3760
3761         def report_download_webpage(self, objid):
3762                 """Report information extraction."""
3763                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3764
3765         def report_extraction(self, video_id):
3766                 """Report information extraction."""
3767                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3768
3769         def _real_extract(self, url):
3770                 mobj = re.match(self._VALID_URL, url)
3771                 if mobj is None:
3772                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3773                         return
3774
3775                 if mobj.group('course') and mobj.group('video'): # A specific video
3776                         course = mobj.group('course')
3777                         video = mobj.group('video')
3778                         info = {
3779                                 'id': _simplify_title(course + '_' + video),
3780                         }
3781         
3782                         self.report_extraction(info['id'])
3783                         baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3784                         xmlUrl = baseUrl + video + '.xml'
3785                         try:
3786                                 metaXml = urllib2.urlopen(xmlUrl).read()
3787                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3788                                 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
3789                                 return
3790                         mdoc = xml.etree.ElementTree.fromstring(metaXml)
3791                         try:
3792                                 info['title'] = mdoc.findall('./title')[0].text
3793                                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3794                         except IndexError:
3795                                 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3796                                 return
3797                         info['stitle'] = _simplify_title(info['title'])
3798                         info['ext'] = info['url'].rpartition('.')[2]
3799                         info['format'] = info['ext']
3800                         self._downloader.increment_downloads()
3801                         try:
3802                                 self._downloader.process_info(info)
3803                         except UnavailableVideoError, err:
3804                                 self._downloader.trouble(u'\nERROR: unable to download video')
3805                 elif mobj.group('course'): # A course page
3806                         unescapeHTML = HTMLParser.HTMLParser().unescape
3807
3808                         course = mobj.group('course')
3809                         info = {
3810                                 'id': _simplify_title(course),
3811                                 'type': 'playlist',
3812                         }
3813
3814                         self.report_download_webpage(info['id'])
3815                         try:
3816                                 coursepage = urllib2.urlopen(url).read()
3817                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3818                                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3819                                 return
3820
3821                         m = re.search('<h1>([^<]+)</h1>', coursepage)
3822                         if m:
3823                                 info['title'] = unescapeHTML(m.group(1))
3824                         else:
3825                                 info['title'] = info['id']
3826                         info['stitle'] = _simplify_title(info['title'])
3827
3828                         m = re.search('<description>([^<]+)</description>', coursepage)
3829                         if m:
3830                                 info['description'] = unescapeHTML(m.group(1))
3831
3832                         links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3833                         info['list'] = [
3834                                 {
3835                                         'type': 'reference',
3836                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3837                                 }
3838                                         for vpage in links]
3839
3840                         for entry in info['list']:
3841                                 assert entry['type'] == 'reference'
3842                                 self.extract(entry['url'])
3843                 else: # Root page
3844                         unescapeHTML = HTMLParser.HTMLParser().unescape
3845
3846                         info = {
3847                                 'id': 'Stanford OpenClassroom',
3848                                 'type': 'playlist',
3849                         }
3850
3851                         self.report_download_webpage(info['id'])
3852                         rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3853                         try:
3854                                 rootpage = urllib2.urlopen(rootURL).read()
3855                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3856                                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3857                                 return
3858
3859                         info['title'] = info['id']
3860                         info['stitle'] = _simplify_title(info['title'])
3861
3862                         links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3863                         info['list'] = [
3864                                 {
3865                                         'type': 'reference',
3866                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3867                                 }
3868                                         for cpage in links]
3869
3870                         for entry in info['list']:
3871                                 assert entry['type'] == 'reference'
3872                                 self.extract(entry['url'])
3873
3874
class PostProcessor(object):
	"""Base class for post-processing steps.

	Instances are registered on a downloader through its
	add_post_processor() method. After every successful download the
	downloader walks its chain of PostProcessors, feeding the first one
	the download's information dictionary and each subsequent one the
	value returned by its predecessor.

	Processing stops as soon as a PostProcessor returns None, or when
	the end of the chain is reached.

	Like InfoExtractor objects, PostProcessors take part in a "mutual
	registration" handshake with their downloader.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach this post processor to the given downloader."""
		self._downloader = downloader

	def run(self, information):
		"""Execute this post-processing step.

		"information" is an InfoExtractor-style dictionary augmented
		with a "filepath" key naming the downloaded file.

		Returning None halts the post-processing chain; returning a
		(possibly modified) information dictionary passes it on to the
		next PostProcessor in the chain. Implementations may also raise
		PostProcessingError to report a failure to the calling
		downloader.
		"""
		return information # default implementation: pass through unchanged
3920
3921
class FFmpegExtractAudioPP(PostProcessor):
	"""Post processor that extracts the audio track of a downloaded video
	into a standalone audio file using the external ffmpeg/ffprobe tools."""

	def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec
		self._preferredquality = preferredquality
		self._keepvideo = keepvideo

	@staticmethod
	def get_audio_codec(path):
		"""Return the audio codec name reported by ffprobe for the file at
		path, or None if it cannot be determined."""
		try:
			cmd = ['ffprobe', '-show_streams', '--', path]
			# open() instead of the removed-in-Python-3 file() builtin
			handle = subprocess.Popen(cmd, stderr=open(os.devnull, 'w'), stdout=subprocess.PIPE)
			output = handle.communicate()[0]
			if handle.wait() != 0:
				return None
		except (IOError, OSError):
			return None
		audio_codec = None
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Transcode path into out_path with ffmpeg; return True on success."""
		try:
			cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
			ret = subprocess.call(cmd, stdout=open(os.devnull, 'w'), stderr=subprocess.STDOUT)
			return (ret == 0)
		except (IOError, OSError):
			return False

	def run(self, information):
		"""Extract audio from information['filepath'] and return the info
		dict pointing at the new file, or None on failure (which stops
		the post-processing chain)."""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
			if filecodec in ['aac', 'mp3', 'vorbis']:
				# Lossless if possible: copy the existing stream.
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					more_opts = ['-f', 'adts']
				if filecodec == 'vorbis':
					extension = 'ogg'
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = []
				if self._preferredquality is not None:
					more_opts += ['-ab', self._preferredquality]
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'vorbis': 'libvorbis'}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = []
			if self._preferredquality is not None:
				more_opts += ['-ab', self._preferredquality]
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']
			if self._preferredcodec == 'vorbis':
				extension = 'ogg'

		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		status = self.run_ffmpeg(path, new_path, acodec, more_opts)

		if not status:
			self._downloader.to_stderr(u'WARNING: error running ffmpeg')
			return None

		# Try to update the date time for extracted audio file.
		if information.get('filetime') is not None:
			try:
				os.utime(new_path, (time.time(), information['filetime']))
			except (IOError, OSError):
				# Narrowed from a bare except, which also swallowed
				# KeyboardInterrupt/SystemExit.
				self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

		if not self._keepvideo:
			try:
				os.remove(path)
			except (IOError, OSError):
				self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
				return None

		information['filepath'] = new_path
		return information
4021
4022
4023 def updateSelf(downloader, filename):
4024         ''' Update the program file with the latest version from the repository '''
4025         # Note: downloader only used for options
4026         if not os.access(filename, os.W_OK):
4027                 sys.exit('ERROR: no write permissions on %s' % filename)
4028
4029         downloader.to_screen('Updating to latest version...')
4030
4031         try:
4032                 try:
4033                         urlh = urllib.urlopen(UPDATE_URL)
4034                         newcontent = urlh.read()
4035                         
4036                         vmatch = re.search("__version__ = '([^']+)'", newcontent)
4037                         if vmatch is not None and vmatch.group(1) == __version__:
4038                                 downloader.to_screen('youtube-dl is up-to-date (' + __version__ + ')')
4039                                 return
4040                 finally:
4041                         urlh.close()
4042         except (IOError, OSError), err:
4043                 sys.exit('ERROR: unable to download latest version')
4044
4045         try:
4046                 outf = open(filename, 'wb')
4047                 try:
4048                         outf.write(newcontent)
4049                 finally:
4050                         outf.close()
4051         except (IOError, OSError), err:
4052                 sys.exit('ERROR: unable to overwrite current version')
4053
4054         downloader.to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.')
4055
4056 def parseOpts():
4057         # Deferred imports
4058         import getpass
4059         import optparse
4060         import shlex
4061
4062         def _readOptions(filename):
4063                 try:
4064                         optionf = open(filename)
4065                 except IOError:
4066                         return [] # silently skip if file is not present
4067                 try:
4068                         res = []
4069                         for l in optionf:
4070                                 res += shlex.split(l, comments=True)
4071                 finally:
4072                         optionf.close()
4073                 return res
4074
4075         def _format_option_string(option):
4076                 ''' ('-o', '--option') -> -o, --format METAVAR'''
4077
4078                 opts = []
4079
4080                 if option._short_opts: opts.append(option._short_opts[0])
4081                 if option._long_opts: opts.append(option._long_opts[0])
4082                 if len(opts) > 1: opts.insert(1, ', ')
4083
4084                 if option.takes_value(): opts.append(' %s' % option.metavar)
4085
4086                 return "".join(opts)
4087
4088         def _find_term_columns():
4089                 columns = os.environ.get('COLUMNS', None)
4090                 if columns:
4091                         return int(columns)
4092
4093                 try:
4094                         sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4095                         out,err = sp.communicate()
4096                         return int(out.split()[1])
4097                 except:
4098                         pass
4099                 return None
4100
4101         max_width = 80
4102         max_help_position = 80
4103
4104         # No need to wrap help messages if we're on a wide console
4105         columns = _find_term_columns()
4106         if columns: max_width = columns
4107
4108         fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
4109         fmt.format_option_strings = _format_option_string
4110
4111         kw = {
4112                 'version'   : __version__,
4113                 'formatter' : fmt,
4114                 'usage' : '%prog [options] url [url...]',
4115                 'conflict_handler' : 'resolve',
4116         }
4117
4118         parser = optparse.OptionParser(**kw)
4119
4120         # option groups
4121         general        = optparse.OptionGroup(parser, 'General Options')
4122         selection      = optparse.OptionGroup(parser, 'Video Selection')
4123         authentication = optparse.OptionGroup(parser, 'Authentication Options')
4124         video_format   = optparse.OptionGroup(parser, 'Video Format Options')
4125         postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
4126         filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
4127         verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
4128
4129         general.add_option('-h', '--help',
4130                         action='help', help='print this help text and exit')
4131         general.add_option('-v', '--version',
4132                         action='version', help='print program version and exit')
4133         general.add_option('-U', '--update',
4134                         action='store_true', dest='update_self', help='update this program to latest version')
4135         general.add_option('-i', '--ignore-errors',
4136                         action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
4137         general.add_option('-r', '--rate-limit',
4138                         dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
4139         general.add_option('-R', '--retries',
4140                         dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
4141         general.add_option('--dump-user-agent',
4142                         action='store_true', dest='dump_user_agent',
4143                         help='display the current browser identification', default=False)
4144         general.add_option('--list-extractors',
4145                         action='store_true', dest='list_extractors',
4146                         help='List all supported extractors and the URLs they would handle', default=False)
4147
4148         selection.add_option('--playlist-start',
4149                         dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
4150         selection.add_option('--playlist-end',
4151                         dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
4152         selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
4153         selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
4154         selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)
4155
4156         authentication.add_option('-u', '--username',
4157                         dest='username', metavar='USERNAME', help='account username')
4158         authentication.add_option('-p', '--password',
4159                         dest='password', metavar='PASSWORD', help='account password')
4160         authentication.add_option('-n', '--netrc',
4161                         action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
4162
4163
4164         video_format.add_option('-f', '--format',
4165                         action='store', dest='format', metavar='FORMAT', help='video format code')
4166         video_format.add_option('--all-formats',
4167                         action='store_const', dest='format', help='download all available video formats', const='all')
4168         video_format.add_option('--max-quality',
4169                         action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
4170         video_format.add_option('-F', '--list-formats',
4171                         action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
4172
4173
4174         verbosity.add_option('-q', '--quiet',
4175                         action='store_true', dest='quiet', help='activates quiet mode', default=False)
4176         verbosity.add_option('-s', '--simulate',
4177                         action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
4178         verbosity.add_option('--skip-download',
4179                         action='store_true', dest='skip_download', help='do not download the video', default=False)
4180         verbosity.add_option('-g', '--get-url',
4181                         action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
4182         verbosity.add_option('-e', '--get-title',
4183                         action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
4184         verbosity.add_option('--get-thumbnail',
4185                         action='store_true', dest='getthumbnail',
4186                         help='simulate, quiet but print thumbnail URL', default=False)
4187         verbosity.add_option('--get-description',
4188                         action='store_true', dest='getdescription',
4189                         help='simulate, quiet but print video description', default=False)
4190         verbosity.add_option('--get-filename',
4191                         action='store_true', dest='getfilename',
4192                         help='simulate, quiet but print output filename', default=False)
4193         verbosity.add_option('--get-format',
4194                         action='store_true', dest='getformat',
4195                         help='simulate, quiet but print output format', default=False)
4196         verbosity.add_option('--no-progress',
4197                         action='store_true', dest='noprogress', help='do not print progress bar', default=False)
4198         verbosity.add_option('--console-title',
4199                         action='store_true', dest='consoletitle',
4200                         help='display progress in console titlebar', default=False)
4201
4202
4203         filesystem.add_option('-t', '--title',
4204                         action='store_true', dest='usetitle', help='use title in file name', default=False)
4205         filesystem.add_option('-l', '--literal',
4206                         action='store_true', dest='useliteral', help='use literal title in file name', default=False)
4207         filesystem.add_option('-A', '--auto-number',
4208                         action='store_true', dest='autonumber',
4209                         help='number downloaded files starting from 00000', default=False)
4210         filesystem.add_option('-o', '--output',
4211                         dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
4212         filesystem.add_option('-a', '--batch-file',
4213                         dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
4214         filesystem.add_option('-w', '--no-overwrites',
4215                         action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
4216         filesystem.add_option('-c', '--continue',
4217                         action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
4218         filesystem.add_option('--no-continue',
4219                         action='store_false', dest='continue_dl',
4220                         help='do not resume partially downloaded files (restart from beginning)')
4221         filesystem.add_option('--cookies',
4222                         dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
4223         filesystem.add_option('--no-part',
4224                         action='store_true', dest='nopart', help='do not use .part files', default=False)
4225         filesystem.add_option('--no-mtime',
4226                         action='store_false', dest='updatetime',
4227                         help='do not use the Last-modified header to set the file modification time', default=True)
4228         filesystem.add_option('--write-description',
4229                         action='store_true', dest='writedescription',
4230                         help='write video description to a .description file', default=False)
4231         filesystem.add_option('--write-info-json',
4232                         action='store_true', dest='writeinfojson',
4233                         help='write video metadata to a .info.json file', default=False)
4234
4235
4236         postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
4237                         help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
4238         postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
4239                         help='"best", "aac", "vorbis" or "mp3"; best by default')
4240         postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
4241                         help='ffmpeg audio bitrate specification, 128k by default')
4242         postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
4243                         help='keeps the video file on disk after the post-processing; the video is erased by default')
4244
4245
4246         parser.add_option_group(general)
4247         parser.add_option_group(selection)
4248         parser.add_option_group(filesystem)
4249         parser.add_option_group(verbosity)
4250         parser.add_option_group(video_format)
4251         parser.add_option_group(authentication)
4252         parser.add_option_group(postproc)
4253
4254         xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
4255         if xdg_config_home:
4256                 userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
4257         else:
4258                 userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
4259         argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
4260         opts, args = parser.parse_args(argv)
4261
4262         return parser, opts, args
4263
def gen_extractors():
	""" Return a list of an instance of every supported extractor.
	The order does matter; the first extractor matched is the one handling the URL.
	"""
	# A few extractors are shared: the playlist/user/search extractors
	# delegate single-video handling to the same base instance.
	yt = YoutubeIE()
	goog = GoogleIE()
	yah = YahooIE()

	extractor_list = [
		YoutubePlaylistIE(yt),
		YoutubeUserIE(yt),
		YoutubeSearchIE(yt),
		yt,
		MetacafeIE(yt),
		DailymotionIE(),
		goog,
		GoogleSearchIE(goog),
		PhotobucketIE(),
		yah,
		YahooSearchIE(yah),
		DepositFilesIE(),
		FacebookIE(),
		BlipTVIE(),
		VimeoIE(),
		MyVideoIE(),
		ComedyCentralIE(),
		EscapistIE(),
		CollegeHumorIE(),
		XVideosIE(),
		SoundcloudIE(),
		InfoQIE(),
		MixcloudIE(),
		StanfordOpenClassroomIE(),
	]
	# GenericIE must stay last: it matches almost anything, so it only
	# gets a chance after every specialized extractor has declined.
	extractor_list.append(GenericIE())
	return extractor_list
4299
4300 def _real_main():
4301         parser, opts, args = parseOpts()
4302
4303         # Open appropriate CookieJar
4304         if opts.cookiefile is None:
4305                 jar = cookielib.CookieJar()
4306         else:
4307                 try:
4308                         jar = cookielib.MozillaCookieJar(opts.cookiefile)
4309                         if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4310                                 jar.load()
4311                 except (IOError, OSError), err:
4312                         sys.exit(u'ERROR: unable to open cookie file')
4313
4314         # Dump user agent
4315         if opts.dump_user_agent:
4316                 print std_headers['User-Agent']
4317                 sys.exit(0)
4318
4319         # Batch file verification
4320         batchurls = []
4321         if opts.batchfile is not None:
4322                 try:
4323                         if opts.batchfile == '-':
4324                                 batchfd = sys.stdin
4325                         else:
4326                                 batchfd = open(opts.batchfile, 'r')
4327                         batchurls = batchfd.readlines()
4328                         batchurls = [x.strip() for x in batchurls]
4329                         batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4330                 except IOError:
4331                         sys.exit(u'ERROR: batch file could not be read')
4332         all_urls = batchurls + args
4333
4334         # General configuration
4335         cookie_processor = urllib2.HTTPCookieProcessor(jar)
4336         opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
4337         urllib2.install_opener(opener)
4338         socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
4339
4340         extractors = gen_extractors()
4341
4342         if opts.list_extractors:
4343                 for ie in extractors:
4344                         print(ie.IE_NAME)
4345                         matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
4346                         all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4347                         for mu in matchedUrls:
4348                                 print(u'  ' + mu)
4349                 sys.exit(0)
4350
4351         # Conflicting, missing and erroneous options
4352         if opts.usenetrc and (opts.username is not None or opts.password is not None):
4353                 parser.error(u'using .netrc conflicts with giving username/password')
4354         if opts.password is not None and opts.username is None:
4355                 parser.error(u'account username missing')
4356         if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4357                 parser.error(u'using output template conflicts with using title, literal title or auto number')
4358         if opts.usetitle and opts.useliteral:
4359                 parser.error(u'using title conflicts with using literal title')
4360         if opts.username is not None and opts.password is None:
4361                 opts.password = getpass.getpass(u'Type account password and press return:')
4362         if opts.ratelimit is not None:
4363                 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4364                 if numeric_limit is None:
4365                         parser.error(u'invalid rate limit specified')
4366                 opts.ratelimit = numeric_limit
4367         if opts.retries is not None:
4368                 try:
4369                         opts.retries = long(opts.retries)
4370                 except (TypeError, ValueError), err:
4371                         parser.error(u'invalid retry count specified')
4372         try:
4373                 opts.playliststart = int(opts.playliststart)
4374                 if opts.playliststart <= 0:
4375                         raise ValueError(u'Playlist start must be positive')
4376         except (TypeError, ValueError), err:
4377                 parser.error(u'invalid playlist start number specified')
4378         try:
4379                 opts.playlistend = int(opts.playlistend)
4380                 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
4381                         raise ValueError(u'Playlist end must be greater than playlist start')
4382         except (TypeError, ValueError), err:
4383                 parser.error(u'invalid playlist end number specified')
4384         if opts.extractaudio:
4385                 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis']:
4386                         parser.error(u'invalid audio format specified')
4387
4388         # File downloader
4389         fd = FileDownloader({
4390                 'usenetrc': opts.usenetrc,
4391                 'username': opts.username,
4392                 'password': opts.password,
4393                 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4394                 'forceurl': opts.geturl,
4395                 'forcetitle': opts.gettitle,
4396                 'forcethumbnail': opts.getthumbnail,
4397                 'forcedescription': opts.getdescription,
4398                 'forcefilename': opts.getfilename,
4399                 'forceformat': opts.getformat,
4400                 'simulate': opts.simulate,
4401                 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4402                 'format': opts.format,
4403                 'format_limit': opts.format_limit,
4404                 'listformats': opts.listformats,
4405                 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
4406                         or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4407                         or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
4408                         or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
4409                         or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4410                         or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4411                         or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
4412                         or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
4413                         or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
4414                         or u'%(id)s.%(ext)s'),
4415                 'ignoreerrors': opts.ignoreerrors,
4416                 'ratelimit': opts.ratelimit,
4417                 'nooverwrites': opts.nooverwrites,
4418                 'retries': opts.retries,
4419                 'continuedl': opts.continue_dl,
4420                 'noprogress': opts.noprogress,
4421                 'playliststart': opts.playliststart,
4422                 'playlistend': opts.playlistend,
4423                 'logtostderr': opts.outtmpl == '-',
4424                 'consoletitle': opts.consoletitle,
4425                 'nopart': opts.nopart,
4426                 'updatetime': opts.updatetime,
4427                 'writedescription': opts.writedescription,
4428                 'writeinfojson': opts.writeinfojson,
4429                 'matchtitle': opts.matchtitle,
4430                 'rejecttitle': opts.rejecttitle,
4431                 'max_downloads': opts.max_downloads,
4432                 })
4433         for extractor in extractors:
4434                 fd.add_info_extractor(extractor)
4435
4436         # PostProcessors
4437         if opts.extractaudio:
4438                 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
4439
4440         # Update version
4441         if opts.update_self:
4442                 updateSelf(fd, sys.argv[0])
4443
4444         # Maybe do nothing
4445         if len(all_urls) < 1:
4446                 if not opts.update_self:
4447                         parser.error(u'you must provide at least one URL')
4448                 else:
4449                         sys.exit()
4450         retcode = fd.download(all_urls)
4451
4452         # Dump cookie jar if requested
4453         if opts.cookiefile is not None:
4454                 try:
4455                         jar.save()
4456                 except (IOError, OSError), err:
4457                         sys.exit(u'ERROR: unable to save cookie jar')
4458
4459         sys.exit(retcode)
4460
def main():
	"""Console entry point: run _real_main() and map known failures to exit codes."""
	try:
		_real_main()
	except DownloadError:
		# The downloader has already reported the per-video error details,
		# so only a non-zero exit status is needed here.
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')
4470
# Allow the module to be run directly as a script.
if __name__ == '__main__':
	main()
4473
4474 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: