# Add wav audio output
# [youtube-dl.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 __author__  = (
5         'Ricardo Garcia Gonzalez',
6         'Danny Colligan',
7         'Benjamin Johnson',
8         'Vasyl\' Vavrychuk',
9         'Witold Baryluk',
10         'Paweł Paprota',
11         'Gergely Imreh',
12         'Rogério Brito',
13         'Philipp Hagemeister',
14         'Sören Schulze',
15         'Kevin Ngo',
16         'Ori Avtalion',
17         'shizeeg',
18         )
19
20 __license__ = 'Public Domain'
21 __version__ = '2011.12.15'
22
23 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
24
25 import cookielib
26 import datetime
27 import gzip
28 import htmlentitydefs
29 import HTMLParser
30 import httplib
31 import locale
32 import math
33 import netrc
34 import os
35 import os.path
36 import re
37 import socket
38 import string
39 import subprocess
40 import sys
41 import time
42 import urllib
43 import urllib2
44 import warnings
45 import zlib
46
47 if os.name == 'nt':
48         import ctypes
49
50 try:
51         import email.utils
52 except ImportError: # Python 2.4
53         import email.Utils
54 try:
55         import cStringIO as StringIO
56 except ImportError:
57         import StringIO
58
59 # parse_qs was moved from the cgi module to the urlparse module recently.
60 try:
61         from urlparse import parse_qs
62 except ImportError:
63         from cgi import parse_qs
64
65 try:
66         import lxml.etree
67 except ImportError:
68         pass # Handled below
69
70 try:
71         import xml.etree.ElementTree
72 except ImportError: # Python<2.5: Not officially supported, but let it slip
73         warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
74
# Default HTTP headers imitating a regular Firefox browser.
# YoutubeDLHandler.http_request forces these onto every outgoing request.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	'Accept-Encoding': 'gzip, deflate',
	'Accept-Language': 'en-us,en;q=0.5',
}
82
# json is only available in Python >= 2.6; on older interpreters fall back
# to trivialjson, a minimal pure-Python JSON parser bundled inline below.
try:
	import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
	import re
	class json(object):
		"""Minimal stand-in for the stdlib json module (loads only)."""
		@staticmethod
		def loads(s):
			# Decode the UTF-8 byte string once; all parsing below works on unicode.
			s = s.decode('UTF-8')
			def raiseError(msg, i):
				# Error helper: report the position and the remaining input.
				raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
			def skipSpace(i, expectMore=True):
				# Advance past JSON whitespace; optionally require that more
				# input follows (raises on premature end of input).
				while i < len(s) and s[i] in ' \t\r\n':
					i += 1
				if expectMore:
					if i >= len(s):
						raiseError('Premature end', i)
				return i
			def decodeEscape(match):
				# Translate one backslash escape matched by parseString's regexp.
				esc = match.group(1)
				_STATIC = {
					'"': '"',
					'\\': '\\',
					'/': '/',
					'b': unichr(0x8),
					'f': unichr(0xc),
					'n': '\n',
					'r': '\r',
					't': '\t',
				}
				if esc in _STATIC:
					return _STATIC[esc]
				if esc[0] == 'u':
					if len(esc) == 1+4:
						# Plain \uXXXX escape
						return unichr(int(esc[1:5], 16))
					if len(esc) == 5+6 and esc[5:7] == '\\u':
						# UTF-16 surrogate pair: combine high and low halves.
						hi = int(esc[1:5], 16)
						low = int(esc[7:11], 16)
						return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
				raise ValueError('Unknown escape ' + str(esc))
			def parseString(i):
				i += 1
				e = i
				while True:
					# Find the closing quote, skipping escaped quotes: a quote
					# preceded by an odd number of backslashes is escaped.
					e = s.index('"', e)
					bslashes = 0
					while s[e-bslashes-1] == '\\':
						bslashes += 1
					if bslashes % 2 == 1:
						e += 1
						continue
					break
				# Matches surrogate pairs first, then single \uXXXX, then any
				# one-character escape (or a trailing lone backslash).
				rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
				stri = rexp.sub(decodeEscape, s[i:e])
				return (e+1,stri)
			def parseObj(i):
				i += 1
				res = {}
				i = skipSpace(i)
				if s[i] == '}': # Empty dictionary
					return (i+1,res)
				while True:
					if s[i] != '"':
						raiseError('Expected a string object key', i)
					i,key = parseString(i)
					i = skipSpace(i)
					if i >= len(s) or s[i] != ':':
						raiseError('Expected a colon', i)
					i,val = parse(i+1)
					res[key] = val
					i = skipSpace(i)
					if s[i] == '}':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected comma or closing curly brace', i)
					i = skipSpace(i+1)
			def parseArray(i):
				res = []
				i = skipSpace(i+1)
				if s[i] == ']': # Empty array
					return (i+1,res)
				while True:
					i,val = parse(i)
					res.append(val)
					i = skipSpace(i) # Raise exception if premature end
					if s[i] == ']':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected a comma or closing bracket', i)
					i = skipSpace(i+1)
			def parseDiscrete(i):
				# Parse the bare literals true / false / null.
				for k,v in {'true': True, 'false': False, 'null': None}.items():
					if s.startswith(k, i):
						return (i+len(k), v)
				raiseError('Not a boolean (or null)', i)
			def parseNumber(i):
				mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
				if mobj is None:
					raiseError('Not a number', i)
				nums = mobj.group(1)
				# A fraction or exponent makes it a float; otherwise an int.
				if '.' in nums or 'e' in nums or 'E' in nums:
					return (i+len(nums), float(nums))
				return (i+len(nums), int(nums))
			# Dispatch on the first significant character; numbers are the fallback.
			CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
			def parse(i):
				i = skipSpace(i)
				i,res = CHARMAP.get(s[i], parseNumber)(i)
				i = skipSpace(i, False)
				return (i,res)
			i,res = parse(0)
			if i < len(s):
				raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
			return res
195
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	# The original wrapped this in a one-shot generator for no benefit and
	# relied on the Python-2-only .next() method; a plain try/except is
	# equivalent. Encoding a test string verifies the reported codec exists.
	try:
		pref = locale.getpreferredencoding()
		u'TEST'.encode(pref)
	except Exception:
		# Unknown or broken locale: fall back to a sane default.
		pref = 'UTF-8'
	return pref
211
212
213 def htmlentity_transform(matchobj):
214         """Transforms an HTML entity to a Unicode character.
215
216         This function receives a match object and is intended to be used with
217         the re.sub() function.
218         """
219         entity = matchobj.group(1)
220
221         # Known non-numeric HTML entity
222         if entity in htmlentitydefs.name2codepoint:
223                 return unichr(htmlentitydefs.name2codepoint[entity])
224
225         # Unicode character
226         mobj = re.match(ur'(?u)#(x?\d+)', entity)
227         if mobj is not None:
228                 numstr = mobj.group(1)
229                 if numstr.startswith(u'x'):
230                         base = 16
231                         numstr = u'0%s' % numstr
232                 else:
233                         base = 10
234                 return unichr(long(numstr, base))
235
236         # Unknown entity in name, return its literal representation
237         return (u'&%s;' % entity)
238
239
240 def sanitize_title(utitle):
241         """Sanitizes a video title so it could be used as part of a filename."""
242         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
243         return utitle.replace(unicode(os.sep), u'%')
244
245
def sanitize_open(filename, open_mode):
	"""Try to open the given filename, and slightly tweak it if this fails.

	Attempts to open the given filename. If this fails, it tries to change
	the filename slightly, step by step, until it's either able to open it
	or it fails and raises a final exception, like the standard open()
	function.

	It returns the tuple (stream, definitive_file_name).
	"""
	try:
		if filename == u'-':
			# '-' means stdout
			if sys.platform == 'win32':
				import msvcrt
				# Switch stdout to binary mode so \n bytes are not mangled
				msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
			return (sys.stdout, filename)
		stream = open(filename, open_mode)
		return (stream, filename)
	except (IOError, OSError), err:
		# In case of error, try to remove win32 forbidden chars
		filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

		# An exception here should be caught in the caller
		stream = open(filename, open_mode)
		return (stream, filename)
271
272
def timeconvert(timestr):
	"""Convert RFC 2822 defined time string into system timestamp"""
	# parsedate_tz returns None for unparsable input; propagate that.
	parsed = email.utils.parsedate_tz(timestr)
	if parsed is None:
		return None
	return email.utils.mktime_tz(parsed)
280
281 def _simplify_title(title):
282         expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
283         return expr.sub(u'_', title).strip(u'_')
284
285 def _orderedSet(iterable):
286         """ Remove all duplicates from the input iterable """
287         res = []
288         for el in iterable:
289                 if el not in res:
290                         res.append(el)
291         return res
292
class DownloadError(Exception):
	"""Download Error exception.

	This exception may be thrown by FileDownloader objects if they are not
	configured to continue on errors. They will contain the appropriate
	error message. Raised by FileDownloader.trouble() unless the
	'ignoreerrors' option is set.
	"""
	pass
301
302
class SameFileError(Exception):
	"""Same File exception.

	This exception will be thrown by FileDownloader objects if they detect
	multiple files would have to be downloaded to the same file on disk
	(e.g. a fixed output filename used for more than one video).
	"""
	pass
310
311
class PostProcessingError(Exception):
	"""Post Processing exception.

	This exception may be raised by PostProcessor's .run() method to
	indicate an error in the postprocessing task.
	"""
	pass
319
class MaxDownloadsReached(Exception):
	"""Raised when the --max-downloads limit has been reached,
	to abort any further downloads."""
	pass
323
324
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	This exception will be thrown when a video is requested
	in a format that is not available for that video.
	"""
	pass
332
333
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	This exception may be raised by FileDownloader objects when a file they
	download is too small for what the server announced first, indicating
	the connection was probably interrupted.
	"""

	# Byte counts describing the mismatch; filled in by __init__.
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		# Record how many bytes actually arrived versus how many the
		# server promised (both in bytes).
		self.downloaded = downloaded
		self.expected = expected
348
349
class YoutubeDLHandler(urllib2.HTTPHandler):
	"""Handler for HTTP requests and responses.

	This class, when installed with an OpenerDirector, automatically adds
	the standard headers to every HTTP request and handles gzipped and
	deflated responses from web servers. If compression is to be avoided in
	a particular request, the original request in the program code only has
	to include the HTTP header "Youtubedl-No-Compression", which will be
	removed before making the real request.

	Part of this code was copied from:

	http://techknack.net/python-urllib2-handlers/

	Andrew Rowls, the author of that code, agreed to release it to the
	public domain.
	"""

	@staticmethod
	def deflate(data):
		# Some servers send raw deflate streams without the zlib header;
		# try that first, then fall back to a standard zlib stream.
		try:
			return zlib.decompress(data, -zlib.MAX_WBITS)
		except zlib.error:
			return zlib.decompress(data)

	@staticmethod
	def addinfourl_wrapper(stream, headers, url, code):
		# Newer urllib2 versions accept the response code directly in the
		# constructor (detected via the getcode attribute); older ones
		# need it assigned afterwards.
		if hasattr(urllib2.addinfourl, 'getcode'):
			return urllib2.addinfourl(stream, headers, url, code)
		ret = urllib2.addinfourl(stream, headers, url)
		ret.code = code
		return ret

	def http_request(self, req):
		# Force our standard headers, replacing any that were already set.
		for h in std_headers:
			if h in req.headers:
				del req.headers[h]
			req.add_header(h, std_headers[h])
		# NOTE(review): the marker is matched here with the capitalization
		# 'Youtubedl-no-compression', not the 'Youtubedl-No-Compression'
		# spelling from the class docstring — presumably because urllib2
		# normalizes header names; confirm against the call sites.
		if 'Youtubedl-no-compression' in req.headers:
			if 'Accept-encoding' in req.headers:
				del req.headers['Accept-encoding']
			del req.headers['Youtubedl-no-compression']
		return req

	def http_response(self, req, resp):
		old_resp = resp
		# gzip: wrap the payload in a decompressing file object, keeping
		# the original headers, URL, code and msg.
		if resp.headers.get('Content-encoding', '') == 'gzip':
			gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		# deflate: decompress eagerly and serve from a StringIO buffer.
		if resp.headers.get('Content-encoding', '') == 'deflate':
			gz = StringIO.StringIO(self.deflate(resp.read()))
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		return resp
407
408
409 class FileDownloader(object):
410         """File Downloader class.
411
412         File downloader objects are the ones responsible of downloading the
413         actual video file and writing it to disk if the user has requested
414         it, among some other tasks. In most cases there should be one per
415         program. As, given a video URL, the downloader doesn't know how to
416         extract all the needed information, task that InfoExtractors do, it
417         has to pass the URL to one of them.
418
419         For this, file downloader objects have a method that allows
420         InfoExtractors to be registered in a given order. When it is passed
421         a URL, the file downloader handles it to the first InfoExtractor it
422         finds that reports being able to handle it. The InfoExtractor extracts
423         all the information about the video or videos the URL refers to, and
424         asks the FileDownloader to process the video information, possibly
425         downloading the video.
426
427         File downloaders accept a lot of parameters. In order not to saturate
428         the object constructor with arguments, it receives a dictionary of
429         options instead. These options are available through the params
430         attribute for the InfoExtractors to use. The FileDownloader also
431         registers itself as the downloader in charge for the InfoExtractors
432         that are added to it, so this is a "mutual registration".
433
434         Available options:
435
436         username:         Username for authentication purposes.
437         password:         Password for authentication purposes.
438         usenetrc:         Use netrc for authentication instead.
439         quiet:            Do not print messages to stdout.
440         forceurl:         Force printing final URL.
441         forcetitle:       Force printing title.
442         forcethumbnail:   Force printing thumbnail URL.
443         forcedescription: Force printing description.
444         forcefilename:    Force printing final filename.
445         simulate:         Do not download the video files.
446         format:           Video format code.
447         format_limit:     Highest quality format to try.
448         outtmpl:          Template for output names.
449         ignoreerrors:     Do not stop on download errors.
450         ratelimit:        Download speed limit, in bytes/sec.
451         nooverwrites:     Prevent overwriting files.
452         retries:          Number of times to retry for HTTP error 5xx
453         continuedl:       Try to continue downloads if possible.
454         noprogress:       Do not print the progress bar.
455         playliststart:    Playlist item to start at.
456         playlistend:      Playlist item to end at.
457         matchtitle:       Download only matching titles.
458         rejecttitle:      Reject downloads for matching titles.
459         logtostderr:      Log messages to stderr instead of stdout.
460         consoletitle:     Display progress in console window's titlebar.
461         nopart:           Do not use temporary .part files.
462         updatetime:       Use the Last-modified header to set output file timestamps.
463         writedescription: Write the video description to a .description file
464         writeinfojson:    Write the video description to a .info.json file
465         """
466
467         params = None
468         _ies = []
469         _pps = []
470         _download_retcode = None
471         _num_downloads = None
472         _screen_file = None
473
	def __init__(self, params):
		"""Create a FileDownloader object with the given options."""
		self._ies = []
		self._pps = []
		self._download_retcode = 0
		self._num_downloads = 0
		# Status messages go to stderr when 'logtostderr' is set, keeping
		# stdout clean (relies on bool indexing into the two-element list).
		self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
		self.params = params
482
483         @staticmethod
484         def format_bytes(bytes):
485                 if bytes is None:
486                         return 'N/A'
487                 if type(bytes) is str:
488                         bytes = float(bytes)
489                 if bytes == 0.0:
490                         exponent = 0
491                 else:
492                         exponent = long(math.log(bytes, 1024.0))
493                 suffix = 'bkMGTPEZY'[exponent]
494                 converted = float(bytes) / float(1024 ** exponent)
495                 return '%.2f%s' % (converted, suffix)
496
497         @staticmethod
498         def calc_percent(byte_counter, data_len):
499                 if data_len is None:
500                         return '---.-%'
501                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
502
503         @staticmethod
504         def calc_eta(start, now, total, current):
505                 if total is None:
506                         return '--:--'
507                 dif = now - start
508                 if current == 0 or dif < 0.001: # One millisecond
509                         return '--:--'
510                 rate = float(current) / dif
511                 eta = long((float(total) - float(current)) / rate)
512                 (eta_mins, eta_secs) = divmod(eta, 60)
513                 if eta_mins > 99:
514                         return '--:--'
515                 return '%02d:%02d' % (eta_mins, eta_secs)
516
517         @staticmethod
518         def calc_speed(start, now, bytes):
519                 dif = now - start
520                 if bytes == 0 or dif < 0.001: # One millisecond
521                         return '%10s' % '---b/s'
522                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
523
524         @staticmethod
525         def best_block_size(elapsed_time, bytes):
526                 new_min = max(bytes / 2.0, 1.0)
527                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
528                 if elapsed_time < 0.001:
529                         return long(new_max)
530                 rate = bytes / elapsed_time
531                 if rate > new_max:
532                         return long(new_max)
533                 if rate < new_min:
534                         return long(new_min)
535                 return long(rate)
536
537         @staticmethod
538         def parse_bytes(bytestr):
539                 """Parse a string indicating a byte quantity into a long integer."""
540                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
541                 if matchobj is None:
542                         return None
543                 number = float(matchobj.group(1))
544                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
545                 return long(round(number * multiplier))
546
	def add_info_extractor(self, ie):
		"""Add an InfoExtractor object to the end of the list."""
		self._ies.append(ie)
		# Mutual registration: the IE gets a reference back to this downloader.
		ie.set_downloader(self)
551
	def add_post_processor(self, pp):
		"""Add a PostProcessor object to the end of the chain."""
		self._pps.append(pp)
		# Mutual registration: the PP gets a reference back to this downloader.
		pp.set_downloader(self)
556
	def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode."""
		try:
			if not self.params.get('quiet', False):
				terminator = [u'\n', u''][skip_eol]
				# The trailing comma suppresses print's own newline; the
				# terminator chosen above controls the end of line instead.
				print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
			# Note: flushed even in quiet mode.
			self._screen_file.flush()
		except (UnicodeEncodeError), err:
			if not ignore_encoding_errors:
				raise
567
	def to_stderr(self, message):
		"""Print message to stderr."""
		# Encode with the locale's preferred encoding so non-ASCII text
		# does not raise UnicodeEncodeError on narrow terminals.
		print >>sys.stderr, message.encode(preferredencoding())
571
	def to_cons_title(self, message):
		"""Set console/terminal window title to message."""
		if not self.params.get('consoletitle', False):
			return
		if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
			# c_wchar_p() might not be necessary if `message` is
			# already of type unicode()
			ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
		elif 'TERM' in os.environ:
			# xterm-style title escape sequence: ESC ] 0 ; <title> BEL
			sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
582
	def fixed_template(self):
		"""Checks if the output template is fixed."""
		# "Fixed" means it contains no %(field)s placeholders, i.e. every
		# download would end up in the same file.
		return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
586
	def trouble(self, message=None):
		"""Determine action to take when a download problem appears.

		Depending on if the downloader has been configured to ignore
		download errors or not, this method may throw an exception or
		not when errors are found, after printing the message.
		"""
		if message is not None:
			self.to_stderr(message)
		if not self.params.get('ignoreerrors', False):
			raise DownloadError(message)
		# Only reached with 'ignoreerrors': remember a non-zero exit code.
		self._download_retcode = 1
599
600         def slow_down(self, start_time, byte_counter):
601                 """Sleep if the download speed is over the rate limit."""
602                 rate_limit = self.params.get('ratelimit', None)
603                 if rate_limit is None or byte_counter == 0:
604                         return
605                 now = time.time()
606                 elapsed = now - start_time
607                 if elapsed <= 0.0:
608                         return
609                 speed = float(byte_counter) / elapsed
610                 if speed > rate_limit:
611                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
612
613         def temp_name(self, filename):
614                 """Returns a temporary filename for the given filename."""
615                 if self.params.get('nopart', False) or filename == u'-' or \
616                                 (os.path.exists(filename) and not os.path.isfile(filename)):
617                         return filename
618                 return filename + u'.part'
619
620         def undo_temp_name(self, filename):
621                 if filename.endswith(u'.part'):
622                         return filename[:-len(u'.part')]
623                 return filename
624
	def try_rename(self, old_filename, new_filename):
		"""Rename old_filename to new_filename, reporting (not raising) failures."""
		try:
			if old_filename == new_filename:
				# Nothing to do (e.g. 'nopart' mode)
				return
			os.rename(old_filename, new_filename)
		except (IOError, OSError), err:
			self.trouble(u'ERROR: unable to rename file')
632
633         def try_utime(self, filename, last_modified_hdr):
634                 """Try to set the last-modified time of the given file."""
635                 if last_modified_hdr is None:
636                         return
637                 if not os.path.isfile(filename):
638                         return
639                 timestr = last_modified_hdr
640                 if timestr is None:
641                         return
642                 filetime = timeconvert(timestr)
643                 if filetime is None:
644                         return filetime
645                 try:
646                         os.utime(filename, (time.time(), filetime))
647                 except:
648                         pass
649                 return filetime
650
	def report_writedescription(self, descfn):
		""" Report that the description file is being written """
		# Encoding errors are ignored: this is a status message only.
		self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
654
	def report_writeinfojson(self, infofn):
		""" Report that the metadata file has been written """
		# Encoding errors are ignored: this is a status message only.
		self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
658
	def report_destination(self, filename):
		"""Report destination filename."""
		# Encoding errors are ignored: this is a status message only.
		self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
662
	def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
		"""Report download progress."""
		if self.params.get('noprogress', False):
			return
		# The leading \r plus skip_eol redraws the same console line in place.
		self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
				(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
		# Also mirror the progress into the terminal/console title bar.
		self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
				(percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
671
	def report_resuming_byte(self, resume_len):
		"""Report attempt to resume at given byte."""
		self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
675
	def report_retry(self, count, retries):
		"""Report retry in case of HTTP error 5xx"""
		self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
679
	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		try:
			self.to_screen(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			# Fall back to a message without the (unencodable) filename.
			self.to_screen(u'[download] The file has already been downloaded')
686
	def report_unable_to_resume(self):
		"""Report that resuming the partial download was not possible."""
		self.to_screen(u'[download] Unable to resume')
690
691         def report_finish(self):
692                 """Report download finished."""
693                 if self.params.get('noprogress', False):
694                         self.to_screen(u'[download] Download completed')
695                 else:
696                         self.to_screen(u'')
697
	def increment_downloads(self):
		"""Increment the ordinal that assigns a number to each file.

		The counter feeds the '%(autonumber)s' template field and the
		--max-downloads check.
		"""
		self._num_downloads += 1
701
702         def prepare_filename(self, info_dict):
703                 """Generate the output filename."""
704                 try:
705                         template_dict = dict(info_dict)
706                         template_dict['epoch'] = unicode(long(time.time()))
707                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
708                         filename = self.params['outtmpl'] % template_dict
709                         return filename
710                 except (ValueError, KeyError), err:
711                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
712                         return None
713
714         def _match_entry(self, info_dict):
715                 """ Returns None iff the file should be downloaded """
716
717                 title = info_dict['title']
718                 matchtitle = self.params.get('matchtitle', False)
719                 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
720                         return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
721                 rejecttitle = self.params.get('rejecttitle', False)
722                 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
723                         return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
724                 return None
725
	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor.

		Handles title matching, forced printings, simulation mode, writing
		of the description/metadata side files, the download itself and
		finally postprocessing.
		"""

		# Skip videos whose title fails --match-title / --reject-title.
		reason = self._match_entry(info_dict)
		if reason is not None:
			self.to_screen(u'[download] ' + reason)
			return

		# Enforce --max-downloads; raising aborts the whole run.
		max_downloads = self.params.get('max_downloads')
		if max_downloads is not None:
			if self._num_downloads > int(max_downloads):
				raise MaxDownloadsReached()

		filename = self.prepare_filename(info_dict)

		# Forced printings (--get-title, --get-url, ...).
		if self.params.get('forcetitle', False):
			print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forceurl', False):
			print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
			print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcedescription', False) and 'description' in info_dict:
			print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcefilename', False) and filename is not None:
			print filename.encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forceformat', False):
			print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')

		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			return

		# prepare_filename() already reported the template error.
		if filename is None:
			return

		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists and will be skipped')
			return

		# Create the containing directory if needed.
		try:
			dn = os.path.dirname(filename)
			if dn != '' and not os.path.exists(dn):
				os.makedirs(dn)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to create directory ' + unicode(err))
			return

		if self.params.get('writedescription', False):
			try:
				descfn = filename + '.description'
				self.report_writedescription(descfn)
				descfile = open(descfn, 'wb')
				try:
					# NOTE(review): raises an uncaught KeyError if the IE
					# supplied no 'description' field -- confirm every IE
					# sets it before relying on --write-description.
					descfile.write(info_dict['description'].encode('utf-8'))
				finally:
					descfile.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write description file ' + descfn)
				return

		if self.params.get('writeinfojson', False):
			infofn = filename + '.info.json'
			self.report_writeinfojson(infofn)
			# Probe for a usable json module (absent before Python 2.6).
			try:
				json.dump
			except (NameError,AttributeError):
				self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
				return
			try:
				infof = open(infofn, 'wb')
				try:
					# 'urlhandle' holds an open connection object and is not
					# JSON-serializable, so it is stripped out.
					json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
					json.dump(json_info_dict, infof)
				finally:
					infof.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
				return

		if not self.params.get('skip_download', False):
			try:
				success = self._do_download(filename, info_dict)
			except (OSError, IOError), err:
				raise UnavailableVideoError
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self.trouble(u'ERROR: unable to download video data: %s' % str(err))
				return
			except (ContentTooShortError, ), err:
				self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
				return

			if success:
				try:
					self.post_process(filename, info_dict)
				except (PostProcessingError), err:
					self.trouble(u'ERROR: postprocessing: %s' % str(err))
					return
824
825         def download(self, url_list):
826                 """Download a given list of URLs."""
827                 if len(url_list) > 1 and self.fixed_template():
828                         raise SameFileError(self.params['outtmpl'])
829
830                 for url in url_list:
831                         suitable_found = False
832                         for ie in self._ies:
833                                 # Go to next InfoExtractor if not suitable
834                                 if not ie.suitable(url):
835                                         continue
836
837                                 # Suitable InfoExtractor found
838                                 suitable_found = True
839
840                                 # Extract information from URL and process it
841                                 ie.extract(url)
842
843                                 # Suitable InfoExtractor had been found; go to next URL
844                                 break
845
846                         if not suitable_found:
847                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
848
849                 return self._download_retcode
850
851         def post_process(self, filename, ie_info):
852                 """Run the postprocessing chain on the given file."""
853                 info = dict(ie_info)
854                 info['filepath'] = filename
855                 for pp in self._pps:
856                         info = pp.run(info)
857                         if info is None:
858                                 break
859
	def _download_with_rtmpdump(self, filename, url, player_url):
		"""Download an rtmp:// URL by driving the external rtmpdump tool.

		Returns True on success, False after reporting any failure.
		"""
		self.report_destination(filename)
		tmpfilename = self.temp_name(filename)

		# Check for rtmpdump first
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrumpted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		# The [[], opts][bool] construct appends the options only when the
		# condition holds.
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(tmpfilename)
			self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			# Resume with '-e'; after exit code 1 also pass '-k 1'.
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(tmpfilename)
			# No progress and a clean-ish exit: stop retrying.
			if prevsize == cursize and retval == 1:
				break
			# Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
			if prevsize == cursize and retval == 2 and cursize > 1024:
				self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
				retval = 0
				break
		if retval == 0:
			self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
			self.try_rename(tmpfilename, filename)
			return True
		else:
			self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
			return False
896
897         def _do_download(self, filename, info_dict):
898                 url = info_dict['url']
899                 player_url = info_dict.get('player_url', None)
900
901                 # Check file already present
902                 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
903                         self.report_file_already_downloaded(filename)
904                         return True
905
906                 # Attempt to download using rtmpdump
907                 if url.startswith('rtmp'):
908                         return self._download_with_rtmpdump(filename, url, player_url)
909
910                 tmpfilename = self.temp_name(filename)
911                 stream = None
912
913                 # Do not include the Accept-Encoding header
914                 headers = {'Youtubedl-no-compression': 'True'}
915                 basic_request = urllib2.Request(url, None, headers)
916                 request = urllib2.Request(url, None, headers)
917
918                 # Establish possible resume length
919                 if os.path.isfile(tmpfilename):
920                         resume_len = os.path.getsize(tmpfilename)
921                 else:
922                         resume_len = 0
923
924                 open_mode = 'wb'
925                 if resume_len != 0:
926                         if self.params.get('continuedl', False):
927                                 self.report_resuming_byte(resume_len)
928                                 request.add_header('Range','bytes=%d-' % resume_len)
929                                 open_mode = 'ab'
930                         else:
931                                 resume_len = 0
932
933                 count = 0
934                 retries = self.params.get('retries', 0)
935                 while count <= retries:
936                         # Establish connection
937                         try:
938                                 if count == 0 and 'urlhandle' in info_dict:
939                                         data = info_dict['urlhandle']
940                                 data = urllib2.urlopen(request)
941                                 break
942                         except (urllib2.HTTPError, ), err:
943                                 if (err.code < 500 or err.code >= 600) and err.code != 416:
944                                         # Unexpected HTTP error
945                                         raise
946                                 elif err.code == 416:
947                                         # Unable to resume (requested range not satisfiable)
948                                         try:
949                                                 # Open the connection again without the range header
950                                                 data = urllib2.urlopen(basic_request)
951                                                 content_length = data.info()['Content-Length']
952                                         except (urllib2.HTTPError, ), err:
953                                                 if err.code < 500 or err.code >= 600:
954                                                         raise
955                                         else:
956                                                 # Examine the reported length
957                                                 if (content_length is not None and
958                                                                 (resume_len - 100 < long(content_length) < resume_len + 100)):
959                                                         # The file had already been fully downloaded.
960                                                         # Explanation to the above condition: in issue #175 it was revealed that
961                                                         # YouTube sometimes adds or removes a few bytes from the end of the file,
962                                                         # changing the file size slightly and causing problems for some users. So
963                                                         # I decided to implement a suggested change and consider the file
964                                                         # completely downloaded if the file size differs less than 100 bytes from
965                                                         # the one in the hard drive.
966                                                         self.report_file_already_downloaded(filename)
967                                                         self.try_rename(tmpfilename, filename)
968                                                         return True
969                                                 else:
970                                                         # The length does not match, we start the download over
971                                                         self.report_unable_to_resume()
972                                                         open_mode = 'wb'
973                                                         break
974                         # Retry
975                         count += 1
976                         if count <= retries:
977                                 self.report_retry(count, retries)
978
979                 if count > retries:
980                         self.trouble(u'ERROR: giving up after %s retries' % retries)
981                         return False
982
983                 data_len = data.info().get('Content-length', None)
984                 if data_len is not None:
985                         data_len = long(data_len) + resume_len
986                 data_len_str = self.format_bytes(data_len)
987                 byte_counter = 0 + resume_len
988                 block_size = 1024
989                 start = time.time()
990                 while True:
991                         # Download and write
992                         before = time.time()
993                         data_block = data.read(block_size)
994                         after = time.time()
995                         if len(data_block) == 0:
996                                 break
997                         byte_counter += len(data_block)
998
999                         # Open file just in time
1000                         if stream is None:
1001                                 try:
1002                                         (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1003                                         assert stream is not None
1004                                         filename = self.undo_temp_name(tmpfilename)
1005                                         self.report_destination(filename)
1006                                 except (OSError, IOError), err:
1007                                         self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1008                                         return False
1009                         try:
1010                                 stream.write(data_block)
1011                         except (IOError, OSError), err:
1012                                 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1013                                 return False
1014                         block_size = self.best_block_size(after - before, len(data_block))
1015
1016                         # Progress message
1017                         speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1018                         if data_len is None:
1019                                 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1020                         else:
1021                                 percent_str = self.calc_percent(byte_counter, data_len)
1022                                 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1023                                 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1024
1025                         # Apply rate limit
1026                         self.slow_down(start, byte_counter - resume_len)
1027
1028                 if stream is None:
1029                         self.trouble(u'\nERROR: Did not get any data blocks')
1030                         return False
1031                 stream.close()
1032                 self.report_finish()
1033                 if data_len is not None and byte_counter != data_len:
1034                         raise ContentTooShortError(byte_counter, long(data_len))
1035                 self.try_rename(tmpfilename, filename)
1036
1037                 # Update file modification time
1038                 if self.params.get('updatetime', True):
1039                         info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1040
1041                 return True
1042
1043
class InfoExtractor(object):
	"""Base class for information extractors.

	An information extractor turns a URL into one or more dictionaries
	describing the video(s) behind it (real video URL, title, simplified
	title, uploader, ...).  Each dictionary is handed to the
	FileDownloader, which may download the video to the file system,
	among other possible outcomes.  The dictionaries must include the
	following fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	Optional fields, used only when the respective forced-printing
	functions are called (their primary purpose is to let youtube-dl act
	as the backend of a video search function, such as the one in
	youtube2mp3):

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Subclasses should redefine _real_initialize() and _real_extract()
	and define a _VALID_URL regexp; they should usually also be added to
	the list of extractors.
	"""

	# Shared defaults; instances shadow these with their own attributes.
	_ready = False
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Return True if this IE can handle the given URL."""
		return re.match(self._VALID_URL, url) is not None

	def initialize(self):
		"""Perform lazy one-time initialization (authentication, etc)."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extract URL information and return it as a list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Attach the FileDownloader this IE reports to."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
1112
1113
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Matches youtu.be short links, watch/embed/v pages and bare video ids;
	# group 2 captures the video id.
	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	# Machine name looked up in the user's ~/.netrc for credentials.
	_NETRC_MACHINE = 'youtube'
	# Format (itag) codes, listed in order of quality
	_available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
	# Same, but ranking free (WebM) formats above equivalent MP4 ones.
	_available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
	# itag -> container extension used for the output filename.
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'44': 'webm',
		'45': 'webm',
	}
	# itag -> frame size, apparently listed as height x width (e.g. '22' is 720p).
	_video_dimensions = {
		'5': '240x400',
		'6': '???',
		'13': '???',
		'17': '144x176',
		'18': '360x640',
		'22': '720x1280',
		'34': '360x640',
		'35': '480x854',
		'37': '1080x1920',
		'38': '3072x4096',
		'43': '360x640',
		'44': '480x854',
		'45': '720x1280',
	}
	IE_NAME = u'youtube'
1152
	def report_lang(self):
		"""Report attempt to set language."""
		self._downloader.to_screen(u'[youtube] Setting language')
1156
	def report_login(self):
		"""Report attempt to log in."""
		self._downloader.to_screen(u'[youtube] Logging in')
1160
	def report_age_confirmation(self):
		"""Report attempt to confirm age (for age-gated videos)."""
		self._downloader.to_screen(u'[youtube] Confirming age')
1164
	def report_video_webpage_download(self, video_id):
		"""Report attempt to download the watch page of *video_id*."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1168
	def report_video_info_webpage_download(self, video_id):
		"""Report attempt to download the get_video_info page of *video_id*."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1172
	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1176
	def report_unavailable_format(self, video_id, format):
		"""Report that the requested format is not available for *video_id*."""
		self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1180
	def report_rtmp_download(self):
		"""Indicate the download will use the RTMP protocol."""
		self._downloader.to_screen(u'[youtube] RTMP download detected')
1184
1185         def _print_formats(self, formats):
1186                 print 'Available formats:'
1187                 for x in formats:
1188                         print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1189
	def _real_initialize(self):
		"""One-time YouTube session setup: language, optional login, age gate.

		Credentials come from --username/--password or, with --netrc, from
		the 'youtube' machine entry in ~/.netrc.  Failures to set the
		language or to log in are reported as warnings and do not abort.
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language (forces English pages so later regexps match)
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':         '/',
				'action_login': 'Log In',
				'username':     username,
				'password':     password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# The login form being served again means the login failed.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':             '/',
				'action_confirm':       'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return
1258
1259         def _real_extract(self, url):
1260                 # Extract video id from URL
1261                 mobj = re.match(self._VALID_URL, url)
1262                 if mobj is None:
1263                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1264                         return
1265                 video_id = mobj.group(2)
1266
1267                 # Get video webpage
1268                 self.report_video_webpage_download(video_id)
1269                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1270                 try:
1271                         video_webpage = urllib2.urlopen(request).read()
1272                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1273                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1274                         return
1275
1276                 # Attempt to extract SWF player URL
1277                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1278                 if mobj is not None:
1279                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1280                 else:
1281                         player_url = None
1282
1283                 # Get video info
1284                 self.report_video_info_webpage_download(video_id)
1285                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1286                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1287                                         % (video_id, el_type))
1288                         request = urllib2.Request(video_info_url)
1289                         try:
1290                                 video_info_webpage = urllib2.urlopen(request).read()
1291                                 video_info = parse_qs(video_info_webpage)
1292                                 if 'token' in video_info:
1293                                         break
1294                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1295                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1296                                 return
1297                 if 'token' not in video_info:
1298                         if 'reason' in video_info:
1299                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1300                         else:
1301                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1302                         return
1303
1304                 # Start extracting information
1305                 self.report_information_extraction(video_id)
1306
1307                 # uploader
1308                 if 'author' not in video_info:
1309                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1310                         return
1311                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1312
1313                 # title
1314                 if 'title' not in video_info:
1315                         self._downloader.trouble(u'ERROR: unable to extract video title')
1316                         return
1317                 video_title = urllib.unquote_plus(video_info['title'][0])
1318                 video_title = video_title.decode('utf-8')
1319                 video_title = sanitize_title(video_title)
1320
1321                 # simplified title
1322                 simple_title = _simplify_title(video_title)
1323
1324                 # thumbnail image
1325                 if 'thumbnail_url' not in video_info:
1326                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1327                         video_thumbnail = ''
1328                 else:   # don't panic if we can't find it
1329                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1330
1331                 # upload date
1332                 upload_date = u'NA'
1333                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1334                 if mobj is not None:
1335                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1336                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1337                         for expression in format_expressions:
1338                                 try:
1339                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1340                                 except:
1341                                         pass
1342
1343                 # description
1344                 try:
1345                         lxml.etree
1346                 except NameError:
1347                         video_description = u'No description available.'
1348                         if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1349                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1350                                 if mobj is not None:
1351                                         video_description = mobj.group(1).decode('utf-8')
1352                 else:
1353                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1354                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1355                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1356                         # TODO use another parser
1357
1358                 # token
1359                 video_token = urllib.unquote_plus(video_info['token'][0])
1360
1361                 # Decide which formats to download
1362                 req_format = self._downloader.params.get('format', None)
1363
1364                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1365                         self.report_rtmp_download()
1366                         video_url_list = [(None, video_info['conn'][0])]
1367                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1368                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1369                         url_data = [parse_qs(uds) for uds in url_data_strs]
1370                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1371                         url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1372
1373                         format_limit = self._downloader.params.get('format_limit', None)
1374                         available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1375                         if format_limit is not None and format_limit in available_formats:
1376                                 format_list = available_formats[available_formats.index(format_limit):]
1377                         else:
1378                                 format_list = available_formats
1379                         existing_formats = [x for x in format_list if x in url_map]
1380                         if len(existing_formats) == 0:
1381                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1382                                 return
1383                         if self._downloader.params.get('listformats', None):
1384                                 self._print_formats(existing_formats)
1385                                 return
1386                         if req_format is None or req_format == 'best':
1387                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1388                         elif req_format == 'worst':
1389                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1390                         elif req_format in ('-1', 'all'):
1391                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1392                         else:
1393                                 # Specific formats. We pick the first in a slash-delimeted sequence.
1394                                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1395                                 req_formats = req_format.split('/')
1396                                 video_url_list = None
1397                                 for rf in req_formats:
1398                                         if rf in url_map:
1399                                                 video_url_list = [(rf, url_map[rf])]
1400                                                 break
1401                                 if video_url_list is None:
1402                                         self._downloader.trouble(u'ERROR: requested format not available')
1403                                         return
1404                 else:
1405                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1406                         return
1407
1408                 for format_param, video_real_url in video_url_list:
1409                         # At this point we have a new video
1410                         self._downloader.increment_downloads()
1411
1412                         # Extension
1413                         video_extension = self._video_extensions.get(format_param, 'flv')
1414
1415                         try:
1416                                 # Process video information
1417                                 self._downloader.process_info({
1418                                         'id':           video_id.decode('utf-8'),
1419                                         'url':          video_real_url.decode('utf-8'),
1420                                         'uploader':     video_uploader.decode('utf-8'),
1421                                         'upload_date':  upload_date,
1422                                         'title':        video_title,
1423                                         'stitle':       simple_title,
1424                                         'ext':          video_extension.decode('utf-8'),
1425                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1426                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1427                                         'description':  video_description,
1428                                         'player_url':   player_url,
1429                                 })
1430                         except UnavailableVideoError, err:
1431                                 self._downloader.trouble(u'\nERROR: unable to download video')
1432
1433
1434 class MetacafeIE(InfoExtractor):
1435         """Information Extractor for metacafe.com."""
1436
1437         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1438         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1439         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1440         _youtube_ie = None
1441         IE_NAME = u'metacafe'
1442
1443         def __init__(self, youtube_ie, downloader=None):
1444                 InfoExtractor.__init__(self, downloader)
1445                 self._youtube_ie = youtube_ie
1446
1447         def report_disclaimer(self):
1448                 """Report disclaimer retrieval."""
1449                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1450
1451         def report_age_confirmation(self):
1452                 """Report attempt to confirm age."""
1453                 self._downloader.to_screen(u'[metacafe] Confirming age')
1454
1455         def report_download_webpage(self, video_id):
1456                 """Report webpage download."""
1457                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1458
1459         def report_extraction(self, video_id):
1460                 """Report information extraction."""
1461                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1462
1463         def _real_initialize(self):
1464                 # Retrieve disclaimer
1465                 request = urllib2.Request(self._DISCLAIMER)
1466                 try:
1467                         self.report_disclaimer()
1468                         disclaimer = urllib2.urlopen(request).read()
1469                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1470                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1471                         return
1472
1473                 # Confirm age
1474                 disclaimer_form = {
1475                         'filters': '0',
1476                         'submit': "Continue - I'm over 18",
1477                         }
1478                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1479                 try:
1480                         self.report_age_confirmation()
1481                         disclaimer = urllib2.urlopen(request).read()
1482                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1483                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1484                         return
1485
1486         def _real_extract(self, url):
1487                 # Extract id and simplified title from URL
1488                 mobj = re.match(self._VALID_URL, url)
1489                 if mobj is None:
1490                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1491                         return
1492
1493                 video_id = mobj.group(1)
1494
1495                 # Check if video comes from YouTube
1496                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1497                 if mobj2 is not None:
1498                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1499                         return
1500
1501                 # At this point we have a new video
1502                 self._downloader.increment_downloads()
1503
1504                 simple_title = mobj.group(2).decode('utf-8')
1505
1506                 # Retrieve video webpage to extract further information
1507                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1508                 try:
1509                         self.report_download_webpage(video_id)
1510                         webpage = urllib2.urlopen(request).read()
1511                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1512                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1513                         return
1514
1515                 # Extract URL, uploader and title from webpage
1516                 self.report_extraction(video_id)
1517                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1518                 if mobj is not None:
1519                         mediaURL = urllib.unquote(mobj.group(1))
1520                         video_extension = mediaURL[-3:]
1521
1522                         # Extract gdaKey if available
1523                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1524                         if mobj is None:
1525                                 video_url = mediaURL
1526                         else:
1527                                 gdaKey = mobj.group(1)
1528                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1529                 else:
1530                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1531                         if mobj is None:
1532                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1533                                 return
1534                         vardict = parse_qs(mobj.group(1))
1535                         if 'mediaData' not in vardict:
1536                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1537                                 return
1538                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1539                         if mobj is None:
1540                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1541                                 return
1542                         mediaURL = mobj.group(1).replace('\\/', '/')
1543                         video_extension = mediaURL[-3:]
1544                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1545
1546                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1547                 if mobj is None:
1548                         self._downloader.trouble(u'ERROR: unable to extract title')
1549                         return
1550                 video_title = mobj.group(1).decode('utf-8')
1551                 video_title = sanitize_title(video_title)
1552
1553                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1554                 if mobj is None:
1555                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1556                         return
1557                 video_uploader = mobj.group(1)
1558
1559                 try:
1560                         # Process video information
1561                         self._downloader.process_info({
1562                                 'id':           video_id.decode('utf-8'),
1563                                 'url':          video_url.decode('utf-8'),
1564                                 'uploader':     video_uploader.decode('utf-8'),
1565                                 'upload_date':  u'NA',
1566                                 'title':        video_title,
1567                                 'stitle':       simple_title,
1568                                 'ext':          video_extension.decode('utf-8'),
1569                                 'format':       u'NA',
1570                                 'player_url':   None,
1571                         })
1572                 except UnavailableVideoError:
1573                         self._downloader.trouble(u'\nERROR: unable to download video')
1574
1575
1576 class DailymotionIE(InfoExtractor):
1577         """Information Extractor for Dailymotion"""
1578
1579         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1580         IE_NAME = u'dailymotion'
1581
1582         def __init__(self, downloader=None):
1583                 InfoExtractor.__init__(self, downloader)
1584
1585         def report_download_webpage(self, video_id):
1586                 """Report webpage download."""
1587                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1588
1589         def report_extraction(self, video_id):
1590                 """Report information extraction."""
1591                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1592
1593         def _real_extract(self, url):
1594                 htmlParser = HTMLParser.HTMLParser()
1595                 
1596                 # Extract id and simplified title from URL
1597                 mobj = re.match(self._VALID_URL, url)
1598                 if mobj is None:
1599                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1600                         return
1601
1602                 # At this point we have a new video
1603                 self._downloader.increment_downloads()
1604                 video_id = mobj.group(1)
1605
1606                 video_extension = 'flv'
1607
1608                 # Retrieve video webpage to extract further information
1609                 request = urllib2.Request(url)
1610                 request.add_header('Cookie', 'family_filter=off')
1611                 try:
1612                         self.report_download_webpage(video_id)
1613                         webpage = urllib2.urlopen(request).read()
1614                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1615                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1616                         return
1617
1618                 # Extract URL, uploader and title from webpage
1619                 self.report_extraction(video_id)
1620                 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1621                 if mobj is None:
1622                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1623                         return
1624                 sequence = urllib.unquote(mobj.group(1))
1625                 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1626                 if mobj is None:
1627                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1628                         return
1629                 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1630
1631                 # if needed add http://www.dailymotion.com/ if relative URL
1632
1633                 video_url = mediaURL
1634
1635                 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
1636                 if mobj is None:
1637                         self._downloader.trouble(u'ERROR: unable to extract title')
1638                         return
1639                 video_title = htmlParser.unescape(mobj.group('title')).decode('utf-8')
1640                 video_title = sanitize_title(video_title)
1641                 simple_title = _simplify_title(video_title)
1642
1643                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1644                 if mobj is None:
1645                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1646                         return
1647                 video_uploader = mobj.group(1)
1648
1649                 try:
1650                         # Process video information
1651                         self._downloader.process_info({
1652                                 'id':           video_id.decode('utf-8'),
1653                                 'url':          video_url.decode('utf-8'),
1654                                 'uploader':     video_uploader.decode('utf-8'),
1655                                 'upload_date':  u'NA',
1656                                 'title':        video_title,
1657                                 'stitle':       simple_title,
1658                                 'ext':          video_extension.decode('utf-8'),
1659                                 'format':       u'NA',
1660                                 'player_url':   None,
1661                         })
1662                 except UnavailableVideoError:
1663                         self._downloader.trouble(u'\nERROR: unable to download video')
1664
1665
1666 class GoogleIE(InfoExtractor):
1667         """Information extractor for video.google.com."""
1668
1669         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1670         IE_NAME = u'video.google'
1671
1672         def __init__(self, downloader=None):
1673                 InfoExtractor.__init__(self, downloader)
1674
1675         def report_download_webpage(self, video_id):
1676                 """Report webpage download."""
1677                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1678
1679         def report_extraction(self, video_id):
1680                 """Report information extraction."""
1681                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1682
1683         def _real_extract(self, url):
1684                 # Extract id from URL
1685                 mobj = re.match(self._VALID_URL, url)
1686                 if mobj is None:
1687                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1688                         return
1689
1690                 # At this point we have a new video
1691                 self._downloader.increment_downloads()
1692                 video_id = mobj.group(1)
1693
1694                 video_extension = 'mp4'
1695
1696                 # Retrieve video webpage to extract further information
1697                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1698                 try:
1699                         self.report_download_webpage(video_id)
1700                         webpage = urllib2.urlopen(request).read()
1701                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1702                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1703                         return
1704
1705                 # Extract URL, uploader, and title from webpage
1706                 self.report_extraction(video_id)
1707                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1708                 if mobj is None:
1709                         video_extension = 'flv'
1710                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1711                 if mobj is None:
1712                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1713                         return
1714                 mediaURL = urllib.unquote(mobj.group(1))
1715                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1716                 mediaURL = mediaURL.replace('\\x26', '\x26')
1717
1718                 video_url = mediaURL
1719
1720                 mobj = re.search(r'<title>(.*)</title>', webpage)
1721                 if mobj is None:
1722                         self._downloader.trouble(u'ERROR: unable to extract title')
1723                         return
1724                 video_title = mobj.group(1).decode('utf-8')
1725                 video_title = sanitize_title(video_title)
1726                 simple_title = _simplify_title(video_title)
1727
1728                 # Extract video description
1729                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1730                 if mobj is None:
1731                         self._downloader.trouble(u'ERROR: unable to extract video description')
1732                         return
1733                 video_description = mobj.group(1).decode('utf-8')
1734                 if not video_description:
1735                         video_description = 'No description available.'
1736
1737                 # Extract video thumbnail
1738                 if self._downloader.params.get('forcethumbnail', False):
1739                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1740                         try:
1741                                 webpage = urllib2.urlopen(request).read()
1742                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1743                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1744                                 return
1745                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1746                         if mobj is None:
1747                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1748                                 return
1749                         video_thumbnail = mobj.group(1)
1750                 else:   # we need something to pass to process_info
1751                         video_thumbnail = ''
1752
1753                 try:
1754                         # Process video information
1755                         self._downloader.process_info({
1756                                 'id':           video_id.decode('utf-8'),
1757                                 'url':          video_url.decode('utf-8'),
1758                                 'uploader':     u'NA',
1759                                 'upload_date':  u'NA',
1760                                 'title':        video_title,
1761                                 'stitle':       simple_title,
1762                                 'ext':          video_extension.decode('utf-8'),
1763                                 'format':       u'NA',
1764                                 'player_url':   None,
1765                         })
1766                 except UnavailableVideoError:
1767                         self._downloader.trouble(u'\nERROR: unable to download video')
1768
1769
1770 class PhotobucketIE(InfoExtractor):
1771         """Information extractor for photobucket.com."""
1772
1773         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1774         IE_NAME = u'photobucket'
1775
1776         def __init__(self, downloader=None):
1777                 InfoExtractor.__init__(self, downloader)
1778
1779         def report_download_webpage(self, video_id):
1780                 """Report webpage download."""
1781                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1782
1783         def report_extraction(self, video_id):
1784                 """Report information extraction."""
1785                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1786
1787         def _real_extract(self, url):
1788                 # Extract id from URL
1789                 mobj = re.match(self._VALID_URL, url)
1790                 if mobj is None:
1791                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1792                         return
1793
1794                 # At this point we have a new video
1795                 self._downloader.increment_downloads()
1796                 video_id = mobj.group(1)
1797
1798                 video_extension = 'flv'
1799
1800                 # Retrieve video webpage to extract further information
1801                 request = urllib2.Request(url)
1802                 try:
1803                         self.report_download_webpage(video_id)
1804                         webpage = urllib2.urlopen(request).read()
1805                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1806                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1807                         return
1808
1809                 # Extract URL, uploader, and title from webpage
1810                 self.report_extraction(video_id)
1811                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1812                 if mobj is None:
1813                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1814                         return
1815                 mediaURL = urllib.unquote(mobj.group(1))
1816
1817                 video_url = mediaURL
1818
1819                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1820                 if mobj is None:
1821                         self._downloader.trouble(u'ERROR: unable to extract title')
1822                         return
1823                 video_title = mobj.group(1).decode('utf-8')
1824                 video_title = sanitize_title(video_title)
1825                 simple_title = _simplify_title(vide_title)
1826
1827                 video_uploader = mobj.group(2).decode('utf-8')
1828
1829                 try:
1830                         # Process video information
1831                         self._downloader.process_info({
1832                                 'id':           video_id.decode('utf-8'),
1833                                 'url':          video_url.decode('utf-8'),
1834                                 'uploader':     video_uploader,
1835                                 'upload_date':  u'NA',
1836                                 'title':        video_title,
1837                                 'stitle':       simple_title,
1838                                 'ext':          video_extension.decode('utf-8'),
1839                                 'format':       u'NA',
1840                                 'player_url':   None,
1841                         })
1842                 except UnavailableVideoError:
1843                         self._downloader.trouble(u'\nERROR: unable to download video')
1844
1845
1846 class YahooIE(InfoExtractor):
1847         """Information extractor for video.yahoo.com."""
1848
1849         # _VALID_URL matches all Yahoo! Video URLs
1850         # _VPAGE_URL matches only the extractable '/watch/' URLs
1851         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1852         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1853         IE_NAME = u'video.yahoo'
1854
1855         def __init__(self, downloader=None):
1856                 InfoExtractor.__init__(self, downloader)
1857
1858         def report_download_webpage(self, video_id):
1859                 """Report webpage download."""
1860                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1861
1862         def report_extraction(self, video_id):
1863                 """Report information extraction."""
1864                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1865
1866         def _real_extract(self, url, new_video=True):
1867                 # Extract ID from URL
1868                 mobj = re.match(self._VALID_URL, url)
1869                 if mobj is None:
1870                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1871                         return
1872
1873                 # At this point we have a new video
1874                 self._downloader.increment_downloads()
1875                 video_id = mobj.group(2)
1876                 video_extension = 'flv'
1877
1878                 # Rewrite valid but non-extractable URLs as
1879                 # extractable English language /watch/ URLs
1880                 if re.match(self._VPAGE_URL, url) is None:
1881                         request = urllib2.Request(url)
1882                         try:
1883                                 webpage = urllib2.urlopen(request).read()
1884                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1885                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1886                                 return
1887
1888                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1889                         if mobj is None:
1890                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1891                                 return
1892                         yahoo_id = mobj.group(1)
1893
1894                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1895                         if mobj is None:
1896                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1897                                 return
1898                         yahoo_vid = mobj.group(1)
1899
1900                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1901                         return self._real_extract(url, new_video=False)
1902
1903                 # Retrieve video webpage to extract further information
1904                 request = urllib2.Request(url)
1905                 try:
1906                         self.report_download_webpage(video_id)
1907                         webpage = urllib2.urlopen(request).read()
1908                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1909                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1910                         return
1911
1912                 # Extract uploader and title from webpage
1913                 self.report_extraction(video_id)
1914                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1915                 if mobj is None:
1916                         self._downloader.trouble(u'ERROR: unable to extract video title')
1917                         return
1918                 video_title = mobj.group(1).decode('utf-8')
1919                 simple_title = _simplify_title(video_title)
1920
1921                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1922                 if mobj is None:
1923                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1924                         return
1925                 video_uploader = mobj.group(1).decode('utf-8')
1926
1927                 # Extract video thumbnail
1928                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1929                 if mobj is None:
1930                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1931                         return
1932                 video_thumbnail = mobj.group(1).decode('utf-8')
1933
1934                 # Extract video description
1935                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1936                 if mobj is None:
1937                         self._downloader.trouble(u'ERROR: unable to extract video description')
1938                         return
1939                 video_description = mobj.group(1).decode('utf-8')
1940                 if not video_description:
1941                         video_description = 'No description available.'
1942
1943                 # Extract video height and width
1944                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1945                 if mobj is None:
1946                         self._downloader.trouble(u'ERROR: unable to extract video height')
1947                         return
1948                 yv_video_height = mobj.group(1)
1949
1950                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1951                 if mobj is None:
1952                         self._downloader.trouble(u'ERROR: unable to extract video width')
1953                         return
1954                 yv_video_width = mobj.group(1)
1955
1956                 # Retrieve video playlist to extract media URL
1957                 # I'm not completely sure what all these options are, but we
1958                 # seem to need most of them, otherwise the server sends a 401.
1959                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1960                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1961                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1962                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1963                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1964                 try:
1965                         self.report_download_webpage(video_id)
1966                         webpage = urllib2.urlopen(request).read()
1967                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1968                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1969                         return
1970
1971                 # Extract media URL from playlist XML
1972                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1973                 if mobj is None:
1974                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1975                         return
1976                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1977                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1978
1979                 try:
1980                         # Process video information
1981                         self._downloader.process_info({
1982                                 'id':           video_id.decode('utf-8'),
1983                                 'url':          video_url,
1984                                 'uploader':     video_uploader,
1985                                 'upload_date':  u'NA',
1986                                 'title':        video_title,
1987                                 'stitle':       simple_title,
1988                                 'ext':          video_extension.decode('utf-8'),
1989                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1990                                 'description':  video_description,
1991                                 'thumbnail':    video_thumbnail,
1992                                 'player_url':   None,
1993                         })
1994                 except UnavailableVideoError:
1995                         self._downloader.trouble(u'\nERROR: unable to download video')
1996
1997
1998 class VimeoIE(InfoExtractor):
1999         """Information extractor for vimeo.com."""
2000
2001         # _VALID_URL matches Vimeo URLs
2002         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2003         IE_NAME = u'vimeo'
2004
2005         def __init__(self, downloader=None):
2006                 InfoExtractor.__init__(self, downloader)
2007
2008         def report_download_webpage(self, video_id):
2009                 """Report webpage download."""
2010                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2011
2012         def report_extraction(self, video_id):
2013                 """Report information extraction."""
2014                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
2015
2016         def _real_extract(self, url, new_video=True):
2017                 # Extract ID from URL
2018                 mobj = re.match(self._VALID_URL, url)
2019                 if mobj is None:
2020                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2021                         return
2022
2023                 # At this point we have a new video
2024                 self._downloader.increment_downloads()
2025                 video_id = mobj.group(1)
2026
2027                 # Retrieve video webpage to extract further information
2028                 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
2029                 try:
2030                         self.report_download_webpage(video_id)
2031                         webpage = urllib2.urlopen(request).read()
2032                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2033                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2034                         return
2035
2036                 # Now we begin extracting as much information as we can from what we
2037                 # retrieved. First we extract the information common to all extractors,
2038                 # and latter we extract those that are Vimeo specific.
2039                 self.report_extraction(video_id)
2040
2041                 # Extract title
2042                 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2043                 if mobj is None:
2044                         self._downloader.trouble(u'ERROR: unable to extract video title')
2045                         return
2046                 video_title = mobj.group(1).decode('utf-8')
2047                 simple_title = _simplify_title(video_title)
2048
2049                 # Extract uploader
2050                 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2051                 if mobj is None:
2052                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
2053                         return
2054                 video_uploader = mobj.group(1).decode('utf-8')
2055
2056                 # Extract video thumbnail
2057                 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2058                 if mobj is None:
2059                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2060                         return
2061                 video_thumbnail = mobj.group(1).decode('utf-8')
2062
2063                 # # Extract video description
2064                 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2065                 # if mobj is None:
2066                 #       self._downloader.trouble(u'ERROR: unable to extract video description')
2067                 #       return
2068                 # video_description = mobj.group(1).decode('utf-8')
2069                 # if not video_description: video_description = 'No description available.'
2070                 video_description = 'Foo.'
2071
2072                 # Vimeo specific: extract request signature
2073                 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2074                 if mobj is None:
2075                         self._downloader.trouble(u'ERROR: unable to extract request signature')
2076                         return
2077                 sig = mobj.group(1).decode('utf-8')
2078
2079                 # Vimeo specific: extract video quality information
2080                 mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)
2081                 if mobj is None:
2082                         self._downloader.trouble(u'ERROR: unable to extract video quality information')
2083                         return
2084                 quality = mobj.group(1).decode('utf-8')
2085
2086                 if int(quality) == 1:
2087                         quality = 'hd'
2088                 else:
2089                         quality = 'sd'
2090
2091                 # Vimeo specific: Extract request signature expiration
2092                 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2093                 if mobj is None:
2094                         self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2095                         return
2096                 sig_exp = mobj.group(1).decode('utf-8')
2097
2098                 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)
2099
2100                 try:
2101                         # Process video information
2102                         self._downloader.process_info({
2103                                 'id':           video_id.decode('utf-8'),
2104                                 'url':          video_url,
2105                                 'uploader':     video_uploader,
2106                                 'upload_date':  u'NA',
2107                                 'title':        video_title,
2108                                 'stitle':       simple_title,
2109                                 'ext':          u'mp4',
2110                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2111                                 'description':  video_description,
2112                                 'thumbnail':    video_thumbnail,
2113                                 'description':  video_description,
2114                                 'player_url':   None,
2115                         })
2116                 except UnavailableVideoError:
2117                         self._downloader.trouble(u'ERROR: unable to download video')
2118
2119
2120 class GenericIE(InfoExtractor):
2121         """Generic last-resort information extractor."""
2122
2123         _VALID_URL = r'.*'
2124         IE_NAME = u'generic'
2125
2126         def __init__(self, downloader=None):
2127                 InfoExtractor.__init__(self, downloader)
2128
2129         def report_download_webpage(self, video_id):
2130                 """Report webpage download."""
2131                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2132                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2133
2134         def report_extraction(self, video_id):
2135                 """Report information extraction."""
2136                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2137
2138         def _real_extract(self, url):
2139                 # At this point we have a new video
2140                 self._downloader.increment_downloads()
2141
2142                 video_id = url.split('/')[-1]
2143                 request = urllib2.Request(url)
2144                 try:
2145                         self.report_download_webpage(video_id)
2146                         webpage = urllib2.urlopen(request).read()
2147                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2148                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2149                         return
2150                 except ValueError, err:
2151                         # since this is the last-resort InfoExtractor, if
2152                         # this error is thrown, it'll be thrown here
2153                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2154                         return
2155
2156                 self.report_extraction(video_id)
2157                 # Start with something easy: JW Player in SWFObject
2158                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2159                 if mobj is None:
2160                         # Broaden the search a little bit
2161                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2162                 if mobj is None:
2163                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2164                         return
2165
2166                 # It's possible that one of the regexes
2167                 # matched, but returned an empty group:
2168                 if mobj.group(1) is None:
2169                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2170                         return
2171
2172                 video_url = urllib.unquote(mobj.group(1))
2173                 video_id = os.path.basename(video_url)
2174
2175                 # here's a fun little line of code for you:
2176                 video_extension = os.path.splitext(video_id)[1][1:]
2177                 video_id = os.path.splitext(video_id)[0]
2178
2179                 # it's tempting to parse this further, but you would
2180                 # have to take into account all the variations like
2181                 #   Video Title - Site Name
2182                 #   Site Name | Video Title
2183                 #   Video Title - Tagline | Site Name
2184                 # and so on and so forth; it's just not practical
2185                 mobj = re.search(r'<title>(.*)</title>', webpage)
2186                 if mobj is None:
2187                         self._downloader.trouble(u'ERROR: unable to extract title')
2188                         return
2189                 video_title = mobj.group(1).decode('utf-8')
2190                 video_title = sanitize_title(video_title)
2191                 simple_title = _simplify_title(video_title)
2192
2193                 # video uploader is domain name
2194                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2195                 if mobj is None:
2196                         self._downloader.trouble(u'ERROR: unable to extract title')
2197                         return
2198                 video_uploader = mobj.group(1).decode('utf-8')
2199
2200                 try:
2201                         # Process video information
2202                         self._downloader.process_info({
2203                                 'id':           video_id.decode('utf-8'),
2204                                 'url':          video_url.decode('utf-8'),
2205                                 'uploader':     video_uploader,
2206                                 'upload_date':  u'NA',
2207                                 'title':        video_title,
2208                                 'stitle':       simple_title,
2209                                 'ext':          video_extension.decode('utf-8'),
2210                                 'format':       u'NA',
2211                                 'player_url':   None,
2212                         })
2213                 except UnavailableVideoError, err:
2214                         self._downloader.trouble(u'\nERROR: unable to download video')
2215
2216
2217 class YoutubeSearchIE(InfoExtractor):
2218         """Information Extractor for YouTube search queries."""
2219         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2220         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2221         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2222         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2223         _youtube_ie = None
2224         _max_youtube_results = 1000
2225         IE_NAME = u'youtube:search'
2226
2227         def __init__(self, youtube_ie, downloader=None):
2228                 InfoExtractor.__init__(self, downloader)
2229                 self._youtube_ie = youtube_ie
2230
2231         def report_download_page(self, query, pagenum):
2232                 """Report attempt to download playlist page with given number."""
2233                 query = query.decode(preferredencoding())
2234                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2235
2236         def _real_initialize(self):
2237                 self._youtube_ie.initialize()
2238
2239         def _real_extract(self, query):
2240                 mobj = re.match(self._VALID_URL, query)
2241                 if mobj is None:
2242                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2243                         return
2244
2245                 prefix, query = query.split(':')
2246                 prefix = prefix[8:]
2247                 query = query.encode('utf-8')
2248                 if prefix == '':
2249                         self._download_n_results(query, 1)
2250                         return
2251                 elif prefix == 'all':
2252                         self._download_n_results(query, self._max_youtube_results)
2253                         return
2254                 else:
2255                         try:
2256                                 n = long(prefix)
2257                                 if n <= 0:
2258                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2259                                         return
2260                                 elif n > self._max_youtube_results:
2261                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2262                                         n = self._max_youtube_results
2263                                 self._download_n_results(query, n)
2264                                 return
2265                         except ValueError: # parsing prefix as integer fails
2266                                 self._download_n_results(query, 1)
2267                                 return
2268
2269         def _download_n_results(self, query, n):
2270                 """Downloads a specified number of results for a query"""
2271
2272                 video_ids = []
2273                 already_seen = set()
2274                 pagenum = 1
2275
2276                 while True:
2277                         self.report_download_page(query, pagenum)
2278                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2279                         request = urllib2.Request(result_url)
2280                         try:
2281                                 page = urllib2.urlopen(request).read()
2282                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2283                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2284                                 return
2285
2286                         # Extract video identifiers
2287                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2288                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2289                                 if video_id not in already_seen:
2290                                         video_ids.append(video_id)
2291                                         already_seen.add(video_id)
2292                                         if len(video_ids) == n:
2293                                                 # Specified n videos reached
2294                                                 for id in video_ids:
2295                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2296                                                 return
2297
2298                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2299                                 for id in video_ids:
2300                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2301                                 return
2302
2303                         pagenum = pagenum + 1
2304
2305
2306 class GoogleSearchIE(InfoExtractor):
2307         """Information Extractor for Google Video search queries."""
2308         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2309         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2310         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2311         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2312         _google_ie = None
2313         _max_google_results = 1000
2314         IE_NAME = u'video.google:search'
2315
2316         def __init__(self, google_ie, downloader=None):
2317                 InfoExtractor.__init__(self, downloader)
2318                 self._google_ie = google_ie
2319
2320         def report_download_page(self, query, pagenum):
2321                 """Report attempt to download playlist page with given number."""
2322                 query = query.decode(preferredencoding())
2323                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2324
2325         def _real_initialize(self):
2326                 self._google_ie.initialize()
2327
2328         def _real_extract(self, query):
2329                 mobj = re.match(self._VALID_URL, query)
2330                 if mobj is None:
2331                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2332                         return
2333
2334                 prefix, query = query.split(':')
2335                 prefix = prefix[8:]
2336                 query = query.encode('utf-8')
2337                 if prefix == '':
2338                         self._download_n_results(query, 1)
2339                         return
2340                 elif prefix == 'all':
2341                         self._download_n_results(query, self._max_google_results)
2342                         return
2343                 else:
2344                         try:
2345                                 n = long(prefix)
2346                                 if n <= 0:
2347                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2348                                         return
2349                                 elif n > self._max_google_results:
2350                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2351                                         n = self._max_google_results
2352                                 self._download_n_results(query, n)
2353                                 return
2354                         except ValueError: # parsing prefix as integer fails
2355                                 self._download_n_results(query, 1)
2356                                 return
2357
2358         def _download_n_results(self, query, n):
2359                 """Downloads a specified number of results for a query"""
2360
2361                 video_ids = []
2362                 already_seen = set()
2363                 pagenum = 1
2364
2365                 while True:
2366                         self.report_download_page(query, pagenum)
2367                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2368                         request = urllib2.Request(result_url)
2369                         try:
2370                                 page = urllib2.urlopen(request).read()
2371                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2372                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2373                                 return
2374
2375                         # Extract video identifiers
2376                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2377                                 video_id = mobj.group(1)
2378                                 if video_id not in already_seen:
2379                                         video_ids.append(video_id)
2380                                         already_seen.add(video_id)
2381                                         if len(video_ids) == n:
2382                                                 # Specified n videos reached
2383                                                 for id in video_ids:
2384                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2385                                                 return
2386
2387                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2388                                 for id in video_ids:
2389                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2390                                 return
2391
2392                         pagenum = pagenum + 1
2393
2394
2395 class YahooSearchIE(InfoExtractor):
2396         """Information Extractor for Yahoo! Video search queries."""
2397         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2398         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2399         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2400         _MORE_PAGES_INDICATOR = r'\s*Next'
2401         _yahoo_ie = None
2402         _max_yahoo_results = 1000
2403         IE_NAME = u'video.yahoo:search'
2404
2405         def __init__(self, yahoo_ie, downloader=None):
2406                 InfoExtractor.__init__(self, downloader)
2407                 self._yahoo_ie = yahoo_ie
2408
2409         def report_download_page(self, query, pagenum):
2410                 """Report attempt to download playlist page with given number."""
2411                 query = query.decode(preferredencoding())
2412                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2413
2414         def _real_initialize(self):
2415                 self._yahoo_ie.initialize()
2416
2417         def _real_extract(self, query):
2418                 mobj = re.match(self._VALID_URL, query)
2419                 if mobj is None:
2420                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2421                         return
2422
2423                 prefix, query = query.split(':')
2424                 prefix = prefix[8:]
2425                 query = query.encode('utf-8')
2426                 if prefix == '':
2427                         self._download_n_results(query, 1)
2428                         return
2429                 elif prefix == 'all':
2430                         self._download_n_results(query, self._max_yahoo_results)
2431                         return
2432                 else:
2433                         try:
2434                                 n = long(prefix)
2435                                 if n <= 0:
2436                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2437                                         return
2438                                 elif n > self._max_yahoo_results:
2439                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2440                                         n = self._max_yahoo_results
2441                                 self._download_n_results(query, n)
2442                                 return
2443                         except ValueError: # parsing prefix as integer fails
2444                                 self._download_n_results(query, 1)
2445                                 return
2446
2447         def _download_n_results(self, query, n):
2448                 """Downloads a specified number of results for a query"""
2449
2450                 video_ids = []
2451                 already_seen = set()
2452                 pagenum = 1
2453
2454                 while True:
2455                         self.report_download_page(query, pagenum)
2456                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2457                         request = urllib2.Request(result_url)
2458                         try:
2459                                 page = urllib2.urlopen(request).read()
2460                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2461                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2462                                 return
2463
2464                         # Extract video identifiers
2465                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2466                                 video_id = mobj.group(1)
2467                                 if video_id not in already_seen:
2468                                         video_ids.append(video_id)
2469                                         already_seen.add(video_id)
2470                                         if len(video_ids) == n:
2471                                                 # Specified n videos reached
2472                                                 for id in video_ids:
2473                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2474                                                 return
2475
2476                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2477                                 for id in video_ids:
2478                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2479                                 return
2480
2481                         pagenum = pagenum + 1
2482
2483
2484 class YoutubePlaylistIE(InfoExtractor):
2485         """Information Extractor for YouTube playlists."""
2486
2487         _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2488         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2489         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2490         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2491         _youtube_ie = None
2492         IE_NAME = u'youtube:playlist'
2493
2494         def __init__(self, youtube_ie, downloader=None):
2495                 InfoExtractor.__init__(self, downloader)
2496                 self._youtube_ie = youtube_ie
2497
2498         def report_download_page(self, playlist_id, pagenum):
2499                 """Report attempt to download playlist page with given number."""
2500                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2501
2502         def _real_initialize(self):
2503                 self._youtube_ie.initialize()
2504
2505         def _real_extract(self, url):
2506                 # Extract playlist id
2507                 mobj = re.match(self._VALID_URL, url)
2508                 if mobj is None:
2509                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2510                         return
2511
2512                 # Single video case
2513                 if mobj.group(3) is not None:
2514                         self._youtube_ie.extract(mobj.group(3))
2515                         return
2516
2517                 # Download playlist pages
2518                 # prefix is 'p' as default for playlists but there are other types that need extra care
2519                 playlist_prefix = mobj.group(1)
2520                 if playlist_prefix == 'a':
2521                         playlist_access = 'artist'
2522                 else:
2523                         playlist_prefix = 'p'
2524                         playlist_access = 'view_play_list'
2525                 playlist_id = mobj.group(2)
2526                 video_ids = []
2527                 pagenum = 1
2528
2529                 while True:
2530                         self.report_download_page(playlist_id, pagenum)
2531                         url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2532                         request = urllib2.Request(url)
2533                         try:
2534                                 page = urllib2.urlopen(request).read()
2535                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2536                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2537                                 return
2538
2539                         # Extract video identifiers
2540                         ids_in_page = []
2541                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2542                                 if mobj.group(1) not in ids_in_page:
2543                                         ids_in_page.append(mobj.group(1))
2544                         video_ids.extend(ids_in_page)
2545
2546                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2547                                 break
2548                         pagenum = pagenum + 1
2549
2550                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2551                 playlistend = self._downloader.params.get('playlistend', -1)
2552                 video_ids = video_ids[playliststart:playlistend]
2553
2554                 for id in video_ids:
2555                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2556                 return
2557
2558
2559 class YoutubeUserIE(InfoExtractor):
2560         """Information Extractor for YouTube users."""
2561
2562         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2563         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2564         _GDATA_PAGE_SIZE = 50
2565         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2566         _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2567         _youtube_ie = None
2568         IE_NAME = u'youtube:user'
2569
2570         def __init__(self, youtube_ie, downloader=None):
2571                 InfoExtractor.__init__(self, downloader)
2572                 self._youtube_ie = youtube_ie
2573
2574         def report_download_page(self, username, start_index):
2575                 """Report attempt to download user page."""
2576                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2577                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2578
2579         def _real_initialize(self):
2580                 self._youtube_ie.initialize()
2581
2582         def _real_extract(self, url):
2583                 # Extract username
2584                 mobj = re.match(self._VALID_URL, url)
2585                 if mobj is None:
2586                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2587                         return
2588
2589                 username = mobj.group(1)
2590
2591                 # Download video ids using YouTube Data API. Result size per
2592                 # query is limited (currently to 50 videos) so we need to query
2593                 # page by page until there are no video ids - it means we got
2594                 # all of them.
2595
2596                 video_ids = []
2597                 pagenum = 0
2598
2599                 while True:
2600                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2601                         self.report_download_page(username, start_index)
2602
2603                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2604
2605                         try:
2606                                 page = urllib2.urlopen(request).read()
2607                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2608                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2609                                 return
2610
2611                         # Extract video identifiers
2612                         ids_in_page = []
2613
2614                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2615                                 if mobj.group(1) not in ids_in_page:
2616                                         ids_in_page.append(mobj.group(1))
2617
2618                         video_ids.extend(ids_in_page)
2619
2620                         # A little optimization - if current page is not
2621                         # "full", ie. does not contain PAGE_SIZE video ids then
2622                         # we can assume that this page is the last one - there
2623                         # are no more ids on further pages - no need to query
2624                         # again.
2625
2626                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2627                                 break
2628
2629                         pagenum += 1
2630
2631                 all_ids_count = len(video_ids)
2632                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2633                 playlistend = self._downloader.params.get('playlistend', -1)
2634
2635                 if playlistend == -1:
2636                         video_ids = video_ids[playliststart:]
2637                 else:
2638                         video_ids = video_ids[playliststart:playlistend]
2639
2640                 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2641                                 (username, all_ids_count, len(video_ids)))
2642
2643                 for video_id in video_ids:
2644                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2645
2646
2647 class DepositFilesIE(InfoExtractor):
2648         """Information extractor for depositfiles.com"""
2649
2650         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2651         IE_NAME = u'DepositFiles'
2652
2653         def __init__(self, downloader=None):
2654                 InfoExtractor.__init__(self, downloader)
2655
2656         def report_download_webpage(self, file_id):
2657                 """Report webpage download."""
2658                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2659
2660         def report_extraction(self, file_id):
2661                 """Report information extraction."""
2662                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2663
2664         def _real_extract(self, url):
2665                 # At this point we have a new file
2666                 self._downloader.increment_downloads()
2667
2668                 file_id = url.split('/')[-1]
2669                 # Rebuild url in english locale
2670                 url = 'http://depositfiles.com/en/files/' + file_id
2671
2672                 # Retrieve file webpage with 'Free download' button pressed
2673                 free_download_indication = { 'gateway_result' : '1' }
2674                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2675                 try:
2676                         self.report_download_webpage(file_id)
2677                         webpage = urllib2.urlopen(request).read()
2678                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2679                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2680                         return
2681
2682                 # Search for the real file URL
2683                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2684                 if (mobj is None) or (mobj.group(1) is None):
2685                         # Try to figure out reason of the error.
2686                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2687                         if (mobj is not None) and (mobj.group(1) is not None):
2688                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2689                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2690                         else:
2691                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2692                         return
2693
2694                 file_url = mobj.group(1)
2695                 file_extension = os.path.splitext(file_url)[1][1:]
2696
2697                 # Search for file title
2698                 mobj = re.search(r'<b title="(.*?)">', webpage)
2699                 if mobj is None:
2700                         self._downloader.trouble(u'ERROR: unable to extract title')
2701                         return
2702                 file_title = mobj.group(1).decode('utf-8')
2703
2704                 try:
2705                         # Process file information
2706                         self._downloader.process_info({
2707                                 'id':           file_id.decode('utf-8'),
2708                                 'url':          file_url.decode('utf-8'),
2709                                 'uploader':     u'NA',
2710                                 'upload_date':  u'NA',
2711                                 'title':        file_title,
2712                                 'stitle':       file_title,
2713                                 'ext':          file_extension.decode('utf-8'),
2714                                 'format':       u'NA',
2715                                 'player_url':   None,
2716                         })
2717                 except UnavailableVideoError, err:
2718                         self._downloader.trouble(u'ERROR: unable to download file')
2719
2720
2721 class FacebookIE(InfoExtractor):
2722         """Information Extractor for Facebook"""
2723
2724         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2725         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2726         _NETRC_MACHINE = 'facebook'
2727         _available_formats = ['video', 'highqual', 'lowqual']
2728         _video_extensions = {
2729                 'video': 'mp4',
2730                 'highqual': 'mp4',
2731                 'lowqual': 'mp4',
2732         }
2733         IE_NAME = u'facebook'
2734
2735         def __init__(self, downloader=None):
2736                 InfoExtractor.__init__(self, downloader)
2737
2738         def _reporter(self, message):
2739                 """Add header and report message."""
2740                 self._downloader.to_screen(u'[facebook] %s' % message)
2741
2742         def report_login(self):
2743                 """Report attempt to log in."""
2744                 self._reporter(u'Logging in')
2745
2746         def report_video_webpage_download(self, video_id):
2747                 """Report attempt to download video webpage."""
2748                 self._reporter(u'%s: Downloading video webpage' % video_id)
2749
2750         def report_information_extraction(self, video_id):
2751                 """Report attempt to extract video information."""
2752                 self._reporter(u'%s: Extracting video information' % video_id)
2753
2754         def _parse_page(self, video_webpage):
2755                 """Extract video information from page"""
2756                 # General data
2757                 data = {'title': r'\("video_title", "(.*?)"\)',
2758                         'description': r'<div class="datawrap">(.*?)</div>',
2759                         'owner': r'\("video_owner_name", "(.*?)"\)',
2760                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2761                         }
2762                 video_info = {}
2763                 for piece in data.keys():
2764                         mobj = re.search(data[piece], video_webpage)
2765                         if mobj is not None:
2766                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2767
2768                 # Video urls
2769                 video_urls = {}
2770                 for fmt in self._available_formats:
2771                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2772                         if mobj is not None:
2773                                 # URL is in a Javascript segment inside an escaped Unicode format within
2774                                 # the generally utf-8 page
2775                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2776                 video_info['video_urls'] = video_urls
2777
2778                 return video_info
2779
2780         def _real_initialize(self):
2781                 if self._downloader is None:
2782                         return
2783
2784                 useremail = None
2785                 password = None
2786                 downloader_params = self._downloader.params
2787
2788                 # Attempt to use provided username and password or .netrc data
2789                 if downloader_params.get('username', None) is not None:
2790                         useremail = downloader_params['username']
2791                         password = downloader_params['password']
2792                 elif downloader_params.get('usenetrc', False):
2793                         try:
2794                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2795                                 if info is not None:
2796                                         useremail = info[0]
2797                                         password = info[2]
2798                                 else:
2799                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2800                         except (IOError, netrc.NetrcParseError), err:
2801                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2802                                 return
2803
2804                 if useremail is None:
2805                         return
2806
2807                 # Log in
2808                 login_form = {
2809                         'email': useremail,
2810                         'pass': password,
2811                         'login': 'Log+In'
2812                         }
2813                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2814                 try:
2815                         self.report_login()
2816                         login_results = urllib2.urlopen(request).read()
2817                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2818                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2819                                 return
2820                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2821                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2822                         return
2823
2824         def _real_extract(self, url):
2825                 mobj = re.match(self._VALID_URL, url)
2826                 if mobj is None:
2827                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2828                         return
2829                 video_id = mobj.group('ID')
2830
2831                 # Get video webpage
2832                 self.report_video_webpage_download(video_id)
2833                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2834                 try:
2835                         page = urllib2.urlopen(request)
2836                         video_webpage = page.read()
2837                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2838                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2839                         return
2840
2841                 # Start extracting information
2842                 self.report_information_extraction(video_id)
2843
2844                 # Extract information
2845                 video_info = self._parse_page(video_webpage)
2846
2847                 # uploader
2848                 if 'owner' not in video_info:
2849                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2850                         return
2851                 video_uploader = video_info['owner']
2852
2853                 # title
2854                 if 'title' not in video_info:
2855                         self._downloader.trouble(u'ERROR: unable to extract video title')
2856                         return
2857                 video_title = video_info['title']
2858                 video_title = video_title.decode('utf-8')
2859                 video_title = sanitize_title(video_title)
2860
2861                 simple_title = _simplify_title(video_title)
2862
2863                 # thumbnail image
2864                 if 'thumbnail' not in video_info:
2865                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2866                         video_thumbnail = ''
2867                 else:
2868                         video_thumbnail = video_info['thumbnail']
2869
2870                 # upload date
2871                 upload_date = u'NA'
2872                 if 'upload_date' in video_info:
2873                         upload_time = video_info['upload_date']
2874                         timetuple = email.utils.parsedate_tz(upload_time)
2875                         if timetuple is not None:
2876                                 try:
2877                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2878                                 except:
2879                                         pass
2880
2881                 # description
2882                 video_description = video_info.get('description', 'No description available.')
2883
2884                 url_map = video_info['video_urls']
2885                 if len(url_map.keys()) > 0:
2886                         # Decide which formats to download
2887                         req_format = self._downloader.params.get('format', None)
2888                         format_limit = self._downloader.params.get('format_limit', None)
2889
2890                         if format_limit is not None and format_limit in self._available_formats:
2891                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2892                         else:
2893                                 format_list = self._available_formats
2894                         existing_formats = [x for x in format_list if x in url_map]
2895                         if len(existing_formats) == 0:
2896                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2897                                 return
2898                         if req_format is None:
2899                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2900                         elif req_format == 'worst':
2901                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2902                         elif req_format == '-1':
2903                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2904                         else:
2905                                 # Specific format
2906                                 if req_format not in url_map:
2907                                         self._downloader.trouble(u'ERROR: requested format not available')
2908                                         return
2909                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2910
2911                 for format_param, video_real_url in video_url_list:
2912
2913                         # At this point we have a new video
2914                         self._downloader.increment_downloads()
2915
2916                         # Extension
2917                         video_extension = self._video_extensions.get(format_param, 'mp4')
2918
2919                         try:
2920                                 # Process video information
2921                                 self._downloader.process_info({
2922                                         'id':           video_id.decode('utf-8'),
2923                                         'url':          video_real_url.decode('utf-8'),
2924                                         'uploader':     video_uploader.decode('utf-8'),
2925                                         'upload_date':  upload_date,
2926                                         'title':        video_title,
2927                                         'stitle':       simple_title,
2928                                         'ext':          video_extension.decode('utf-8'),
2929                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2930                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2931                                         'description':  video_description.decode('utf-8'),
2932                                         'player_url':   None,
2933                                 })
2934                         except UnavailableVideoError, err:
2935                                 self._downloader.trouble(u'\nERROR: unable to download video')
2936
2937 class BlipTVIE(InfoExtractor):
2938         """Information extractor for blip.tv"""
2939
2940         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2941         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2942         IE_NAME = u'blip.tv'
2943
2944         def report_extraction(self, file_id):
2945                 """Report information extraction."""
2946                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2947
2948         def report_direct_download(self, title):
2949                 """Report information extraction."""
2950                 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2951
	def _real_extract(self, url):
		"""Extract video info from a blip.tv URL and hand it to the downloader.

		Two paths: if the server answers the JSON request with a video/*
		Content-Type, the URL is itself the media file (direct download);
		otherwise the response body is JSON describing the video.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Append the JSON-API query string, using '&' if the URL already
		# carries a query and '?' otherwise.
		if '?' in url:
			cchar = '&'
		else:
			cchar = '?'
		json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
		request = urllib2.Request(json_url)
		self.report_extraction(mobj.group(1))
		# info stays None unless the direct-download branch fills it in.
		info = None
		try:
			urlh = urllib2.urlopen(request)
			if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
				# Derive id/title/extension from the last path component.
				basename = url.split('/')[-1]
				title,ext = os.path.splitext(basename)
				title = title.decode('UTF-8')
				ext = ext.replace('.', '')
				self.report_direct_download(title)
				info = {
					'id': title,
					'url': url,
					'title': title,
					'stitle': _simplify_title(title),
					'ext': ext,
					# Pass the open handle so the downloader can reuse
					# this connection for the media body.
					'urlhandle': urlh
				}
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
			return
		if info is None: # Regular URL
			# urlh is guaranteed bound here: any urlopen failure above
			# already returned.
			try:
				json_code = urlh.read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
				return

			try:
				json_data = json.loads(json_code)
				# The payload is either wrapped in a 'Post' object or bare.
				if 'Post' in json_data:
					data = json_data['Post']
				else:
					data = json_data

				# Normalize blip.tv's 'mm-dd-yy HH:MMam/pm' stamp to YYYYMMDD.
				upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
				video_url = data['media']['url']
				# File extension comes from the media URL itself.
				umobj = re.match(self._URL_EXT, video_url)
				if umobj is None:
					raise ValueError('Can not determine filename extension')
				ext = umobj.group(1)

				info = {
					'id': data['item_id'],
					'url': video_url,
					'uploader': data['display_name'],
					'upload_date': upload_date,
					'title': data['title'],
					'stitle': _simplify_title(data['title']),
					'ext': ext,
					'format': data['media']['mimeType'],
					'thumbnail': data['thumbnailUrl'],
					'description': data['description'],
					'player_url': data['embedUrl']
				}
			except (ValueError,KeyError), err:
				# Covers both malformed JSON fields and the ValueError
				# raised above for an unrecognized extension.
				self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
				return

		self._downloader.increment_downloads()

		try:
			self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download video')
3029
3030
3031 class MyVideoIE(InfoExtractor):
3032         """Information Extractor for myvideo.de."""
3033
3034         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3035         IE_NAME = u'myvideo'
3036
3037         def __init__(self, downloader=None):
3038                 InfoExtractor.__init__(self, downloader)
3039         
3040         def report_download_webpage(self, video_id):
3041                 """Report webpage download."""
3042                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3043
3044         def report_extraction(self, video_id):
3045                 """Report information extraction."""
3046                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3047
3048         def _real_extract(self,url):
3049                 mobj = re.match(self._VALID_URL, url)
3050                 if mobj is None:
3051                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
3052                         return
3053
3054                 video_id = mobj.group(1)
3055
3056                 # Get video webpage
3057                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3058                 try:
3059                         self.report_download_webpage(video_id)
3060                         webpage = urllib2.urlopen(request).read()
3061                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3062                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3063                         return
3064
3065                 self.report_extraction(video_id)
3066                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3067                                  webpage)
3068                 if mobj is None:
3069                         self._downloader.trouble(u'ERROR: unable to extract media URL')
3070                         return
3071                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3072
3073                 mobj = re.search('<title>([^<]+)</title>', webpage)
3074                 if mobj is None:
3075                         self._downloader.trouble(u'ERROR: unable to extract title')
3076                         return
3077
3078                 video_title = mobj.group(1)
3079                 video_title = sanitize_title(video_title)
3080
3081                 simple_title = _simplify_title(video_title)
3082
3083                 try:
3084                         self._downloader.process_info({
3085                                 'id':           video_id,
3086                                 'url':          video_url,
3087                                 'uploader':     u'NA',
3088                                 'upload_date':  u'NA',
3089                                 'title':        video_title,
3090                                 'stitle':       simple_title,
3091                                 'ext':          u'flv',
3092                                 'format':       u'NA',
3093                                 'player_url':   None,
3094                         })
3095                 except UnavailableVideoError:
3096                         self._downloader.trouble(u'\nERROR: Unable to download video')
3097
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Accepts either a ':shortname' alias (e.g. ':tds') or a full-episodes URL.
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	def report_extraction(self, episode_id):
		"""Report that information extraction has started."""
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

	def report_config_download(self, episode_id):
		"""Report that the per-media configuration XML is being fetched."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		"""Report that the episode's MRSS index is being fetched."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		"""Report that the Flash player URL is being resolved."""
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _real_extract(self, url):
		"""Resolve an episode URL (or shortname alias) and download every
		media item listed in its MRSS index."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Translate a ':tds'/':colbert' style alias into the show's
		# full-episodes page, then re-match so the named groups are set.
		if mobj.group('shortname'):
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = u'http://www.thedailyshow.com/full-episodes/'
			else:
				url = u'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# No episode part means "download the newest" - the site redirects
		# the bare full-episodes page to the latest episode.
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			return
		if dlNewest:
			# Follow the redirect to find out which episode we landed on.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		# Each match is (full player URL, mgid-style uri) for a Flash embed.
		mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
			return

		# Resolve the raw player URL through its redirects; the final URL
		# is needed by rtmpdump as the swfUrl.
		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
			return

		# The MRSS index lists the individual media acts of the episode.
		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
			return

		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			# guid looks like 'mgid:...:<show>.com:<shortMediaId>'.
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			# Per-item config XML lists the available renditions.
			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
						urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
				return

			cdoc = xml.etree.ElementTree.fromstring(configXml)
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			# For now, just pick the highest bitrate
			# NOTE(review): renditions are assumed to be listed in
			# ascending bitrate order - not verified here.
			format,video_url = turls[-1]

			self._downloader.increment_downloads()

			effTitle = showId + u'-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'stitle': _simplify_title(effTitle),
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': playerUrl
			}

			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
				continue
3233
3234 class EscapistIE(InfoExtractor):
3235         """Information extractor for The Escapist """
3236
3237         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3238         IE_NAME = u'escapist'
3239
3240         def report_extraction(self, showName):
3241                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3242
3243         def report_config_download(self, showName):
3244                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3245
3246         def _real_extract(self, url):
3247                 htmlParser = HTMLParser.HTMLParser()
3248
3249                 mobj = re.match(self._VALID_URL, url)
3250                 if mobj is None:
3251                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3252                         return
3253                 showName = mobj.group('showname')
3254                 videoId = mobj.group('episode')
3255
3256                 self.report_extraction(showName)
3257                 try:
3258                         webPage = urllib2.urlopen(url).read()
3259                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3260                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3261                         return
3262
3263                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3264                 description = htmlParser.unescape(descMatch.group(1))
3265                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3266                 imgUrl = htmlParser.unescape(imgMatch.group(1))
3267                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3268                 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3269                 configUrlMatch = re.search('config=(.*)$', playerUrl)
3270                 configUrl = urllib2.unquote(configUrlMatch.group(1))
3271
3272                 self.report_config_download(showName)
3273                 try:
3274                         configJSON = urllib2.urlopen(configUrl).read()
3275                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3276                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3277                         return
3278
3279                 # Technically, it's JavaScript, not JSON
3280                 configJSON = configJSON.replace("'", '"')
3281
3282                 try:
3283                         config = json.loads(configJSON)
3284                 except (ValueError,), err:
3285                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3286                         return
3287
3288                 playlist = config['playlist']
3289                 videoUrl = playlist[1]['url']
3290
3291                 self._downloader.increment_downloads()
3292                 info = {
3293                         'id': videoId,
3294                         'url': videoUrl,
3295                         'uploader': showName,
3296                         'upload_date': None,
3297                         'title': showName,
3298                         'stitle': _simplify_title(showName),
3299                         'ext': 'flv',
3300                         'format': 'flv',
3301                         'thumbnail': imgUrl,
3302                         'description': description,
3303                         'player_url': playerUrl,
3304                 }
3305
3306                 try:
3307                         self._downloader.process_info(info)
3308                 except UnavailableVideoError, err:
3309                         self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3310
3311
3312 class CollegeHumorIE(InfoExtractor):
3313         """Information extractor for collegehumor.com"""
3314
3315         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3316         IE_NAME = u'collegehumor'
3317
3318         def report_webpage(self, video_id):
3319                 """Report information extraction."""
3320                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3321
3322         def report_extraction(self, video_id):
3323                 """Report information extraction."""
3324                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3325
3326         def _real_extract(self, url):
3327                 htmlParser = HTMLParser.HTMLParser()
3328
3329                 mobj = re.match(self._VALID_URL, url)
3330                 if mobj is None:
3331                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3332                         return
3333                 video_id = mobj.group('videoid')
3334
3335                 self.report_webpage(video_id)
3336                 request = urllib2.Request(url)
3337                 try:
3338                         webpage = urllib2.urlopen(request).read()
3339                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3340                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3341                         return
3342
3343                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3344                 if m is None:
3345                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3346                         return
3347                 internal_video_id = m.group('internalvideoid')
3348
3349                 info = {
3350                         'id': video_id,
3351                         'internal_id': internal_video_id,
3352                 }
3353
3354                 self.report_extraction(video_id)
3355                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3356                 try:
3357                         metaXml = urllib2.urlopen(xmlUrl).read()
3358                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3359                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3360                         return
3361
3362                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3363                 try:
3364                         videoNode = mdoc.findall('./video')[0]
3365                         info['description'] = videoNode.findall('./description')[0].text
3366                         info['title'] = videoNode.findall('./caption')[0].text
3367                         info['stitle'] = _simplify_title(info['title'])
3368                         info['url'] = videoNode.findall('./file')[0].text
3369                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3370                         info['ext'] = info['url'].rpartition('.')[2]
3371                         info['format'] = info['ext']
3372                 except IndexError:
3373                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3374                         return
3375
3376                 self._downloader.increment_downloads()
3377
3378                 try:
3379                         self._downloader.process_info(info)
3380                 except UnavailableVideoError, err:
3381                         self._downloader.trouble(u'\nERROR: unable to download video')
3382
3383
3384 class XVideosIE(InfoExtractor):
3385         """Information extractor for xvideos.com"""
3386
3387         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3388         IE_NAME = u'xvideos'
3389
3390         def report_webpage(self, video_id):
3391                 """Report information extraction."""
3392                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3393
3394         def report_extraction(self, video_id):
3395                 """Report information extraction."""
3396                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3397
3398         def _real_extract(self, url):
3399                 htmlParser = HTMLParser.HTMLParser()
3400
3401                 mobj = re.match(self._VALID_URL, url)
3402                 if mobj is None:
3403                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3404                         return
3405                 video_id = mobj.group(1).decode('utf-8')
3406
3407                 self.report_webpage(video_id)
3408
3409                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3410                 try:
3411                         webpage = urllib2.urlopen(request).read()
3412                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3413                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3414                         return
3415
3416                 self.report_extraction(video_id)
3417
3418
3419                 # Extract video URL
3420                 mobj = re.search(r'flv_url=(.+?)&', webpage)
3421                 if mobj is None:
3422                         self._downloader.trouble(u'ERROR: unable to extract video url')
3423                         return
3424                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3425
3426
3427                 # Extract title
3428                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3429                 if mobj is None:
3430                         self._downloader.trouble(u'ERROR: unable to extract video title')
3431                         return
3432                 video_title = mobj.group(1).decode('utf-8')
3433
3434
3435                 # Extract video thumbnail
3436                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3437                 if mobj is None:
3438                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3439                         return
3440                 video_thumbnail = mobj.group(1).decode('utf-8')
3441
3442
3443
3444                 self._downloader.increment_downloads()
3445                 info = {
3446                         'id': video_id,
3447                         'url': video_url,
3448                         'uploader': None,
3449                         'upload_date': None,
3450                         'title': video_title,
3451                         'stitle': _simplify_title(video_title),
3452                         'ext': 'flv',
3453                         'format': 'flv',
3454                         'thumbnail': video_thumbnail,
3455                         'description': None,
3456                         'player_url': None,
3457                 }
3458
3459                 try:
3460                         self._downloader.process_info(info)
3461                 except UnavailableVideoError, err:
3462                         self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3463
3464
3465 class SoundcloudIE(InfoExtractor):
3466         """Information extractor for soundcloud.com
3467            To access the media, the uid of the song and a stream token
3468            must be extracted from the page source and the script must make
3469            a request to media.soundcloud.com/crossdomain.xml. Then
3470            the media can be grabbed by requesting from an url composed
3471            of the stream token and uid
3472          """
3473
3474         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3475         IE_NAME = u'soundcloud'
3476
3477         def __init__(self, downloader=None):
3478                 InfoExtractor.__init__(self, downloader)
3479
3480         def report_webpage(self, video_id):
3481                 """Report information extraction."""
3482                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3483
3484         def report_extraction(self, video_id):
3485                 """Report information extraction."""
3486                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3487
3488         def _real_extract(self, url):
3489                 htmlParser = HTMLParser.HTMLParser()
3490
3491                 mobj = re.match(self._VALID_URL, url)
3492                 if mobj is None:
3493                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3494                         return
3495
3496                 # extract uploader (which is in the url)
3497                 uploader = mobj.group(1).decode('utf-8')
3498                 # extract simple title (uploader + slug of song title)
3499                 slug_title =  mobj.group(2).decode('utf-8')
3500                 simple_title = uploader + '-' + slug_title
3501
3502                 self.report_webpage('%s/%s' % (uploader, slug_title))
3503
3504                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3505                 try:
3506                         webpage = urllib2.urlopen(request).read()
3507                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3508                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3509                         return
3510
3511                 self.report_extraction('%s/%s' % (uploader, slug_title))
3512
3513                 # extract uid and stream token that soundcloud hands out for access
3514                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3515                 if mobj:
3516                         video_id = mobj.group(1)
3517                         stream_token = mobj.group(2)
3518
3519                 # extract unsimplified title
3520                 mobj = re.search('"title":"(.*?)",', webpage)
3521                 if mobj:
3522                         title = mobj.group(1)
3523
3524                 # construct media url (with uid/token)
3525                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3526                 mediaURL = mediaURL % (video_id, stream_token)
3527
3528                 # description
3529                 description = u'No description available'
3530                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3531                 if mobj:
3532                         description = mobj.group(1)
3533                 
3534                 # upload date
3535                 upload_date = None
3536                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3537                 if mobj:
3538                         try:
3539                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3540                         except Exception, e:
3541                                 print str(e)
3542
3543                 # for soundcloud, a request to a cross domain is required for cookies
3544                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3545
3546                 try:
3547                         self._downloader.process_info({
3548                                 'id':           video_id.decode('utf-8'),
3549                                 'url':          mediaURL,
3550                                 'uploader':     uploader.decode('utf-8'),
3551                                 'upload_date':  upload_date,
3552                                 'title':        simple_title.decode('utf-8'),
3553                                 'stitle':       simple_title.decode('utf-8'),
3554                                 'ext':          u'mp3',
3555                                 'format':       u'NA',
3556                                 'player_url':   None,
3557                                 'description': description.decode('utf-8')
3558                         })
3559                 except UnavailableVideoError:
3560                         self._downloader.trouble(u'\nERROR: unable to download video')
3561
3562
3563 class InfoQIE(InfoExtractor):
3564         """Information extractor for infoq.com"""
3565
3566         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3567         IE_NAME = u'infoq'
3568
3569         def report_webpage(self, video_id):
3570                 """Report information extraction."""
3571                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3572
3573         def report_extraction(self, video_id):
3574                 """Report information extraction."""
3575                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3576
3577         def _real_extract(self, url):
3578                 htmlParser = HTMLParser.HTMLParser()
3579
3580                 mobj = re.match(self._VALID_URL, url)
3581                 if mobj is None:
3582                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3583                         return
3584
3585                 self.report_webpage(url)
3586
3587                 request = urllib2.Request(url)
3588                 try:
3589                         webpage = urllib2.urlopen(request).read()
3590                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3591                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3592                         return
3593
3594                 self.report_extraction(url)
3595
3596
3597                 # Extract video URL
3598                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3599                 if mobj is None:
3600                         self._downloader.trouble(u'ERROR: unable to extract video url')
3601                         return
3602                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3603
3604
3605                 # Extract title
3606                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3607                 if mobj is None:
3608                         self._downloader.trouble(u'ERROR: unable to extract video title')
3609                         return
3610                 video_title = mobj.group(1).decode('utf-8')
3611
3612                 # Extract description
3613                 video_description = u'No description available.'
3614                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3615                 if mobj is not None:
3616                         video_description = mobj.group(1).decode('utf-8')
3617
3618                 video_filename = video_url.split('/')[-1]
3619                 video_id, extension = video_filename.split('.')
3620
3621                 self._downloader.increment_downloads()
3622                 info = {
3623                         'id': video_id,
3624                         'url': video_url,
3625                         'uploader': None,
3626                         'upload_date': None,
3627                         'title': video_title,
3628                         'stitle': _simplify_title(video_title),
3629                         'ext': extension,
3630                         'format': extension, # Extension is always(?) mp4, but seems to be flv
3631                         'thumbnail': None,
3632                         'description': video_description,
3633                         'player_url': None,
3634                 }
3635
3636                 try:
3637                         self._downloader.process_info(info)
3638                 except UnavailableVideoError, err:
3639                         self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3640
3641 class MixcloudIE(InfoExtractor):
3642         """Information extractor for www.mixcloud.com"""
3643         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3644         IE_NAME = u'mixcloud'
3645
3646         def __init__(self, downloader=None):
3647                 InfoExtractor.__init__(self, downloader)
3648
3649         def report_download_json(self, file_id):
3650                 """Report JSON download."""
3651                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3652
3653         def report_extraction(self, file_id):
3654                 """Report information extraction."""
3655                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3656
3657         def get_urls(self, jsonData, fmt, bitrate='best'):
3658                 """Get urls from 'audio_formats' section in json"""
3659                 file_url = None
3660                 try:
3661                         bitrate_list = jsonData[fmt]
3662                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3663                                 bitrate = max(bitrate_list) # select highest
3664
3665                         url_list = jsonData[fmt][bitrate]
3666                 except TypeError: # we have no bitrate info.
3667                         url_list = jsonData[fmt]
3668                                 
3669                 return url_list
3670
3671         def check_urls(self, url_list):
3672                 """Returns 1st active url from list"""
3673                 for url in url_list:
3674                         try:
3675                                 urllib2.urlopen(url)
3676                                 return url
3677                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3678                                 url = None
3679
3680                 return None
3681
3682         def _print_formats(self, formats):
3683                 print 'Available formats:'
3684                 for fmt in formats.keys():
3685                         for b in formats[fmt]:
3686                                 try:
3687                                         ext = formats[fmt][b][0]
3688                                         print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3689                                 except TypeError: # we have no bitrate info
3690                                         ext = formats[fmt][0]
3691                                         print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3692                                         break
3693
3694         def _real_extract(self, url):
3695                 mobj = re.match(self._VALID_URL, url)
3696                 if mobj is None:
3697                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3698                         return
3699                 # extract uploader & filename from url
3700                 uploader = mobj.group(1).decode('utf-8')
3701                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3702
3703                 # construct API request
3704                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3705                 # retrieve .json file with links to files
3706                 request = urllib2.Request(file_url)
3707                 try:
3708                         self.report_download_json(file_url)
3709                         jsonData = urllib2.urlopen(request).read()
3710                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3711                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3712                         return
3713
3714                 # parse JSON
3715                 json_data = json.loads(jsonData)
3716                 player_url = json_data['player_swf_url']
3717                 formats = dict(json_data['audio_formats'])
3718
3719                 req_format = self._downloader.params.get('format', None)
3720                 bitrate = None
3721
3722                 if self._downloader.params.get('listformats', None):
3723                         self._print_formats(formats)
3724                         return
3725
3726                 if req_format is None or req_format == 'best':
3727                         for format_param in formats.keys():
3728                                 url_list = self.get_urls(formats, format_param)
3729                                 # check urls
3730                                 file_url = self.check_urls(url_list)
3731                                 if file_url is not None:
3732                                         break # got it!
3733                 else:
3734                         if req_format not in formats.keys():
3735                                 self._downloader.trouble(u'ERROR: format is not available')
3736                                 return
3737
3738                         url_list = self.get_urls(formats, req_format)
3739                         file_url = self.check_urls(url_list)
3740                         format_param = req_format
3741
3742                 # We have audio
3743                 self._downloader.increment_downloads()
3744                 try:
3745                         # Process file information
3746                         self._downloader.process_info({
3747                                 'id':           file_id.decode('utf-8'),
3748                                 'url':          file_url.decode('utf-8'),
3749                                 'uploader':     uploader.decode('utf-8'),
3750                                 'upload_date':  u'NA',
3751                                 'title':        json_data['name'],
3752                                 'stitle':       _simplify_title(json_data['name']),
3753                                 'ext':          file_url.split('.')[-1].decode('utf-8'),
3754                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
3755                                 'thumbnail':    json_data['thumbnail_url'],
3756                                 'description':  json_data['description'],
3757                                 'player_url':   player_url.decode('utf-8'),
3758                         })
3759                 except UnavailableVideoError, err:
3760                         self._downloader.trouble(u'ERROR: unable to download file')
3761
3762 class StanfordOpenClassroomIE(InfoExtractor):
3763         """Information extractor for Stanford's Open ClassRoom"""
3764
3765         _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3766         IE_NAME = u'stanfordoc'
3767
3768         def report_download_webpage(self, objid):
3769                 """Report information extraction."""
3770                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3771
3772         def report_extraction(self, video_id):
3773                 """Report information extraction."""
3774                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3775
3776         def _real_extract(self, url):
3777                 mobj = re.match(self._VALID_URL, url)
3778                 if mobj is None:
3779                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3780                         return
3781
3782                 if mobj.group('course') and mobj.group('video'): # A specific video
3783                         course = mobj.group('course')
3784                         video = mobj.group('video')
3785                         info = {
3786                                 'id': _simplify_title(course + '_' + video),
3787                         }
3788         
3789                         self.report_extraction(info['id'])
3790                         baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3791                         xmlUrl = baseUrl + video + '.xml'
3792                         try:
3793                                 metaXml = urllib2.urlopen(xmlUrl).read()
3794                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3795                                 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
3796                                 return
3797                         mdoc = xml.etree.ElementTree.fromstring(metaXml)
3798                         try:
3799                                 info['title'] = mdoc.findall('./title')[0].text
3800                                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3801                         except IndexError:
3802                                 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3803                                 return
3804                         info['stitle'] = _simplify_title(info['title'])
3805                         info['ext'] = info['url'].rpartition('.')[2]
3806                         info['format'] = info['ext']
3807                         self._downloader.increment_downloads()
3808                         try:
3809                                 self._downloader.process_info(info)
3810                         except UnavailableVideoError, err:
3811                                 self._downloader.trouble(u'\nERROR: unable to download video')
3812                 elif mobj.group('course'): # A course page
3813                         unescapeHTML = HTMLParser.HTMLParser().unescape
3814
3815                         course = mobj.group('course')
3816                         info = {
3817                                 'id': _simplify_title(course),
3818                                 'type': 'playlist',
3819                         }
3820
3821                         self.report_download_webpage(info['id'])
3822                         try:
3823                                 coursepage = urllib2.urlopen(url).read()
3824                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3825                                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3826                                 return
3827
3828                         m = re.search('<h1>([^<]+)</h1>', coursepage)
3829                         if m:
3830                                 info['title'] = unescapeHTML(m.group(1))
3831                         else:
3832                                 info['title'] = info['id']
3833                         info['stitle'] = _simplify_title(info['title'])
3834
3835                         m = re.search('<description>([^<]+)</description>', coursepage)
3836                         if m:
3837                                 info['description'] = unescapeHTML(m.group(1))
3838
3839                         links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3840                         info['list'] = [
3841                                 {
3842                                         'type': 'reference',
3843                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3844                                 }
3845                                         for vpage in links]
3846
3847                         for entry in info['list']:
3848                                 assert entry['type'] == 'reference'
3849                                 self.extract(entry['url'])
3850                 else: # Root page
3851                         unescapeHTML = HTMLParser.HTMLParser().unescape
3852
3853                         info = {
3854                                 'id': 'Stanford OpenClassroom',
3855                                 'type': 'playlist',
3856                         }
3857
3858                         self.report_download_webpage(info['id'])
3859                         rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3860                         try:
3861                                 rootpage = urllib2.urlopen(rootURL).read()
3862                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3863                                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3864                                 return
3865
3866                         info['title'] = info['id']
3867                         info['stitle'] = _simplify_title(info['title'])
3868
3869                         links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3870                         info['list'] = [
3871                                 {
3872                                         'type': 'reference',
3873                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3874                                 }
3875                                         for cpage in links]
3876
3877                         for entry in info['list']:
3878                                 assert entry['type'] == 'reference'
3879                                 self.extract(entry['url'])
3880
3881
class PostProcessor(object):
	"""Base class for all post processors.

	A PostProcessor is attached to a downloader via the downloader's
	add_post_processor() method, following the same "mutual
	registration" pattern as InfoExtractor objects. After every
	successful download, the downloader walks its chain of post
	processors, calling run() on each one: the first receives the
	initial information dictionary, and each subsequent one receives
	whatever the previous run() returned.

	Returning None from run() stops the chain; otherwise processing
	continues until the end of the chain is reached.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Process one finished download.

		The information argument is an InfoExtractor-style dictionary
		with one extra key, "filepath", naming the downloaded file.

		Return None to stop the postprocessing chain, or an (optionally
		modified) information dictionary to hand to the next processor.
		Implementations may also raise PostProcessingError, which the
		calling downloader takes into account.
		"""
		return information # default implementation: pass through untouched
3927
3928
class FFmpegExtractAudioPP(PostProcessor):
	"""Post processor that extracts the audio track of a downloaded video.

	Uses ffprobe to detect the source audio codec, then ffmpeg to either
	copy the stream losslessly (same codec, or aac into an m4a container)
	or transcode it to the preferred codec ('mp3', 'aac', 'm4a',
	'vorbis' or 'wav').
	"""

	def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec
		self._preferredquality = preferredquality
		self._keepvideo = keepvideo

	@staticmethod
	def get_audio_codec(path):
		"""Return the audio codec name reported by ffprobe, or None on failure."""
		try:
			cmd = ['ffprobe', '-show_streams', '--', path]
			dev_null = open(os.path.devnull, 'w')
			try:
				handle = subprocess.Popen(cmd, stderr=dev_null, stdout=subprocess.PIPE)
				output = handle.communicate()[0]
				if handle.wait() != 0:
					return None
			finally:
				# close the devnull handle instead of leaking it
				dev_null.close()
		except (IOError, OSError):
			return None
		# codec_name lines precede the codec_type line of the same stream
		audio_codec = None
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Run ffmpeg to write out_path; codec None lets ffmpeg pick (e.g. wav).

		Returns True on success, False otherwise.
		"""
		if codec is None:
			acodec_opts = []
		else:
			acodec_opts = ['-acodec', codec]
		try:
			cmd = ['ffmpeg', '-y', '-i', path, '-vn'] + acodec_opts + more_opts + ['--', out_path]
			dev_null = open(os.path.devnull, 'w')
			try:
				ret = subprocess.call(cmd, stdout=dev_null, stderr=subprocess.STDOUT)
			finally:
				# close the devnull handle instead of leaking it
				dev_null.close()
			return (ret == 0)
		except (IOError, OSError):
			return False

	def run(self, information):
		"""Extract audio from information['filepath'].

		Returns the information dict with 'filepath' pointing at the new
		audio file, or None if ffprobe/ffmpeg failed.
		"""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
			if self._preferredcodec == 'm4a' and filecodec == 'aac':
				# Lossless, but in another container
				acodec = 'copy'
				extension = self._preferredcodec
				more_opts = ['-absf', 'aac_adtstoasc']
			elif filecodec in ['aac', 'mp3', 'vorbis']:
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					more_opts = ['-f', 'adts']
				if filecodec == 'vorbis':
					extension = 'ogg'
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = []
				if self._preferredquality is not None:
					more_opts += ['-ab', self._preferredquality]
		else:
			# We convert the audio (lossy); wav uses ffmpeg's default pcm codec
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = []
			if self._preferredquality is not None:
				more_opts += ['-ab', self._preferredquality]
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']
			if self._preferredcodec == 'm4a':
				more_opts += ['-absf', 'aac_adtstoasc']
			if self._preferredcodec == 'vorbis':
				extension = 'ogg'
			if self._preferredcodec == 'wav':
				more_opts += ['-f', 'wav']

		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		status = self.run_ffmpeg(path, new_path, acodec, more_opts)

		if not status:
			self._downloader.to_stderr(u'WARNING: error running ffmpeg')
			return None

		# Try to update the date time for extracted audio file.
		if information.get('filetime') is not None:
			try:
				os.utime(new_path, (time.time(), information['filetime']))
			except Exception:
				# best effort only; a bare except here would also swallow
				# KeyboardInterrupt
				self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

		if not self._keepvideo:
			try:
				os.remove(path)
			except (IOError, OSError):
				self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
				return None

		information['filepath'] = new_path
		return information
4035
4036
def updateSelf(downloader, filename):
	''' Update the program file with the latest version from the repository '''
	# Note: downloader only used for options
	if not os.access(filename, os.W_OK):
		sys.exit('ERROR: no write permissions on %s' % filename)

	downloader.to_screen('Updating to latest version...')

	# pre-initialize so the finally clause cannot hit an unbound name
	# (and mask the real error) when urlopen itself raises
	urlh = None
	try:
		try:
			urlh = urllib.urlopen(UPDATE_URL)
			newcontent = urlh.read()

			vmatch = re.search("__version__ = '([^']+)'", newcontent)
			if vmatch is not None and vmatch.group(1) == __version__:
				downloader.to_screen('youtube-dl is up-to-date (' + __version__ + ')')
				return
		finally:
			if urlh is not None:
				urlh.close()
	except (IOError, OSError):
		sys.exit('ERROR: unable to download latest version')

	try:
		outf = open(filename, 'wb')
		try:
			outf.write(newcontent)
		finally:
			outf.close()
	except (IOError, OSError):
		sys.exit('ERROR: unable to overwrite current version')

	downloader.to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.')
4069
4070 def parseOpts():
4071         # Deferred imports
4072         import getpass
4073         import optparse
4074         import shlex
4075
4076         def _readOptions(filename):
4077                 try:
4078                         optionf = open(filename)
4079                 except IOError:
4080                         return [] # silently skip if file is not present
4081                 try:
4082                         res = []
4083                         for l in optionf:
4084                                 res += shlex.split(l, comments=True)
4085                 finally:
4086                         optionf.close()
4087                 return res
4088
4089         def _format_option_string(option):
4090                 ''' ('-o', '--option') -> -o, --format METAVAR'''
4091
4092                 opts = []
4093
4094                 if option._short_opts: opts.append(option._short_opts[0])
4095                 if option._long_opts: opts.append(option._long_opts[0])
4096                 if len(opts) > 1: opts.insert(1, ', ')
4097
4098                 if option.takes_value(): opts.append(' %s' % option.metavar)
4099
4100                 return "".join(opts)
4101
4102         def _find_term_columns():
4103                 columns = os.environ.get('COLUMNS', None)
4104                 if columns:
4105                         return int(columns)
4106
4107                 try:
4108                         sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4109                         out,err = sp.communicate()
4110                         return int(out.split()[1])
4111                 except:
4112                         pass
4113                 return None
4114
4115         max_width = 80
4116         max_help_position = 80
4117
4118         # No need to wrap help messages if we're on a wide console
4119         columns = _find_term_columns()
4120         if columns: max_width = columns
4121
4122         fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
4123         fmt.format_option_strings = _format_option_string
4124
4125         kw = {
4126                 'version'   : __version__,
4127                 'formatter' : fmt,
4128                 'usage' : '%prog [options] url [url...]',
4129                 'conflict_handler' : 'resolve',
4130         }
4131
4132         parser = optparse.OptionParser(**kw)
4133
4134         # option groups
4135         general        = optparse.OptionGroup(parser, 'General Options')
4136         selection      = optparse.OptionGroup(parser, 'Video Selection')
4137         authentication = optparse.OptionGroup(parser, 'Authentication Options')
4138         video_format   = optparse.OptionGroup(parser, 'Video Format Options')
4139         postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
4140         filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
4141         verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
4142
4143         general.add_option('-h', '--help',
4144                         action='help', help='print this help text and exit')
4145         general.add_option('-v', '--version',
4146                         action='version', help='print program version and exit')
4147         general.add_option('-U', '--update',
4148                         action='store_true', dest='update_self', help='update this program to latest version')
4149         general.add_option('-i', '--ignore-errors',
4150                         action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
4151         general.add_option('-r', '--rate-limit',
4152                         dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
4153         general.add_option('-R', '--retries',
4154                         dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
4155         general.add_option('--dump-user-agent',
4156                         action='store_true', dest='dump_user_agent',
4157                         help='display the current browser identification', default=False)
4158         general.add_option('--list-extractors',
4159                         action='store_true', dest='list_extractors',
4160                         help='List all supported extractors and the URLs they would handle', default=False)
4161
4162         selection.add_option('--playlist-start',
4163                         dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
4164         selection.add_option('--playlist-end',
4165                         dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
4166         selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
4167         selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
4168         selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)
4169
4170         authentication.add_option('-u', '--username',
4171                         dest='username', metavar='USERNAME', help='account username')
4172         authentication.add_option('-p', '--password',
4173                         dest='password', metavar='PASSWORD', help='account password')
4174         authentication.add_option('-n', '--netrc',
4175                         action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
4176
4177
4178         video_format.add_option('-f', '--format',
4179                         action='store', dest='format', metavar='FORMAT', help='video format code')
4180         video_format.add_option('--all-formats',
4181                         action='store_const', dest='format', help='download all available video formats', const='all')
4182         video_format.add_option('--prefer-free-formats',
4183                         action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
4184         video_format.add_option('--max-quality',
4185                         action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
4186         video_format.add_option('-F', '--list-formats',
4187                         action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
4188
4189
4190         verbosity.add_option('-q', '--quiet',
4191                         action='store_true', dest='quiet', help='activates quiet mode', default=False)
4192         verbosity.add_option('-s', '--simulate',
4193                         action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
4194         verbosity.add_option('--skip-download',
4195                         action='store_true', dest='skip_download', help='do not download the video', default=False)
4196         verbosity.add_option('-g', '--get-url',
4197                         action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
4198         verbosity.add_option('-e', '--get-title',
4199                         action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
4200         verbosity.add_option('--get-thumbnail',
4201                         action='store_true', dest='getthumbnail',
4202                         help='simulate, quiet but print thumbnail URL', default=False)
4203         verbosity.add_option('--get-description',
4204                         action='store_true', dest='getdescription',
4205                         help='simulate, quiet but print video description', default=False)
4206         verbosity.add_option('--get-filename',
4207                         action='store_true', dest='getfilename',
4208                         help='simulate, quiet but print output filename', default=False)
4209         verbosity.add_option('--get-format',
4210                         action='store_true', dest='getformat',
4211                         help='simulate, quiet but print output format', default=False)
4212         verbosity.add_option('--no-progress',
4213                         action='store_true', dest='noprogress', help='do not print progress bar', default=False)
4214         verbosity.add_option('--console-title',
4215                         action='store_true', dest='consoletitle',
4216                         help='display progress in console titlebar', default=False)
4217
4218
4219         filesystem.add_option('-t', '--title',
4220                         action='store_true', dest='usetitle', help='use title in file name', default=False)
4221         filesystem.add_option('-l', '--literal',
4222                         action='store_true', dest='useliteral', help='use literal title in file name', default=False)
4223         filesystem.add_option('-A', '--auto-number',
4224                         action='store_true', dest='autonumber',
4225                         help='number downloaded files starting from 00000', default=False)
4226         filesystem.add_option('-o', '--output',
4227                         dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
4228         filesystem.add_option('-a', '--batch-file',
4229                         dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
4230         filesystem.add_option('-w', '--no-overwrites',
4231                         action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
4232         filesystem.add_option('-c', '--continue',
4233                         action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
4234         filesystem.add_option('--no-continue',
4235                         action='store_false', dest='continue_dl',
4236                         help='do not resume partially downloaded files (restart from beginning)')
4237         filesystem.add_option('--cookies',
4238                         dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
4239         filesystem.add_option('--no-part',
4240                         action='store_true', dest='nopart', help='do not use .part files', default=False)
4241         filesystem.add_option('--no-mtime',
4242                         action='store_false', dest='updatetime',
4243                         help='do not use the Last-modified header to set the file modification time', default=True)
4244         filesystem.add_option('--write-description',
4245                         action='store_true', dest='writedescription',
4246                         help='write video description to a .description file', default=False)
4247         filesystem.add_option('--write-info-json',
4248                         action='store_true', dest='writeinfojson',
4249                         help='write video metadata to a .info.json file', default=False)
4250
4251
4252         postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
4253                         help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
4254         postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
4255                         help='"best", "aac", "vorbis", "mp3", or "m4a"; best by default')
4256         postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
4257                         help='ffmpeg audio bitrate specification, 128k by default')
4258         postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
4259                         help='keeps the video file on disk after the post-processing; the video is erased by default')
4260
4261
4262         parser.add_option_group(general)
4263         parser.add_option_group(selection)
4264         parser.add_option_group(filesystem)
4265         parser.add_option_group(verbosity)
4266         parser.add_option_group(video_format)
4267         parser.add_option_group(authentication)
4268         parser.add_option_group(postproc)
4269
4270         xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
4271         if xdg_config_home:
4272                 userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
4273         else:
4274                 userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
4275         argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
4276         opts, args = parser.parse_args(argv)
4277
4278         return parser, opts, args
4279
def gen_extractors():
	"""Return an instance of every supported extractor.

	Ordering matters: a URL is handled by the first extractor in the
	returned list whose suitable() accepts it, so the specific extractors
	come first and the catch-all GenericIE is appended last.
	"""
	# These three are shared: their playlist/user/search wrappers delegate
	# single-video extraction back to the same instance.
	youtube = YoutubeIE()
	google = GoogleIE()
	yahoo = YahooIE()

	extractors = [
		YoutubePlaylistIE(youtube),
		YoutubeUserIE(youtube),
		YoutubeSearchIE(youtube),
		youtube,
		MetacafeIE(youtube),
		DailymotionIE(),
		google,
		GoogleSearchIE(google),
		PhotobucketIE(),
		yahoo,
		YahooSearchIE(yahoo),
		DepositFilesIE(),
		FacebookIE(),
		BlipTVIE(),
		VimeoIE(),
		MyVideoIE(),
		ComedyCentralIE(),
		EscapistIE(),
		CollegeHumorIE(),
		XVideosIE(),
		SoundcloudIE(),
		InfoQIE(),
		MixcloudIE(),
		StanfordOpenClassroomIE(),
	]
	# The fallback extractor must remain the very last entry.
	extractors.append(GenericIE())
	return extractors
4315
4316 def _real_main():
4317         parser, opts, args = parseOpts()
4318
4319         # Open appropriate CookieJar
4320         if opts.cookiefile is None:
4321                 jar = cookielib.CookieJar()
4322         else:
4323                 try:
4324                         jar = cookielib.MozillaCookieJar(opts.cookiefile)
4325                         if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4326                                 jar.load()
4327                 except (IOError, OSError), err:
4328                         sys.exit(u'ERROR: unable to open cookie file')
4329
4330         # Dump user agent
4331         if opts.dump_user_agent:
4332                 print std_headers['User-Agent']
4333                 sys.exit(0)
4334
4335         # Batch file verification
4336         batchurls = []
4337         if opts.batchfile is not None:
4338                 try:
4339                         if opts.batchfile == '-':
4340                                 batchfd = sys.stdin
4341                         else:
4342                                 batchfd = open(opts.batchfile, 'r')
4343                         batchurls = batchfd.readlines()
4344                         batchurls = [x.strip() for x in batchurls]
4345                         batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4346                 except IOError:
4347                         sys.exit(u'ERROR: batch file could not be read')
4348         all_urls = batchurls + args
4349
4350         # General configuration
4351         cookie_processor = urllib2.HTTPCookieProcessor(jar)
4352         opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
4353         urllib2.install_opener(opener)
4354         socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
4355
4356         extractors = gen_extractors()
4357
4358         if opts.list_extractors:
4359                 for ie in extractors:
4360                         print(ie.IE_NAME)
4361                         matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
4362                         all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4363                         for mu in matchedUrls:
4364                                 print(u'  ' + mu)
4365                 sys.exit(0)
4366
4367         # Conflicting, missing and erroneous options
4368         if opts.usenetrc and (opts.username is not None or opts.password is not None):
4369                 parser.error(u'using .netrc conflicts with giving username/password')
4370         if opts.password is not None and opts.username is None:
4371                 parser.error(u'account username missing')
4372         if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4373                 parser.error(u'using output template conflicts with using title, literal title or auto number')
4374         if opts.usetitle and opts.useliteral:
4375                 parser.error(u'using title conflicts with using literal title')
4376         if opts.username is not None and opts.password is None:
4377                 opts.password = getpass.getpass(u'Type account password and press return:')
4378         if opts.ratelimit is not None:
4379                 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4380                 if numeric_limit is None:
4381                         parser.error(u'invalid rate limit specified')
4382                 opts.ratelimit = numeric_limit
4383         if opts.retries is not None:
4384                 try:
4385                         opts.retries = long(opts.retries)
4386                 except (TypeError, ValueError), err:
4387                         parser.error(u'invalid retry count specified')
4388         try:
4389                 opts.playliststart = int(opts.playliststart)
4390                 if opts.playliststart <= 0:
4391                         raise ValueError(u'Playlist start must be positive')
4392         except (TypeError, ValueError), err:
4393                 parser.error(u'invalid playlist start number specified')
4394         try:
4395                 opts.playlistend = int(opts.playlistend)
4396                 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
4397                         raise ValueError(u'Playlist end must be greater than playlist start')
4398         except (TypeError, ValueError), err:
4399                 parser.error(u'invalid playlist end number specified')
4400         if opts.extractaudio:
4401                 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a']:
4402                         parser.error(u'invalid audio format specified')
4403
4404         # File downloader
4405         fd = FileDownloader({
4406                 'usenetrc': opts.usenetrc,
4407                 'username': opts.username,
4408                 'password': opts.password,
4409                 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4410                 'forceurl': opts.geturl,
4411                 'forcetitle': opts.gettitle,
4412                 'forcethumbnail': opts.getthumbnail,
4413                 'forcedescription': opts.getdescription,
4414                 'forcefilename': opts.getfilename,
4415                 'forceformat': opts.getformat,
4416                 'simulate': opts.simulate,
4417                 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4418                 'format': opts.format,
4419                 'format_limit': opts.format_limit,
4420                 'listformats': opts.listformats,
4421                 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
4422                         or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4423                         or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
4424                         or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
4425                         or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4426                         or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4427                         or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
4428                         or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
4429                         or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
4430                         or u'%(id)s.%(ext)s'),
4431                 'ignoreerrors': opts.ignoreerrors,
4432                 'ratelimit': opts.ratelimit,
4433                 'nooverwrites': opts.nooverwrites,
4434                 'retries': opts.retries,
4435                 'continuedl': opts.continue_dl,
4436                 'noprogress': opts.noprogress,
4437                 'playliststart': opts.playliststart,
4438                 'playlistend': opts.playlistend,
4439                 'logtostderr': opts.outtmpl == '-',
4440                 'consoletitle': opts.consoletitle,
4441                 'nopart': opts.nopart,
4442                 'updatetime': opts.updatetime,
4443                 'writedescription': opts.writedescription,
4444                 'writeinfojson': opts.writeinfojson,
4445                 'matchtitle': opts.matchtitle,
4446                 'rejecttitle': opts.rejecttitle,
4447                 'max_downloads': opts.max_downloads,
4448                 'prefer_free_formats': opts.prefer_free_formats,
4449                 })
4450         for extractor in extractors:
4451                 fd.add_info_extractor(extractor)
4452
4453         # PostProcessors
4454         if opts.extractaudio:
4455                 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
4456
4457         # Update version
4458         if opts.update_self:
4459                 updateSelf(fd, sys.argv[0])
4460
4461         # Maybe do nothing
4462         if len(all_urls) < 1:
4463                 if not opts.update_self:
4464                         parser.error(u'you must provide at least one URL')
4465                 else:
4466                         sys.exit()
4467         
4468         try:
4469                 retcode = fd.download(all_urls)
4470         except MaxDownloadsReached:
4471                 fd.to_screen(u'--max-download limit reached, aborting.')
4472                 retcode = 101
4473
4474         # Dump cookie jar if requested
4475         if opts.cookiefile is not None:
4476                 try:
4477                         jar.save()
4478                 except (IOError, OSError), err:
4479                         sys.exit(u'ERROR: unable to save cookie jar')
4480
4481         sys.exit(retcode)
4482
4483 def main():
4484         try:
4485                 _real_main()
4486         except DownloadError:
4487                 sys.exit(1)
4488         except SameFileError:
4489                 sys.exit(u'ERROR: fixed output name but more than one file to download')
4490         except KeyboardInterrupt:
4491                 sys.exit(u'\nERROR: Interrupted by user')
4492
# Script entry point: run main() only when executed directly, not on import.
if __name__ == '__main__':
	main()
4495
4496 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: